From 033548ee6e969f7d534fe92774eecac27ba3b755 Mon Sep 17 00:00:00 2001 From: mdbarnesUCSD Date: Thu, 11 Sep 2025 15:49:42 -0700 Subject: [PATCH 1/4] Support decompose_fit in higher contexts so that they are not collapsed to 96 --- SigProfilerAssignment/decompose_subroutines.py | 5 +++-- SigProfilerAssignment/decomposition.py | 1 + SigProfilerAssignment/single_sample.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/SigProfilerAssignment/decompose_subroutines.py b/SigProfilerAssignment/decompose_subroutines.py index 7697df7..8ed00bd 100644 --- a/SigProfilerAssignment/decompose_subroutines.py +++ b/SigProfilerAssignment/decompose_subroutines.py @@ -301,6 +301,7 @@ def signature_decomposition( m_for_subgroups="SBS", volume=None, add_background_signatures=True, + collapse_to_SBS96=True, ): originalProcessAvg = originalProcessAvg.reset_index() if not os.path.exists(directory + "/Solution_Stats"): @@ -332,12 +333,12 @@ def signature_decomposition( sigDatabase = pd.read_csv(signature_database, sep="\t", index_col=0) # indx = sigDatabase.index() if ( - sigDatabase.shape[0] == 1536 + sigDatabase.shape[0] == 1536 and collapse_to_SBS96 ): # collapse the 1596 context into 96 only for the deocmposition sigDatabase = sigDatabase.groupby(sigDatabase.index.str[1:8]).sum() elif ( - sigDatabase.shape[0] == 288 + sigDatabase.shape[0] == 288 and collapse_to_SBS96 ): # collapse the 288 context into 96 only for the deocmposition # sigDatabase = pd.DataFrame(processAvg, index=index) sigDatabase = sigDatabase.groupby(sigDatabase.index.str[2:9]).sum() diff --git a/SigProfilerAssignment/decomposition.py b/SigProfilerAssignment/decomposition.py index 8a99119..22788ae 100644 --- a/SigProfilerAssignment/decomposition.py +++ b/SigProfilerAssignment/decomposition.py @@ -860,6 +860,7 @@ def spa_analyze( m_for_subgroups=m_for_subgroups, volume=volume, add_background_signatures=add_background_signatures, + collapse_to_SBS96=collapse_to_SBS96, ) # final_signatures = sub.signature_decomposition(processAvg, m, layer_directory2, genome_build=genome_build) # extract the global signatures and new signatures from the final_signatures dictionary diff --git a/SigProfilerAssignment/single_sample.py b/SigProfilerAssignment/single_sample.py index c62e34c..190ae0a 100644 --- a/SigProfilerAssignment/single_sample.py +++ b/SigProfilerAssignment/single_sample.py @@ -379,6 +379,7 @@ def add_signatures( # compute the estimated genome est_genome = np.dot(W1, np.array(newExposure)) if solver == "nnls": + # import pdb; breakpoint(); reg = nnls(W1, genome[:, 0]) weights = reg[0] est_genome = np.dot(W1, weights) From 9e51375c8a40ef21bad5526d62e3afa4f7862951 Mon Sep 17 00:00:00 2001 From: mdbarnesUCSD Date: Thu, 11 Sep 2025 16:40:22 -0700 Subject: [PATCH 2/4] Fix variable reusage error in nested for loop --- .../decompose_subroutines.py | 26 +++++++++++++++---- SigProfilerAssignment/single_sample.py | 1 - 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/SigProfilerAssignment/decompose_subroutines.py b/SigProfilerAssignment/decompose_subroutines.py index 8ed00bd..110b078 100644 --- a/SigProfilerAssignment/decompose_subroutines.py +++ b/SigProfilerAssignment/decompose_subroutines.py @@ -519,12 +519,12 @@ def signature_decomposition( decomposed_signatures = [] contribution_percentages = [] - for j in np.nonzero(exposures)[0]: - listofinformation[count * 3] = signames[j] + for sig_idx in np.nonzero(exposures)[0]: + listofinformation[count * 3] = signames[sig_idx] listofinformation[count * 3 + 1] = round(exposure_percentages[count], 2) contribution_percentages.append(round(exposure_percentages[count], 2)) listofinformation[count * 3 + 2] = "%" - decomposed_signatures.append(signames[j]) + decomposed_signatures.append(signames[sig_idx]) count += 1 ListToTumple = tuple( [mtype, letters[i]] @@ -540,7 +540,12 @@ def signature_decomposition( weights = [] basis_names = [] nonzero_exposures = exposures[np.nonzero(exposures)] - denovo_name = mutation_context + letters[i] + if signature_database is not None: + # A custom database was provided, so use the original column names from the de novo file. + denovo_name = originalProcessAvg.columns[i+1] + else: + # No custom database was provided, so use the default naming convention (e.g., SBS96A, SBS96B, etc.). + denovo_name = mutation_context + letters[i] for info in range(0, len(listofinformation), 3): # print(info) sigName = listofinformation[info] @@ -570,8 +575,19 @@ def signature_decomposition( mtype_par = "32" else: mtype_par = "none" + + can_plot_decomposition = make_decomposition_plots + # Check for unsupported, non-collapsed contexts before attempting to plot + if mtype_par in ["288", "1536"] and not collapse_to_SBS96 and can_plot_decomposition: + print(f"\nWarning: Decomposition plots for the {mtype_par} context are not currently supported in this workflow when collapse_to_SBS96 is False. " + f"Skipping plot generation for signature {j}.") + can_plot_decomposition = False # Temporarily disable plotting for this specific signature + # only create decomposition plots for COSMIC signatures - if mtype_par != "none" and make_decomposition_plots == True: + if ( + mtype_par != "none" + and can_plot_decomposition == True + ): # reformat the first column of cosmic signature dataframe cosmic_sigs_DF = sigDatabases_DF.copy(deep=True) cosmic_sigs_DF.columns = ["MutationType"] + cosmic_sigs_DF.columns[ diff --git a/SigProfilerAssignment/single_sample.py b/SigProfilerAssignment/single_sample.py index 190ae0a..c62e34c 100644 --- a/SigProfilerAssignment/single_sample.py +++ b/SigProfilerAssignment/single_sample.py @@ -379,7 +379,6 @@ def add_signatures( # compute the estimated genome est_genome = np.dot(W1, np.array(newExposure)) if solver == "nnls": - # import pdb; breakpoint(); reg = nnls(W1, genome[:, 0]) weights = reg[0] est_genome = np.dot(W1, weights) From 39e5a777e18287b55e51ce736b6e144ca7dbd8b1 Mon Sep 17 00:00:00 2001 From: Mark Barnes Date: Fri, 14 Nov 2025 16:41:56 -0800 Subject: [PATCH 3/4] Support custom signature names --- .../decompose_subroutines.py | 37 +++++++++++++------ SigProfilerAssignment/decomposition.py | 37 ++++++++++++++----- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/SigProfilerAssignment/decompose_subroutines.py b/SigProfilerAssignment/decompose_subroutines.py index 110b078..d151dc1 100644 --- a/SigProfilerAssignment/decompose_subroutines.py +++ b/SigProfilerAssignment/decompose_subroutines.py @@ -408,11 +408,23 @@ def signature_decomposition( denovo_signature_names = make_letter_ids( signatures.shape[1], mtype=mutation_context ) + # Get original signature names if available (preserve original names regardless of database type) + # This will be used throughout to preserve original names in logs and output files + if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 1: + # After reset_index(), the first column is mutation types, so signature names start at index 1 + original_signature_names = originalProcessAvg.columns[1:].tolist() + else: + original_signature_names = None # lognote.write("\n********** Starting Signature Decomposition **********\n\n") activity_percentages = [] merger = PdfWriter() for i, j in zip(range(signatures.shape[1]), denovo_signature_names): + # Determine the signature name to use (original if available, otherwise default) + if original_signature_names is not None and i < len(original_signature_names): + current_signature_name = original_signature_names[i] + else: + current_signature_name = mutation_context + letters[i] # Only for context SBS96 if signatures.shape[0] == 96: lognote = open( @@ -424,7 +436,7 @@ def signature_decomposition( ) lognote.write( "\n\n\n\n######################## Decomposing " - + j + + current_signature_name + " ########################\n" ) lognote.close() @@ -474,7 +486,7 @@ def signature_decomposition( ) lognote.write( "\n\n\n\n######################## Decomposing " - + j + + current_signature_name + " ########################\n" ) lognote.close() @@ -527,7 +539,7 @@ def signature_decomposition( decomposed_signatures.append(signames[sig_idx]) count += 1 ListToTumple = tuple( - [mtype, letters[i]] + [mtype, current_signature_name] + listofinformation + [L1dist * 100] + [L2dist * 100] @@ -540,11 +552,12 @@ def signature_decomposition( weights = [] basis_names = [] nonzero_exposures = exposures[np.nonzero(exposures)] - if signature_database is not None: - # A custom database was provided, so use the original column names from the de novo file. - denovo_name = originalProcessAvg.columns[i+1] + # Use original signature names if available (preserve original names regardless of database type) + if original_signature_names is not None and i < len(original_signature_names): + # Original signature names are available, use them + denovo_name = original_signature_names[i] else: - # No custom database was provided, so use the default naming convention (e.g., SBS96A, SBS96B, etc.). + # Fallback to default naming convention (e.g., SBS96A, SBS96B, etc.) denovo_name = mutation_context + letters[i] for info in range(0, len(listofinformation), 3): # print(info) @@ -648,25 +661,25 @@ def signature_decomposition( fh.close() dictionary.update( - {"{}".format(mutation_context + letters[i]): decomposed_signatures} + {"{}".format(current_signature_name): decomposed_signatures} ) else: - newsig.append(mutation_context + letters[i]) + newsig.append(current_signature_name) newsigmatrixidx.append(i) fh = open( directory + "/De_Novo_map_to_COSMIC_" + mutation_context + ".csv", "a" ) fh.write( "Signature {}-{}, Signature {}-{}, {}, {}, {}, {}, {}\n".format( - mtype, letters[i], mtype, letters[i], 0, 0, 0, 1, 1 + mtype, current_signature_name, mtype, current_signature_name, 0, 0, 0, 1, 1 ) ) fh.close() dictionary.update( { - "{}".format(mutation_context + letters[i]): [ - "{}".format(mutation_context + letters[i]) + "{}".format(current_signature_name): [ + "{}".format(current_signature_name) ] } ) diff --git a/SigProfilerAssignment/decomposition.py b/SigProfilerAssignment/decomposition.py index 22788ae..87f7010 100644 --- a/SigProfilerAssignment/decomposition.py +++ b/SigProfilerAssignment/decomposition.py @@ -786,10 +786,29 @@ def spa_analyze( allsigids = processAvg.columns.to_list() except: allsigids = list(listOfSignatures) + + # Preserve originalProcessAvg with original column names BEFORE converting to numpy array + # This is needed to maintain original de novo signature names in decomposition plots + if isinstance(processAvg, pd.DataFrame): + originalProcessAvg = processAvg.copy() + else: + originalProcessAvg = pd.DataFrame( + processAvg, index=index, columns=listOfSignatures + ) + processAvg = np.array(processAvg) signature_names = sub.make_letter_ids( idlenth=processAvg.shape[1], mtype=mutation_context ) + + # Use original signature names if available (preserve original names regardless of database type) + # This ensures exposureAvg column names match attribution dictionary keys + if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 0: + # Use original column names from the signatures file + exposure_signature_names = list(listOfSignatures) + else: + # Fallback to default naming convention + exposure_signature_names = signature_names exposureAvg_dummy = ( pd.DataFrame( @@ -801,16 +820,10 @@ def spa_analyze( .rename_axis("Samples") ) exposureAvg = exposureAvg_dummy - exposureAvg.columns = signature_names + exposureAvg.columns = exposure_signature_names ############################# # layer_directory2 = output+"/Decompose_Solution" - if isinstance(processAvg, pd.DataFrame): - pass - else: - originalProcessAvg = pd.DataFrame( - processAvg, index=index, columns=listOfSignatures - ) try: if not os.path.exists(layer_directory2): os.makedirs(layer_directory2) @@ -819,21 +832,27 @@ def spa_analyze( if ( processAvg.shape[0] == 1536 and collapse_to_SBS96 == True - ): # collapse the 1596 context into 96 only for the deocmposition + ): # collapse the 1536 context into 96 only for the decomposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[1:8]).sum() genomes = genomes.groupby(genomes.index.str[1:8]).sum() index = genomes.index + # Also collapse originalProcessAvg to match the collapsed context + # originalProcessAvg has mutation types in the index, so group by index directly + originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[1:8]).sum() processAvg = np.array(processAvg) if ( processAvg.shape[0] == 288 and collapse_to_SBS96 == True - ): # collapse the 288 context into 96 only for the deocmposition + ): # collapse the 288 context into 96 only for the decomposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[2:9]).sum() genomes = pd.DataFrame(genomes, index=index) genomes = genomes.groupby(genomes.index.str[2:9]).sum() index = genomes.index + # Also collapse originalProcessAvg to match the collapsed context + # originalProcessAvg has mutation types in the index, so group by index directly + originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[2:9]).sum() processAvg = np.array(processAvg) print("\n Decomposing De Novo Signatures .....") From 6ed837d5953ca15d4fd8d05c255cde279fad098a Mon Sep 17 00:00:00 2001 From: Mark Barnes Date: Fri, 14 Nov 2025 17:04:00 -0800 Subject: [PATCH 4/4] Pass plotting the correct context type --- SigProfilerAssignment/decompose_subroutines.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/SigProfilerAssignment/decompose_subroutines.py b/SigProfilerAssignment/decompose_subroutines.py index d151dc1..b5daf36 100644 --- a/SigProfilerAssignment/decompose_subroutines.py +++ b/SigProfilerAssignment/decompose_subroutines.py @@ -589,6 +589,10 @@ def signature_decomposition( else: mtype_par = "none" + # Update mtype_par to reflect the correct dimension for plotting + if collapse_to_SBS96 and signatures.shape[0] == 96 and mtype_par in ["288", "1536"]: + mtype_par = "96" + can_plot_decomposition = make_decomposition_plots # Check for unsupported, non-collapsed contexts before attempting to plot if mtype_par in ["288", "1536"] and not collapse_to_SBS96 and can_plot_decomposition: