diff --git a/SigProfilerAssignment/decompose_subroutines.py b/SigProfilerAssignment/decompose_subroutines.py index 7697df7..b5daf36 100644 --- a/SigProfilerAssignment/decompose_subroutines.py +++ b/SigProfilerAssignment/decompose_subroutines.py @@ -301,6 +301,7 @@ def signature_decomposition( m_for_subgroups="SBS", volume=None, add_background_signatures=True, + collapse_to_SBS96=True, ): originalProcessAvg = originalProcessAvg.reset_index() if not os.path.exists(directory + "/Solution_Stats"): @@ -332,12 +333,12 @@ def signature_decomposition( sigDatabase = pd.read_csv(signature_database, sep="\t", index_col=0) # indx = sigDatabase.index() if ( - sigDatabase.shape[0] == 1536 + sigDatabase.shape[0] == 1536 and collapse_to_SBS96 ): # collapse the 1596 context into 96 only for the deocmposition sigDatabase = sigDatabase.groupby(sigDatabase.index.str[1:8]).sum() elif ( - sigDatabase.shape[0] == 288 + sigDatabase.shape[0] == 288 and collapse_to_SBS96 ): # collapse the 288 context into 96 only for the deocmposition # sigDatabase = pd.DataFrame(processAvg, index=index) sigDatabase = sigDatabase.groupby(sigDatabase.index.str[2:9]).sum() @@ -407,11 +408,23 @@ def signature_decomposition( denovo_signature_names = make_letter_ids( signatures.shape[1], mtype=mutation_context ) + # Get original signature names if available (preserve original names regardless of database type) + # This will be used throughout to preserve original names in logs and output files + if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 1: + # After reset_index(), the first column is mutation types, so signature names start at index 1 + original_signature_names = originalProcessAvg.columns[1:].tolist() + else: + original_signature_names = None # lognote.write("\n********** Starting Signature Decomposition **********\n\n") activity_percentages = [] merger = PdfWriter() for i, j in zip(range(signatures.shape[1]), denovo_signature_names): + # Determine the signature name to use (original if available, otherwise default) + if original_signature_names is not None and i < len(original_signature_names): + current_signature_name = original_signature_names[i] + else: + current_signature_name = mutation_context + letters[i] # Only for context SBS96 if signatures.shape[0] == 96: lognote = open( @@ -423,7 +436,7 @@ def signature_decomposition( ) lognote.write( "\n\n\n\n######################## Decomposing " - + j + + current_signature_name + " ########################\n" ) lognote.close() @@ -473,7 +486,7 @@ def signature_decomposition( ) lognote.write( "\n\n\n\n######################## Decomposing " - + j + + current_signature_name + " ########################\n" ) lognote.close() @@ -518,15 +531,15 @@ def signature_decomposition( decomposed_signatures = [] contribution_percentages = [] - for j in np.nonzero(exposures)[0]: - listofinformation[count * 3] = signames[j] + for sig_idx in np.nonzero(exposures)[0]: + listofinformation[count * 3] = signames[sig_idx] listofinformation[count * 3 + 1] = round(exposure_percentages[count], 2) contribution_percentages.append(round(exposure_percentages[count], 2)) listofinformation[count * 3 + 2] = "%" - decomposed_signatures.append(signames[j]) + decomposed_signatures.append(signames[sig_idx]) count += 1 ListToTumple = tuple( - [mtype, letters[i]] + [mtype, current_signature_name] + listofinformation + [L1dist * 100] + [L2dist * 100] @@ -539,7 +552,13 @@ def signature_decomposition( weights = [] basis_names = [] nonzero_exposures = exposures[np.nonzero(exposures)] - denovo_name = mutation_context + letters[i] + # Use original signature names if available (preserve original names regardless of database type) + if original_signature_names is not None and i < len(original_signature_names): + # Original signature names are available, use them + denovo_name = original_signature_names[i] + else: + # Fallback to default naming convention (e.g., SBS96A, SBS96B, etc.) + denovo_name = mutation_context + letters[i] for info in range(0, len(listofinformation), 3): # print(info) sigName = listofinformation[info] @@ -569,8 +588,23 @@ def signature_decomposition( mtype_par = "32" else: mtype_par = "none" + + # Update mtype_par to reflect the correct dimension for plotting + if collapse_to_SBS96 and signatures.shape[0] == 96 and mtype_par in ["288", "1536"]: + mtype_par = "96" + + can_plot_decomposition = make_decomposition_plots + # Check for unsupported, non-collapsed contexts before attempting to plot + if mtype_par in ["288", "1536"] and not collapse_to_SBS96 and can_plot_decomposition: + print(f"\nWarning: Decomposition plots for the {mtype_par} context are not currently supported in this workflow when collapse_to_SBS96 is False. " + f"Skipping plot generation for signature {j}.") + can_plot_decomposition = False # Temporarily disable plotting for this specific signature + # only create decomposition plots for COSMIC signatures - if mtype_par != "none" and make_decomposition_plots == True: + if ( + mtype_par != "none" + and can_plot_decomposition == True + ): # reformat the first column of cosmic signature dataframe cosmic_sigs_DF = sigDatabases_DF.copy(deep=True) cosmic_sigs_DF.columns = ["MutationType"] + cosmic_sigs_DF.columns[ @@ -631,25 +665,25 @@ def signature_decomposition( fh.close() dictionary.update( - {"{}".format(mutation_context + letters[i]): decomposed_signatures} + {"{}".format(current_signature_name): decomposed_signatures} ) else: - newsig.append(mutation_context + letters[i]) + newsig.append(current_signature_name) newsigmatrixidx.append(i) fh = open( directory + "/De_Novo_map_to_COSMIC_" + mutation_context + ".csv", "a" ) fh.write( "Signature {}-{}, Signature {}-{}, {}, {}, {}, {}, {}\n".format( - mtype, letters[i], mtype, letters[i], 0, 0, 0, 1, 1 + mtype, current_signature_name, mtype, current_signature_name, 0, 0, 0, 1, 1 ) ) fh.close() dictionary.update( { - "{}".format(mutation_context + letters[i]): [ - "{}".format(mutation_context + letters[i]) + "{}".format(current_signature_name): [ + "{}".format(current_signature_name) ] } ) diff --git a/SigProfilerAssignment/decomposition.py b/SigProfilerAssignment/decomposition.py index 8a99119..87f7010 100644 --- a/SigProfilerAssignment/decomposition.py +++ b/SigProfilerAssignment/decomposition.py @@ -786,10 +786,29 @@ def spa_analyze( allsigids = processAvg.columns.to_list() except: allsigids = list(listOfSignatures) + + # Preserve originalProcessAvg with original column names BEFORE converting to numpy array + # This is needed to maintain original de novo signature names in decomposition plots + if isinstance(processAvg, pd.DataFrame): + originalProcessAvg = processAvg.copy() + else: + originalProcessAvg = pd.DataFrame( + processAvg, index=index, columns=listOfSignatures + ) + processAvg = np.array(processAvg) signature_names = sub.make_letter_ids( idlenth=processAvg.shape[1], mtype=mutation_context ) + + # Use original signature names if available (preserve original names regardless of database type) + # This ensures exposureAvg column names match attribution dictionary keys + if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 0: + # Use original column names from the signatures file + exposure_signature_names = list(listOfSignatures) + else: + # Fallback to default naming convention + exposure_signature_names = signature_names exposureAvg_dummy = ( pd.DataFrame( @@ -801,16 +820,10 @@ def spa_analyze( .rename_axis("Samples") ) exposureAvg = exposureAvg_dummy - exposureAvg.columns = signature_names + exposureAvg.columns = exposure_signature_names ############################# # layer_directory2 = output+"/Decompose_Solution" - if isinstance(processAvg, pd.DataFrame): - pass - else: - originalProcessAvg = pd.DataFrame( - processAvg, index=index, columns=listOfSignatures - ) try: if not os.path.exists(layer_directory2): os.makedirs(layer_directory2) @@ -819,21 +832,27 @@ def spa_analyze( if ( processAvg.shape[0] == 1536 and collapse_to_SBS96 == True - ): # collapse the 1596 context into 96 only for the deocmposition + ): # collapse the 1536 context into 96 only for the decomposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[1:8]).sum() genomes = genomes.groupby(genomes.index.str[1:8]).sum() index = genomes.index + # Also collapse originalProcessAvg to match the collapsed context + # originalProcessAvg has mutation types in the index, so group by index directly + originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[1:8]).sum() processAvg = np.array(processAvg) if ( processAvg.shape[0] == 288 and collapse_to_SBS96 == True - ): # collapse the 288 context into 96 only for the deocmposition + ): # collapse the 288 context into 96 only for the decomposition processAvg = pd.DataFrame(processAvg, index=index) processAvg = processAvg.groupby(processAvg.index.str[2:9]).sum() genomes = pd.DataFrame(genomes, index=index) genomes = genomes.groupby(genomes.index.str[2:9]).sum() index = genomes.index + # Also collapse originalProcessAvg to match the collapsed context + # originalProcessAvg has mutation types in the index, so group by index directly + originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[2:9]).sum() processAvg = np.array(processAvg) print("\n Decomposing De Novo Signatures .....") @@ -860,6 +879,7 @@ def spa_analyze( m_for_subgroups=m_for_subgroups, volume=volume, add_background_signatures=add_background_signatures, + collapse_to_SBS96=collapse_to_SBS96, ) # final_signatures = sub.signature_decomposition(processAvg, m, layer_directory2, genome_build=genome_build) # extract the global signatures and new signatures from the final_signatures dictionary