Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 49 additions & 15 deletions SigProfilerAssignment/decompose_subroutines.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ def signature_decomposition(
m_for_subgroups="SBS",
volume=None,
add_background_signatures=True,
collapse_to_SBS96=True,
):
originalProcessAvg = originalProcessAvg.reset_index()
if not os.path.exists(directory + "/Solution_Stats"):
Expand Down Expand Up @@ -332,12 +333,12 @@ def signature_decomposition(
sigDatabase = pd.read_csv(signature_database, sep="\t", index_col=0)
# indx = sigDatabase.index()
if (
sigDatabase.shape[0] == 1536
sigDatabase.shape[0] == 1536 and collapse_to_SBS96
): # collapse the 1596 context into 96 only for the deocmposition
sigDatabase = sigDatabase.groupby(sigDatabase.index.str[1:8]).sum()

elif (
sigDatabase.shape[0] == 288
sigDatabase.shape[0] == 288 and collapse_to_SBS96
): # collapse the 288 context into 96 only for the deocmposition
# sigDatabase = pd.DataFrame(processAvg, index=index)
sigDatabase = sigDatabase.groupby(sigDatabase.index.str[2:9]).sum()
Expand Down Expand Up @@ -407,11 +408,23 @@ def signature_decomposition(
denovo_signature_names = make_letter_ids(
signatures.shape[1], mtype=mutation_context
)
# Get original signature names if available (preserve original names regardless of database type)
# This will be used throughout to preserve original names in logs and output files
if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 1:
# After reset_index(), the first column is mutation types, so signature names start at index 1
original_signature_names = originalProcessAvg.columns[1:].tolist()
else:
original_signature_names = None
# lognote.write("\n********** Starting Signature Decomposition **********\n\n")
activity_percentages = []
merger = PdfWriter()

for i, j in zip(range(signatures.shape[1]), denovo_signature_names):
# Determine the signature name to use (original if available, otherwise default)
if original_signature_names is not None and i < len(original_signature_names):
current_signature_name = original_signature_names[i]
else:
current_signature_name = mutation_context + letters[i]
# Only for context SBS96
if signatures.shape[0] == 96:
lognote = open(
Expand All @@ -423,7 +436,7 @@ def signature_decomposition(
)
lognote.write(
"\n\n\n\n######################## Decomposing "
+ j
+ current_signature_name
+ " ########################\n"
)
lognote.close()
Expand Down Expand Up @@ -473,7 +486,7 @@ def signature_decomposition(
)
lognote.write(
"\n\n\n\n######################## Decomposing "
+ j
+ current_signature_name
+ " ########################\n"
)
lognote.close()
Expand Down Expand Up @@ -518,15 +531,15 @@ def signature_decomposition(
decomposed_signatures = []
contribution_percentages = []

for j in np.nonzero(exposures)[0]:
listofinformation[count * 3] = signames[j]
for sig_idx in np.nonzero(exposures)[0]:
listofinformation[count * 3] = signames[sig_idx]
listofinformation[count * 3 + 1] = round(exposure_percentages[count], 2)
contribution_percentages.append(round(exposure_percentages[count], 2))
listofinformation[count * 3 + 2] = "%"
decomposed_signatures.append(signames[j])
decomposed_signatures.append(signames[sig_idx])
count += 1
ListToTumple = tuple(
[mtype, letters[i]]
[mtype, current_signature_name]
+ listofinformation
+ [L1dist * 100]
+ [L2dist * 100]
Expand All @@ -539,7 +552,13 @@ def signature_decomposition(
weights = []
basis_names = []
nonzero_exposures = exposures[np.nonzero(exposures)]
denovo_name = mutation_context + letters[i]
# Use original signature names if available (preserve original names regardless of database type)
if original_signature_names is not None and i < len(original_signature_names):
# Original signature names are available, use them
denovo_name = original_signature_names[i]
else:
# Fallback to default naming convention (e.g., SBS96A, SBS96B, etc.)
denovo_name = mutation_context + letters[i]
for info in range(0, len(listofinformation), 3):
# print(info)
sigName = listofinformation[info]
Expand Down Expand Up @@ -569,8 +588,23 @@ def signature_decomposition(
mtype_par = "32"
else:
mtype_par = "none"

# Update mtype_par to reflect the correct dimension for plotting
if collapse_to_SBS96 and signatures.shape[0] == 96 and mtype_par in ["288", "1536"]:
mtype_par = "96"

can_plot_decomposition = make_decomposition_plots
# Check for unsupported, non-collapsed contexts before attempting to plot
if mtype_par in ["288", "1536"] and not collapse_to_SBS96 and can_plot_decomposition:
print(f"\nWarning: Decomposition plots for the {mtype_par} context are not currently supported in this workflow when collapse_to_SBS96 is False. "
f"Skipping plot generation for signature {j}.")
can_plot_decomposition = False # Temporarily disable plotting for this specific signature

# only create decomposition plots for COSMIC signatures
if mtype_par != "none" and make_decomposition_plots == True:
if (
mtype_par != "none"
and can_plot_decomposition == True
):
# reformat the first column of cosmic signature dataframe
cosmic_sigs_DF = sigDatabases_DF.copy(deep=True)
cosmic_sigs_DF.columns = ["MutationType"] + cosmic_sigs_DF.columns[
Expand Down Expand Up @@ -631,25 +665,25 @@ def signature_decomposition(
fh.close()

dictionary.update(
{"{}".format(mutation_context + letters[i]): decomposed_signatures}
{"{}".format(current_signature_name): decomposed_signatures}
)

else:
newsig.append(mutation_context + letters[i])
newsig.append(current_signature_name)
newsigmatrixidx.append(i)
fh = open(
directory + "/De_Novo_map_to_COSMIC_" + mutation_context + ".csv", "a"
)
fh.write(
"Signature {}-{}, Signature {}-{}, {}, {}, {}, {}, {}\n".format(
mtype, letters[i], mtype, letters[i], 0, 0, 0, 1, 1
mtype, current_signature_name, mtype, current_signature_name, 0, 0, 0, 1, 1
)
)
fh.close()
dictionary.update(
{
"{}".format(mutation_context + letters[i]): [
"{}".format(mutation_context + letters[i])
"{}".format(current_signature_name): [
"{}".format(current_signature_name)
]
}
)
Expand Down
38 changes: 29 additions & 9 deletions SigProfilerAssignment/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,10 +786,29 @@ def spa_analyze(
allsigids = processAvg.columns.to_list()
except:
allsigids = list(listOfSignatures)

# Preserve originalProcessAvg with original column names BEFORE converting to numpy array
# This is needed to maintain original de novo signature names in decomposition plots
if isinstance(processAvg, pd.DataFrame):
originalProcessAvg = processAvg.copy()
else:
originalProcessAvg = pd.DataFrame(
processAvg, index=index, columns=listOfSignatures
)

processAvg = np.array(processAvg)
signature_names = sub.make_letter_ids(
idlenth=processAvg.shape[1], mtype=mutation_context
)

# Use original signature names if available (preserve original names regardless of database type)
# This ensures exposureAvg column names match attribution dictionary keys
if originalProcessAvg is not None and hasattr(originalProcessAvg, 'columns') and len(originalProcessAvg.columns) > 0:
# Use original column names from the signatures file
exposure_signature_names = list(listOfSignatures)
else:
# Fallback to default naming convention
exposure_signature_names = signature_names

exposureAvg_dummy = (
pd.DataFrame(
Expand All @@ -801,16 +820,10 @@ def spa_analyze(
.rename_axis("Samples")
)
exposureAvg = exposureAvg_dummy
exposureAvg.columns = signature_names
exposureAvg.columns = exposure_signature_names

#############################
# layer_directory2 = output+"/Decompose_Solution"
if isinstance(processAvg, pd.DataFrame):
pass
else:
originalProcessAvg = pd.DataFrame(
processAvg, index=index, columns=listOfSignatures
)
try:
if not os.path.exists(layer_directory2):
os.makedirs(layer_directory2)
Expand All @@ -819,21 +832,27 @@ def spa_analyze(

if (
processAvg.shape[0] == 1536 and collapse_to_SBS96 == True
): # collapse the 1596 context into 96 only for the deocmposition
): # collapse the 1536 context into 96 only for the decomposition
processAvg = pd.DataFrame(processAvg, index=index)
processAvg = processAvg.groupby(processAvg.index.str[1:8]).sum()
genomes = genomes.groupby(genomes.index.str[1:8]).sum()
index = genomes.index
# Also collapse originalProcessAvg to match the collapsed context
# originalProcessAvg has mutation types in the index, so group by index directly
originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[1:8]).sum()
processAvg = np.array(processAvg)

if (
processAvg.shape[0] == 288 and collapse_to_SBS96 == True
): # collapse the 288 context into 96 only for the deocmposition
): # collapse the 288 context into 96 only for the decomposition
processAvg = pd.DataFrame(processAvg, index=index)
processAvg = processAvg.groupby(processAvg.index.str[2:9]).sum()
genomes = pd.DataFrame(genomes, index=index)
genomes = genomes.groupby(genomes.index.str[2:9]).sum()
index = genomes.index
# Also collapse originalProcessAvg to match the collapsed context
# originalProcessAvg has mutation types in the index, so group by index directly
originalProcessAvg = originalProcessAvg.groupby(originalProcessAvg.index.str[2:9]).sum()
processAvg = np.array(processAvg)

print("\n Decomposing De Novo Signatures .....")
Expand All @@ -860,6 +879,7 @@ def spa_analyze(
m_for_subgroups=m_for_subgroups,
volume=volume,
add_background_signatures=add_background_signatures,
collapse_to_SBS96=collapse_to_SBS96,
)
# final_signatures = sub.signature_decomposition(processAvg, m, layer_directory2, genome_build=genome_build)
# extract the global signatures and new signatures from the final_signatures dictionary
Expand Down