Skip to content

Commit 6f61680

Browse files
authored
Merge pull request #41 from AlexandrovLab/decomp_mods
Decomp mods
2 parents 3c258eb + 5b844d8 commit 6f61680

34 files changed

+5977
-13227
lines changed

MANIFEST.in

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,5 @@ include SigProfilerAssignment/data/Reference_Signatures/GRCh38/*
66
include SigProfilerAssignment/data/Reference_Signatures/mm9/*
77
include SigProfilerAssignment/data/Reference_Signatures/mm10/*
88
include SigProfilerAssignment/data/Reference_Signatures/rn6/*
9-
include SigProfilerAssignment/src/FormatFiles/*
10-
include SigProfilerAssignment/src/Fonts/*
11-
include SigProfilerAssignment/src/*
9+
include SigProfilerAssignment/DecompositionPlots/reference_files/Fonts/*
10+
include SigProfilerAssignment/DecompositionPlots/reference_files/*

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44

55

6-
<img src="SigProfilerAssignment/src/figures/SigProfilerAssignment.png" alt="drawing" width="1000"/>
6+
<img src="SigProfilerAssignment/figures/SigProfilerAssignment.png" alt="drawing" width="1000"/>
77

88
# SigProfilerAssignment
99
SigProfilerAssignment is a new mutational attribution and decomposition tool that performs the following functions:
@@ -72,7 +72,7 @@ spa_analyze( samples, output, signatures=None, signature_database=None,decompo
7272
``` -->
7373
### Decompose Fit
7474
Decomposes the De Novo Signatures into COSMIC Signatures and assigns COSMIC signatures into samples.
75-
<img src="SigProfilerAssignment/src/figures/decomp_pic.jpg" alt="drawing" width="600"/>
75+
<img src="SigProfilerAssignment/figures/decomp_pic.jpg" alt="drawing" width="600"/>
7676

7777
```python
7878
from SigProfilerAssignment import Analyzer as Analyze
@@ -88,7 +88,7 @@ Analyze.decompose_fit(samples,
8888
```
8989
### *De Novo* Fit
9090
Attributes mutations of given Samples to input denovo signatures.
91-
<img src="SigProfilerAssignment/src/figures/denovo_fit.jpg" alt="drawing" width="600"/>
91+
<img src="SigProfilerAssignment/figures/denovo_fit.jpg" alt="drawing" width="600"/>
9292

9393
```python
9494
from SigProfilerAssignment import Analyzer as Analyze
@@ -102,7 +102,7 @@ Analyze.denovo_fit( samples,
102102
### COSMIC Fit
103103
Attributes mutations of given Samples to input COSMIC signatures. Note that penalties associated with denovo fit and COSMIC fits are different.
104104

105-
<img src="SigProfilerAssignment/src/figures/cosmic_fit.jpg" alt="drawing" width="600"/>
105+
<img src="SigProfilerAssignment/figures/cosmic_fit.jpg" alt="drawing" width="600"/>
106106

107107
```python
108108
from SigProfilerAssignment import Analyzer as Analyze
Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Created: March 10th, 2020
5+
@author: Mark Barnes
6+
7+
PlotDecomposition works with matrix formats SigProfiler SBS-96, SBS-1536, DBS-78,
8+
and ID-83. This program is intended to take two matrices.
9+
10+
(1) Sample matrix - A SigProfiler formatted SBS-96, SBS-1536, DBS-78, or ID-83
11+
matrix.
12+
(2) Basis matrix - A SigProfiler formatted SBS-96, SBS-1536, DBS-78, or ID-83
13+
matrix that is the decomposition of (1).
14+
15+
When running the function 'run_PlotDecomposition' a plot of the decomposition will
16+
be generated and saved to the output folder. Refer to the function below to learn
17+
more about the parameters required to generate the decomposition plot.
18+
"""
19+
20+
import os
21+
import pandas as pd
22+
import numpy as np
23+
import scipy.stats
24+
import sigProfilerPlotting as pltCNV
25+
from SigProfilerAssignment.DecompositionPlots import SigProfilerPlottingMatrix as sigPlt
26+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS96 as spd_96
27+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS288 as spd_288
28+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS1536 as spd_1536
29+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_DBS78 as spd_78
30+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_ID83 as spd_83
31+
from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_CNV48 as cnv_48
32+
from SigProfilerAssignment import decompose_subroutines as sub
33+
# imports for working with plots in memory
34+
import io
35+
from PIL import Image
36+
from reportlab.lib.utils import ImageReader
37+
# Global Variables
38+
SBS_CONTEXTS = ["6", "24", "96", "288", "384", "1536", "6144"]
39+
DBS_CONTEXTS = ["78", "186", "1248", "2976"]
40+
ID_CONTEXTS = ["28", "83", "415"]
41+
CNV_CONTEXTS = ["48"]
42+
mtype_options = ["6", "24", "96", "384", "1536", "6144", "28", "83", "415", "78", "186", "1248", "2976"]
43+
44+
# Helper function for converting BytesIO to image so it can be plotted by reportlab
45+
def bytes_to_img(byte_png):
46+
byte_png.seek(0)
47+
tmp_im=Image.open(byte_png)
48+
image = ImageReader(tmp_im)
49+
return image
50+
51+
# Helper function to convert byte array to image array
52+
def open_byte_to_img_dict(byte_dict):
53+
img_dict = dict()
54+
for name in byte_dict.keys():
55+
tmp_img = bytes_to_img(byte_dict[name])
56+
img_dict[name] = tmp_img
57+
return img_dict
58+
59+
60+
61+
def calculate_similarities(denovo, denovo_name, est_denovo):
62+
from numpy import inf
63+
64+
# If matrix is 1536 context, then collapse it to 96 format
65+
if denovo.shape[0]==1536:
66+
index = denovo.iloc[:,0]
67+
denovo_tmp = pd.DataFrame(denovo, index=index)
68+
denovo_tmp = denovo.groupby(denovo_tmp.index.str[1:8]).sum()
69+
denovo = pd.DataFrame(denovo_tmp)
70+
denovo = denovo.reset_index()
71+
elif denovo.shape[0]==288:
72+
index = denovo.iloc[:,0]
73+
denovo_tmp = pd.DataFrame(denovo, index=index)
74+
denovo_tmp = denovo.groupby(denovo_tmp.index.str[2:9]).sum()
75+
denovo = pd.DataFrame(denovo_tmp)
76+
denovo = denovo.reset_index()
77+
78+
79+
sample_names = [denovo_name]
80+
81+
if sample_names is False:
82+
sample_names = ["None"]*denovo.shape[1]
83+
84+
cosine_similarity_list = []
85+
cosine_distance_list = []
86+
correlation_list = []
87+
correlation_distance_list = []
88+
kl_divergence_list = []
89+
l1_norm_list = []
90+
l2_norm_list = []
91+
relative_l1_list = []
92+
relative_l2_list = []
93+
94+
p_i = denovo[denovo_name]
95+
q_i = est_denovo
96+
97+
cosine_similarity_list.append(round(sub.cos_sim(p_i,q_i ),3))
98+
cosine_distance_list.append(round(scipy.spatial.distance.cosine(p_i, q_i),3))
99+
correlation_list.append(round(scipy.stats.pearsonr(p_i,q_i)[0],3))
100+
correlation_distance_list.append(round(1-scipy.stats.pearsonr(p_i,q_i)[0],3))
101+
kl_divergence_list.append(round(scipy.stats.entropy(p_i,q_i),4))
102+
l1_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=1),2))
103+
relative_l1_list.append(round((l1_norm_list[-1]/np.linalg.norm(p_i, ord=1))*100,3))
104+
l2_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=2),2))
105+
relative_l2_list.append(round((l2_norm_list[-1]/np.linalg.norm(p_i, ord=2))*100,3))
106+
kl_divergence_list = np.array(kl_divergence_list)
107+
kl_divergence_list[kl_divergence_list == inf] =1000
108+
109+
similarities_dataframe = pd.DataFrame({"Sample Names": sample_names, \
110+
"Cosine Similarity": cosine_similarity_list, \
111+
"Cosine Distance": cosine_distance_list, \
112+
"Correlation Distance": correlation_distance_list, \
113+
"Correlation Coefficient": correlation_list, \
114+
"L1 Norm": l1_norm_list, \
115+
"L1 Norm %":relative_l1_list, \
116+
"L2 Norm": l2_norm_list, \
117+
"L2 Norm %": relative_l2_list, \
118+
"KL Divergence": kl_divergence_list})
119+
similarities_dataframe = similarities_dataframe.set_index("Sample Names")
120+
121+
return similarities_dataframe
122+
123+
124+
def genSBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype):
125+
denovo_plots = dict()
126+
basis_plots = dict()
127+
if mtype == "1536" or mtype == "288":
128+
denovo_plots = sigPlt.plotSBS(denovo_mtx, output_path, project, mtype, True)
129+
basis_plots = sigPlt.plotSBS(basis_mtx, output_path, project, "96", True)
130+
elif mtype == "96":
131+
denovo_plots = sigPlt.plotSBS(denovo_mtx, output_path, project, mtype, True)
132+
basis_plots = sigPlt.plotSBS(basis_mtx, output_path, project, mtype, True)
133+
return denovo_plots,basis_plots
134+
135+
def genDBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype):
136+
denovo_plots = dict()
137+
basis_plots = dict()
138+
denovo_plots = sigPlt.plotDBS(denovo_mtx, output_path, project, mtype, True)
139+
basis_plots = sigPlt.plotDBS(basis_mtx, output_path, project, mtype, True)
140+
return denovo_plots,basis_plots
141+
142+
def genID_pngs(denovo_mtx, basis_mtx, output_path, project, mtype):
143+
denovo_plots = dict()
144+
basis_plots = dict()
145+
denovo_plots = sigPlt.plotID(denovo_mtx, output_path, project, mtype, True)
146+
basis_plots = sigPlt.plotID(basis_mtx, output_path, project, mtype, True)
147+
return denovo_plots,basis_plots
148+
149+
def genCNV_pngs(denovo_mtx, basis_mtx, output_path, project, mtype):
150+
denovo_plots = dict()
151+
basis_plots = dict()
152+
denovo_plots = pltCNV.plotCNV(denovo_mtx, output_path, project, plot_type="pdf", percentage=True, aggregate=False, read_from_file=False, write_to_file=False)
153+
basis_plots = pltCNV.plotCNV(basis_mtx, output_path, project, plot_type="pdf", percentage=True, aggregate=False, read_from_file=False, write_to_file=False)
154+
return denovo_plots,basis_plots
155+
156+
# signames, weights
157+
def gen_sub_plots(denovo_mtx, basis_mtx, output_path, project, mtype):
158+
159+
if mtype in SBS_CONTEXTS:
160+
if not os.path.exists(output_path):
161+
os.makedirs(output_path)
162+
denovo_plots,basis_plots=genSBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype)
163+
return denovo_plots,basis_plots
164+
elif mtype in DBS_CONTEXTS:
165+
if not os.path.exists(output_path):
166+
os.makedirs(output_path)
167+
denovo_plots,basis_plots=genDBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype)
168+
return denovo_plots,basis_plots
169+
elif mtype in ID_CONTEXTS:
170+
if not os.path.exists(output_path):
171+
os.makedirs(output_path)
172+
denovo_plots,basis_plots=genID_pngs(denovo_mtx, basis_mtx, output_path, project, mtype)
173+
return denovo_plots,basis_plots
174+
elif mtype in CNV_CONTEXTS:
175+
if not os.path.exists(output_path):
176+
os.makedirs(output_path)
177+
denovo_plots, basis_plots=genCNV_pngs(denovo_mtx, basis_mtx, output_path, project, mtype)
178+
return denovo_plots,basis_plots
179+
else:
180+
print("ERROR: mtype is " + mtype + " and is not yet supported.")
181+
182+
# generate the plot for the reconstruction
183+
def gen_reconstructed_png(denovo_name, basis_mtx, basis_names, weights, output_path, project, mtype):
184+
reconstruction_plot=dict()
185+
col_names=[denovo_name]
186+
mut_col = basis_mtx.iloc[:,0]
187+
188+
recon_plot = basis_mtx[basis_names[0]]*float(weights[0].strip("%"))/100
189+
190+
for i in range(1,len(weights)):
191+
recon_plot = recon_plot + basis_mtx[basis_names[i]]*(float(weights[i].strip("%"))/100)
192+
193+
recon_plot = pd.Series(recon_plot, name=denovo_name)
194+
reconstruction_mtx = pd.concat([mut_col, recon_plot], axis=1)
195+
if mtype in SBS_CONTEXTS:
196+
if mtype == "1536" or mtype == "288":
197+
reconstruction_plot=sigPlt.plotSBS(reconstruction_mtx, output_path, "reconstruction_" + project, "96", True)
198+
else:
199+
reconstruction_plot=sigPlt.plotSBS(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True)
200+
elif mtype in DBS_CONTEXTS:
201+
reconstruction_plot=sigPlt.plotDBS(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True)
202+
elif mtype in ID_CONTEXTS:
203+
reconstruction_plot=sigPlt.plotID(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True)
204+
elif mtype in CNV_CONTEXTS:
205+
reconstruction_plot = pltCNV.plotCNV(reconstruction_mtx, output_path, "reconstruction_"+project, plot_type="pdf", \
206+
percentage=True, aggregate=False, read_from_file=False, write_to_file=False)
207+
else:
208+
print("ERROR: mtype is " + mtype + " and is not yet supported.")
209+
210+
return reconstruction_mtx,reconstruction_plot
211+
212+
213+
def gen_decomposition(denovo_name, basis_names, weights, output_path, project, \
214+
mtype, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
215+
reconstruction=False, statistics=None, sig_version=None, custom_text=None):
216+
217+
"""
218+
Generate the correct plot based on mtype.
219+
220+
Parameters:
221+
----------
222+
denovo_name: (String) Name of denovo signature
223+
basis_names: (List of Strings) Names of basis signatures
224+
weights: (List of Strings) Percentile contribution for each basis signature
225+
output_path: (String) Path to existing output directory
226+
project: (String) Project name appended to file names
227+
mtype: (String) The context 'mtype_options' has valid values
228+
denovo_plots_dict (Dictionary) Signatures are keys, ByteIO plots are values
229+
basis_plots_dict (Dictionary) Signatures are keys, ByteIO plots are values
230+
reconstruction_plot_dict (Dictionary) Signatures are keys, ByteIO plots are values
231+
reconstruction: (Boolean) True to generate plot w/ reconstruction
232+
statistics: (Pandas Dataframe) Output from calculate_similarities()
233+
"""
234+
235+
if mtype == "6":
236+
print("Need to add support for SBS6 Decomposition")
237+
elif mtype == "24":
238+
print("Need to add support for SBS24 Decomposition")
239+
elif mtype == "96":
240+
byte_plot=spd_96.gen_decomposition(denovo_name, basis_names, weights, output_path, \
241+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
242+
reconstruction, statistics, sig_version, custom_text)
243+
return byte_plot
244+
elif mtype == "288":
245+
byte_plot=spd_288.gen_decomposition(denovo_name, basis_names, weights, output_path, \
246+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
247+
reconstruction, statistics, sig_version, custom_text)
248+
return byte_plot
249+
elif mtype == "384":
250+
print("Need to add support for SBS24 Decomposition")
251+
elif mtype == "1536":
252+
byte_plot=spd_1536.gen_decomposition(denovo_name, basis_names, weights, output_path, \
253+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
254+
reconstruction, statistics, sig_version, custom_text)
255+
return byte_plot
256+
elif mtype == "6144":
257+
print("Need to add support for SBS6144 Decomposition")
258+
elif mtype == "28":
259+
print("Need to add support for ID28 Decomposition")
260+
elif mtype == "83":
261+
byte_plot=spd_83.gen_decomposition(denovo_name, basis_names, weights, output_path, \
262+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
263+
reconstruction, statistics, sig_version, custom_text)
264+
return byte_plot
265+
elif mtype == "415":
266+
print("Need to add support for ID415 Decomposition")
267+
elif mtype == "78":
268+
byte_plot=spd_78.gen_decomposition(denovo_name, basis_names, weights, output_path, \
269+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
270+
reconstruction, statistics, sig_version, custom_text)
271+
return byte_plot
272+
elif mtype == "186":
273+
print("Need to add support for DBS186 Decomposition")
274+
elif mtype == "1248":
275+
print("Need to add support for DBS1248 Decomposition")
276+
elif mtype == "2976":
277+
print("Need to add support for DBS2976 Decomposition")
278+
elif mtype == "48":
279+
byte_plot=cnv_48.gen_decomposition(denovo_name, basis_names, weights, output_path, \
280+
project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
281+
reconstruction, statistics, sig_version, custom_text)
282+
return byte_plot
283+
284+
285+
286+
def run_PlotDecomposition(denovo_mtx, denovo_name, basis_mtx, basis_names, \
287+
weights, nonzero_exposures, output_path, project, mtype, sig_version=None, custom_text=None):
288+
"""
289+
Generates a decomposition plot of the denovo_mtx using the basis_mtx.
290+
291+
Parameters:
292+
----------
293+
294+
denovo_mtx: Pandas Dataframe. This format represents the catalog of mutations seperated by tab.
295+
296+
denovo_name: String. The name of the one sample in denovo_mtx to decompose.
297+
298+
basis_mtx: Pandas Dataframe. This format represents the catalog of mutations seperated by tab.
299+
300+
basis_names: List of Strings. The names of the samples in denovo_mtx that
301+
the denovo_name sample from denovo_mtx is decomposed into.
302+
ie. basis_names=["SBS1", "SBS5", "SBS15", "SBS20"]
303+
304+
weights: List of Strings. The percentile weight corresponding to each basis
305+
in basis_names. Refer to example function call below for more detail.
306+
ie. weights=["11.58%", "42.38%", "16.46%", "29.58%"]
307+
308+
output_path: String. Path to where to store the output.
309+
310+
project: String. This string is appended to the file name output.
311+
312+
mtype: String. The context of the data. Valid values include: "96", "1536","78", and "83".
313+
314+
sig_version: String. The version of signatures being used.
315+
316+
custom_text: String. A custom message displayed on decomposition plot.
317+
318+
Returns:
319+
-------
320+
None.
321+
"""
322+
323+
denovo_plots_dict,basis_plots_dict=gen_sub_plots(denovo_mtx, basis_mtx, output_path, project, mtype)
324+
reconstructed_mtx,reconstruction_plot_dict = gen_reconstructed_png(denovo_name, basis_mtx, basis_names, \
325+
weights, output_path, project, mtype)
326+
327+
present_sigs=np.array(basis_mtx[basis_names])
328+
reconstructed_mtx = np.dot(present_sigs,nonzero_exposures)
329+
330+
denovo_plots_dict = open_byte_to_img_dict(denovo_plots_dict)
331+
basis_plots_dict = open_byte_to_img_dict(basis_plots_dict)
332+
reconstruction_plot_dict = open_byte_to_img_dict(reconstruction_plot_dict)
333+
334+
statistics=calculate_similarities(denovo_mtx, denovo_name, reconstructed_mtx)
335+
byte_plot = gen_decomposition(denovo_name, basis_names, weights, output_path, project, \
336+
mtype, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \
337+
reconstruction=True, statistics=statistics, sig_version=sig_version, \
338+
custom_text=custom_text)
339+
340+
341+
return byte_plot

0 commit comments

Comments
 (0)