|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +Created: March 10th, 2020 |
| 5 | +@author: Mark Barnes |
| 6 | +
|
| 7 | +PlotDecomposition works with matrix formats SigProfiler SBS-96, SBS-1536, DBS-78, |
| 8 | +and ID-83. This program is intended to take two matrices. |
| 9 | +
|
| 10 | +(1) Sample matrix - A SigProfiler formatted SBS-96, SBS-1536, DBS-78, or ID-83 |
| 11 | +matrix. |
| 12 | +(2) Basis matrix - A SigProfiler formatted SBS-96, SBS-1536, DBS-78, or ID-83 |
| 13 | +matrix that is the decomposition of (1). |
| 14 | +
|
| 15 | +When running the function 'run_PlotDecomposition' a plot of the decomposition will |
| 16 | +be generated and saved to the output folder. Refer to the function below to learn |
| 17 | +more about the parameters required to generate the decomposition plot. |
| 18 | +""" |
| 19 | + |
| 20 | +import os |
| 21 | +import pandas as pd |
| 22 | +import numpy as np |
| 23 | +import scipy.stats |
| 24 | +import sigProfilerPlotting as pltCNV |
| 25 | +from SigProfilerAssignment.DecompositionPlots import SigProfilerPlottingMatrix as sigPlt |
| 26 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS96 as spd_96 |
| 27 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS288 as spd_288 |
| 28 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_SBS1536 as spd_1536 |
| 29 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_DBS78 as spd_78 |
| 30 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_ID83 as spd_83 |
| 31 | +from SigProfilerAssignment.DecompositionPlots import PlotDecomposition_CNV48 as cnv_48 |
| 32 | +from SigProfilerAssignment import decompose_subroutines as sub |
| 33 | +# imports for working with plots in memory |
| 34 | +import io |
| 35 | +from PIL import Image |
| 36 | +from reportlab.lib.utils import ImageReader |
| 37 | +# Global Variables |
| 38 | +SBS_CONTEXTS = ["6", "24", "96", "288", "384", "1536", "6144"] |
| 39 | +DBS_CONTEXTS = ["78", "186", "1248", "2976"] |
| 40 | +ID_CONTEXTS = ["28", "83", "415"] |
| 41 | +CNV_CONTEXTS = ["48"] |
| 42 | +mtype_options = ["6", "24", "96", "384", "1536", "6144", "28", "83", "415", "78", "186", "1248", "2976"] |
| 43 | + |
| 44 | +# Helper function for converting BytesIO to image so it can be plotted by reportlab |
| 45 | +def bytes_to_img(byte_png): |
| 46 | + byte_png.seek(0) |
| 47 | + tmp_im=Image.open(byte_png) |
| 48 | + image = ImageReader(tmp_im) |
| 49 | + return image |
| 50 | + |
| 51 | +# Helper function to convert byte array to image array |
| 52 | +def open_byte_to_img_dict(byte_dict): |
| 53 | + img_dict = dict() |
| 54 | + for name in byte_dict.keys(): |
| 55 | + tmp_img = bytes_to_img(byte_dict[name]) |
| 56 | + img_dict[name] = tmp_img |
| 57 | + return img_dict |
| 58 | + |
| 59 | + |
| 60 | + |
| 61 | +def calculate_similarities(denovo, denovo_name, est_denovo): |
| 62 | + from numpy import inf |
| 63 | + |
| 64 | + # If matrix is 1536 context, then collapse it to 96 format |
| 65 | + if denovo.shape[0]==1536: |
| 66 | + index = denovo.iloc[:,0] |
| 67 | + denovo_tmp = pd.DataFrame(denovo, index=index) |
| 68 | + denovo_tmp = denovo.groupby(denovo_tmp.index.str[1:8]).sum() |
| 69 | + denovo = pd.DataFrame(denovo_tmp) |
| 70 | + denovo = denovo.reset_index() |
| 71 | + elif denovo.shape[0]==288: |
| 72 | + index = denovo.iloc[:,0] |
| 73 | + denovo_tmp = pd.DataFrame(denovo, index=index) |
| 74 | + denovo_tmp = denovo.groupby(denovo_tmp.index.str[2:9]).sum() |
| 75 | + denovo = pd.DataFrame(denovo_tmp) |
| 76 | + denovo = denovo.reset_index() |
| 77 | + |
| 78 | + |
| 79 | + sample_names = [denovo_name] |
| 80 | + |
| 81 | + if sample_names is False: |
| 82 | + sample_names = ["None"]*denovo.shape[1] |
| 83 | + |
| 84 | + cosine_similarity_list = [] |
| 85 | + cosine_distance_list = [] |
| 86 | + correlation_list = [] |
| 87 | + correlation_distance_list = [] |
| 88 | + kl_divergence_list = [] |
| 89 | + l1_norm_list = [] |
| 90 | + l2_norm_list = [] |
| 91 | + relative_l1_list = [] |
| 92 | + relative_l2_list = [] |
| 93 | + |
| 94 | + p_i = denovo[denovo_name] |
| 95 | + q_i = est_denovo |
| 96 | + |
| 97 | + cosine_similarity_list.append(round(sub.cos_sim(p_i,q_i ),3)) |
| 98 | + cosine_distance_list.append(round(scipy.spatial.distance.cosine(p_i, q_i),3)) |
| 99 | + correlation_list.append(round(scipy.stats.pearsonr(p_i,q_i)[0],3)) |
| 100 | + correlation_distance_list.append(round(1-scipy.stats.pearsonr(p_i,q_i)[0],3)) |
| 101 | + kl_divergence_list.append(round(scipy.stats.entropy(p_i,q_i),4)) |
| 102 | + l1_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=1),2)) |
| 103 | + relative_l1_list.append(round((l1_norm_list[-1]/np.linalg.norm(p_i, ord=1))*100,3)) |
| 104 | + l2_norm_list.append(round(np.linalg.norm(p_i-q_i , ord=2),2)) |
| 105 | + relative_l2_list.append(round((l2_norm_list[-1]/np.linalg.norm(p_i, ord=2))*100,3)) |
| 106 | + kl_divergence_list = np.array(kl_divergence_list) |
| 107 | + kl_divergence_list[kl_divergence_list == inf] =1000 |
| 108 | + |
| 109 | + similarities_dataframe = pd.DataFrame({"Sample Names": sample_names, \ |
| 110 | + "Cosine Similarity": cosine_similarity_list, \ |
| 111 | + "Cosine Distance": cosine_distance_list, \ |
| 112 | + "Correlation Distance": correlation_distance_list, \ |
| 113 | + "Correlation Coefficient": correlation_list, \ |
| 114 | + "L1 Norm": l1_norm_list, \ |
| 115 | + "L1 Norm %":relative_l1_list, \ |
| 116 | + "L2 Norm": l2_norm_list, \ |
| 117 | + "L2 Norm %": relative_l2_list, \ |
| 118 | + "KL Divergence": kl_divergence_list}) |
| 119 | + similarities_dataframe = similarities_dataframe.set_index("Sample Names") |
| 120 | + |
| 121 | + return similarities_dataframe |
| 122 | + |
| 123 | + |
| 124 | +def genSBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype): |
| 125 | + denovo_plots = dict() |
| 126 | + basis_plots = dict() |
| 127 | + if mtype == "1536" or mtype == "288": |
| 128 | + denovo_plots = sigPlt.plotSBS(denovo_mtx, output_path, project, mtype, True) |
| 129 | + basis_plots = sigPlt.plotSBS(basis_mtx, output_path, project, "96", True) |
| 130 | + elif mtype == "96": |
| 131 | + denovo_plots = sigPlt.plotSBS(denovo_mtx, output_path, project, mtype, True) |
| 132 | + basis_plots = sigPlt.plotSBS(basis_mtx, output_path, project, mtype, True) |
| 133 | + return denovo_plots,basis_plots |
| 134 | + |
| 135 | +def genDBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype): |
| 136 | + denovo_plots = dict() |
| 137 | + basis_plots = dict() |
| 138 | + denovo_plots = sigPlt.plotDBS(denovo_mtx, output_path, project, mtype, True) |
| 139 | + basis_plots = sigPlt.plotDBS(basis_mtx, output_path, project, mtype, True) |
| 140 | + return denovo_plots,basis_plots |
| 141 | + |
| 142 | +def genID_pngs(denovo_mtx, basis_mtx, output_path, project, mtype): |
| 143 | + denovo_plots = dict() |
| 144 | + basis_plots = dict() |
| 145 | + denovo_plots = sigPlt.plotID(denovo_mtx, output_path, project, mtype, True) |
| 146 | + basis_plots = sigPlt.plotID(basis_mtx, output_path, project, mtype, True) |
| 147 | + return denovo_plots,basis_plots |
| 148 | + |
| 149 | +def genCNV_pngs(denovo_mtx, basis_mtx, output_path, project, mtype): |
| 150 | + denovo_plots = dict() |
| 151 | + basis_plots = dict() |
| 152 | + denovo_plots = pltCNV.plotCNV(denovo_mtx, output_path, project, plot_type="pdf", percentage=True, aggregate=False, read_from_file=False, write_to_file=False) |
| 153 | + basis_plots = pltCNV.plotCNV(basis_mtx, output_path, project, plot_type="pdf", percentage=True, aggregate=False, read_from_file=False, write_to_file=False) |
| 154 | + return denovo_plots,basis_plots |
| 155 | + |
| 156 | +# signames, weights |
| 157 | +def gen_sub_plots(denovo_mtx, basis_mtx, output_path, project, mtype): |
| 158 | + |
| 159 | + if mtype in SBS_CONTEXTS: |
| 160 | + if not os.path.exists(output_path): |
| 161 | + os.makedirs(output_path) |
| 162 | + denovo_plots,basis_plots=genSBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype) |
| 163 | + return denovo_plots,basis_plots |
| 164 | + elif mtype in DBS_CONTEXTS: |
| 165 | + if not os.path.exists(output_path): |
| 166 | + os.makedirs(output_path) |
| 167 | + denovo_plots,basis_plots=genDBS_pngs(denovo_mtx, basis_mtx, output_path, project, mtype) |
| 168 | + return denovo_plots,basis_plots |
| 169 | + elif mtype in ID_CONTEXTS: |
| 170 | + if not os.path.exists(output_path): |
| 171 | + os.makedirs(output_path) |
| 172 | + denovo_plots,basis_plots=genID_pngs(denovo_mtx, basis_mtx, output_path, project, mtype) |
| 173 | + return denovo_plots,basis_plots |
| 174 | + elif mtype in CNV_CONTEXTS: |
| 175 | + if not os.path.exists(output_path): |
| 176 | + os.makedirs(output_path) |
| 177 | + denovo_plots, basis_plots=genCNV_pngs(denovo_mtx, basis_mtx, output_path, project, mtype) |
| 178 | + return denovo_plots,basis_plots |
| 179 | + else: |
| 180 | + print("ERROR: mtype is " + mtype + " and is not yet supported.") |
| 181 | + |
| 182 | +# generate the plot for the reconstruction |
| 183 | +def gen_reconstructed_png(denovo_name, basis_mtx, basis_names, weights, output_path, project, mtype): |
| 184 | + reconstruction_plot=dict() |
| 185 | + col_names=[denovo_name] |
| 186 | + mut_col = basis_mtx.iloc[:,0] |
| 187 | + |
| 188 | + recon_plot = basis_mtx[basis_names[0]]*float(weights[0].strip("%"))/100 |
| 189 | + |
| 190 | + for i in range(1,len(weights)): |
| 191 | + recon_plot = recon_plot + basis_mtx[basis_names[i]]*(float(weights[i].strip("%"))/100) |
| 192 | + |
| 193 | + recon_plot = pd.Series(recon_plot, name=denovo_name) |
| 194 | + reconstruction_mtx = pd.concat([mut_col, recon_plot], axis=1) |
| 195 | + if mtype in SBS_CONTEXTS: |
| 196 | + if mtype == "1536" or mtype == "288": |
| 197 | + reconstruction_plot=sigPlt.plotSBS(reconstruction_mtx, output_path, "reconstruction_" + project, "96", True) |
| 198 | + else: |
| 199 | + reconstruction_plot=sigPlt.plotSBS(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True) |
| 200 | + elif mtype in DBS_CONTEXTS: |
| 201 | + reconstruction_plot=sigPlt.plotDBS(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True) |
| 202 | + elif mtype in ID_CONTEXTS: |
| 203 | + reconstruction_plot=sigPlt.plotID(reconstruction_mtx, output_path, "reconstruction_" + project, mtype, True) |
| 204 | + elif mtype in CNV_CONTEXTS: |
| 205 | + reconstruction_plot = pltCNV.plotCNV(reconstruction_mtx, output_path, "reconstruction_"+project, plot_type="pdf", \ |
| 206 | + percentage=True, aggregate=False, read_from_file=False, write_to_file=False) |
| 207 | + else: |
| 208 | + print("ERROR: mtype is " + mtype + " and is not yet supported.") |
| 209 | + |
| 210 | + return reconstruction_mtx,reconstruction_plot |
| 211 | + |
| 212 | + |
| 213 | +def gen_decomposition(denovo_name, basis_names, weights, output_path, project, \ |
| 214 | + mtype, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 215 | + reconstruction=False, statistics=None, sig_version=None, custom_text=None): |
| 216 | + |
| 217 | + """ |
| 218 | + Generate the correct plot based on mtype. |
| 219 | +
|
| 220 | + Parameters: |
| 221 | + ---------- |
| 222 | + denovo_name: (String) Name of denovo signature |
| 223 | + basis_names: (List of Strings) Names of basis signatures |
| 224 | + weights: (List of Strings) Percentile contribution for each basis signature |
| 225 | + output_path: (String) Path to existing output directory |
| 226 | + project: (String) Project name appended to file names |
| 227 | + mtype: (String) The context 'mtype_options' has valid values |
| 228 | + denovo_plots_dict (Dictionary) Signatures are keys, ByteIO plots are values |
| 229 | + basis_plots_dict (Dictionary) Signatures are keys, ByteIO plots are values |
| 230 | + reconstruction_plot_dict (Dictionary) Signatures are keys, ByteIO plots are values |
| 231 | + reconstruction: (Boolean) True to generate plot w/ reconstruction |
| 232 | + statistics: (Pandas Dataframe) Output from calculate_similarities() |
| 233 | + """ |
| 234 | + |
| 235 | + if mtype == "6": |
| 236 | + print("Need to add support for SBS6 Decomposition") |
| 237 | + elif mtype == "24": |
| 238 | + print("Need to add support for SBS24 Decomposition") |
| 239 | + elif mtype == "96": |
| 240 | + byte_plot=spd_96.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 241 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 242 | + reconstruction, statistics, sig_version, custom_text) |
| 243 | + return byte_plot |
| 244 | + elif mtype == "288": |
| 245 | + byte_plot=spd_288.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 246 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 247 | + reconstruction, statistics, sig_version, custom_text) |
| 248 | + return byte_plot |
| 249 | + elif mtype == "384": |
| 250 | + print("Need to add support for SBS24 Decomposition") |
| 251 | + elif mtype == "1536": |
| 252 | + byte_plot=spd_1536.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 253 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 254 | + reconstruction, statistics, sig_version, custom_text) |
| 255 | + return byte_plot |
| 256 | + elif mtype == "6144": |
| 257 | + print("Need to add support for SBS6144 Decomposition") |
| 258 | + elif mtype == "28": |
| 259 | + print("Need to add support for ID28 Decomposition") |
| 260 | + elif mtype == "83": |
| 261 | + byte_plot=spd_83.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 262 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 263 | + reconstruction, statistics, sig_version, custom_text) |
| 264 | + return byte_plot |
| 265 | + elif mtype == "415": |
| 266 | + print("Need to add support for ID415 Decomposition") |
| 267 | + elif mtype == "78": |
| 268 | + byte_plot=spd_78.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 269 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 270 | + reconstruction, statistics, sig_version, custom_text) |
| 271 | + return byte_plot |
| 272 | + elif mtype == "186": |
| 273 | + print("Need to add support for DBS186 Decomposition") |
| 274 | + elif mtype == "1248": |
| 275 | + print("Need to add support for DBS1248 Decomposition") |
| 276 | + elif mtype == "2976": |
| 277 | + print("Need to add support for DBS2976 Decomposition") |
| 278 | + elif mtype == "48": |
| 279 | + byte_plot=cnv_48.gen_decomposition(denovo_name, basis_names, weights, output_path, \ |
| 280 | + project, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 281 | + reconstruction, statistics, sig_version, custom_text) |
| 282 | + return byte_plot |
| 283 | + |
| 284 | + |
| 285 | + |
| 286 | +def run_PlotDecomposition(denovo_mtx, denovo_name, basis_mtx, basis_names, \ |
| 287 | + weights, nonzero_exposures, output_path, project, mtype, sig_version=None, custom_text=None): |
| 288 | + """ |
| 289 | + Generates a decomposition plot of the denovo_mtx using the basis_mtx. |
| 290 | +
|
| 291 | + Parameters: |
| 292 | + ---------- |
| 293 | +
|
| 294 | + denovo_mtx: Pandas Dataframe. This format represents the catalog of mutations seperated by tab. |
| 295 | +
|
| 296 | + denovo_name: String. The name of the one sample in denovo_mtx to decompose. |
| 297 | +
|
| 298 | + basis_mtx: Pandas Dataframe. This format represents the catalog of mutations seperated by tab. |
| 299 | +
|
| 300 | + basis_names: List of Strings. The names of the samples in denovo_mtx that |
| 301 | + the denovo_name sample from denovo_mtx is decomposed into. |
| 302 | + ie. basis_names=["SBS1", "SBS5", "SBS15", "SBS20"] |
| 303 | +
|
| 304 | + weights: List of Strings. The percentile weight corresponding to each basis |
| 305 | + in basis_names. Refer to example function call below for more detail. |
| 306 | + ie. weights=["11.58%", "42.38%", "16.46%", "29.58%"] |
| 307 | +
|
| 308 | + output_path: String. Path to where to store the output. |
| 309 | +
|
| 310 | + project: String. This string is appended to the file name output. |
| 311 | +
|
| 312 | + mtype: String. The context of the data. Valid values include: "96", "1536","78", and "83". |
| 313 | +
|
| 314 | + sig_version: String. The version of signatures being used. |
| 315 | +
|
| 316 | + custom_text: String. A custom message displayed on decomposition plot. |
| 317 | +
|
| 318 | + Returns: |
| 319 | + ------- |
| 320 | + None. |
| 321 | + """ |
| 322 | + |
| 323 | + denovo_plots_dict,basis_plots_dict=gen_sub_plots(denovo_mtx, basis_mtx, output_path, project, mtype) |
| 324 | + reconstructed_mtx,reconstruction_plot_dict = gen_reconstructed_png(denovo_name, basis_mtx, basis_names, \ |
| 325 | + weights, output_path, project, mtype) |
| 326 | + |
| 327 | + present_sigs=np.array(basis_mtx[basis_names]) |
| 328 | + reconstructed_mtx = np.dot(present_sigs,nonzero_exposures) |
| 329 | + |
| 330 | + denovo_plots_dict = open_byte_to_img_dict(denovo_plots_dict) |
| 331 | + basis_plots_dict = open_byte_to_img_dict(basis_plots_dict) |
| 332 | + reconstruction_plot_dict = open_byte_to_img_dict(reconstruction_plot_dict) |
| 333 | + |
| 334 | + statistics=calculate_similarities(denovo_mtx, denovo_name, reconstructed_mtx) |
| 335 | + byte_plot = gen_decomposition(denovo_name, basis_names, weights, output_path, project, \ |
| 336 | + mtype, denovo_plots_dict, basis_plots_dict, reconstruction_plot_dict, \ |
| 337 | + reconstruction=True, statistics=statistics, sig_version=sig_version, \ |
| 338 | + custom_text=custom_text) |
| 339 | + |
| 340 | + |
| 341 | + return byte_plot |
0 commit comments