Skip to content

Commit df1d5bc

Browse files
authored
Merge pull request #34 from AlexandrovLab/vcf_input_integration
Vcf input integration
2 parents 17c7ed8 + 04d6659 commit df1d5bc

File tree

5 files changed

+44
-19
lines changed

5 files changed

+44
-19
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ Analyze.denovo_fit( samples,
101101
```
102102
### COSMIC Fit
103103
Attributes mutations of given Samples to input COSMIC signatures. Note that penalties associated with denovo fit and COSMIC fits are different.
104+
104105
<img src="SigProfilerAssignment/src/figures/cosmic_fit.jpg" alt="drawing" width="600"/>
105106

106107
```python
@@ -120,7 +121,7 @@ Analyze.cosmic_fit( samples,
120121
## Main Parameters
121122
| Parameter | Variable Type | Parameter Description |
122123
| --------------------- | -------- |-------- |
123-
| **samples** | String | Path to a tab delimilted file that contains the samples table where the rows are mutation types and colunms are sample IDs. |
124+
| **samples** | String | Path to a tab-delimited file that contains the samples table, where the rows are mutation types and the columns are sample IDs; or the path to a directory of VCF files if the input is provided as VCF files. |
124125
| **output** | String | Path to the output folder. |
125126
| **signatures** | String | Path to a tab-delimited file that contains the signature table, where the rows are mutation types and the columns are signature IDs. |
126127
| **genome_build** | String | The reference genome build. List of supported genomes: "GRCh37", "GRCh38", "mm9", "mm10" and "rn6". The default value is "GRCh37". If the selected genome is not in the supported list, the default genome will be used. |
@@ -129,6 +130,7 @@ Analyze.cosmic_fit( samples,
129130
| **make_plots** | Boolean | Toggle on and off for making and saving all plots. Default value is True. |
130131
| **signature_subgroups** | List | Removes the signatures corresponding to specific subtypes for better fitting. The usage is given above. Default value is None. |
131132
| **exome** | Boolean | Defines if the exome renormalized signatures will be used. The default value is False. |
133+
| **vcf_opts** | Dict with keys 'project_name' and 'vcf_context' | Required options if VCF files are provided as input. The 'project_name' key takes a string naming the cohort of VCF samples (defaults to 'Input_vcffiles' if the key is omitted), and the 'vcf_context' key takes the context type of the mutation matrix to be considered for assignment (defaults to '96' if the key is omitted). Valid options include '96', '6', '24', '4608', '288', '18', '6144', '384', '1536', and 'DINUC'. Default value is None. |
132134
| **verbose** | Boolean | Prints statements. Default value is False. |
133135

134136

SigProfilerAssignment/Analyzer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from SigProfilerAssignment import decomposition as decomp
22

3-
def decompose_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05, nnls_remove_penalty=0.01, initial_remove_penalty=0.05,genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,new_signature_thresh_hold=0.8,signature_subgroups=None,exome=False):
3+
def decompose_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05, nnls_remove_penalty=0.01, initial_remove_penalty=0.05,genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,new_signature_thresh_hold=0.8,signature_subgroups=None,exome=False,vcf_opts=None):
44

5-
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= True,denovo_refit_option=False,cosmic_fit_option=False,devopts=devopts,new_signature_thresh_hold=new_signature_thresh_hold,signature_subgroups=signature_subgroups,exome=exome)
5+
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= True,denovo_refit_option=False,cosmic_fit_option=False,devopts=devopts,new_signature_thresh_hold=new_signature_thresh_hold,signature_subgroups=signature_subgroups,exome=exome,vcf_opts=vcf_opts)
66

7-
def denovo_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05,nnls_remove_penalty=0.01, initial_remove_penalty=0.05, genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,new_signature_thresh_hold=0.8):
8-
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, new_signature_thresh_hold=new_signature_thresh_hold, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= False,denovo_refit_option=True,cosmic_fit_option=False,devopts=devopts)
7+
def denovo_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05,nnls_remove_penalty=0.01, initial_remove_penalty=0.05, genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,new_signature_thresh_hold=0.8,vcf_opts=None):
8+
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, new_signature_thresh_hold=new_signature_thresh_hold, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= False,denovo_refit_option=True,cosmic_fit_option=False,devopts=devopts,vcf_opts=vcf_opts)
99

10-
def cosmic_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05, nnls_remove_penalty=0.01, initial_remove_penalty=0.05,genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,signature_subgroups=None,exome=False):
11-
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= False,denovo_refit_option=False,cosmic_fit_option=True,devopts=devopts,signature_subgroups=signature_subgroups,exome=exome)
10+
def cosmic_fit(samples, output, signatures=None, signature_database=None,nnls_add_penalty=0.05, nnls_remove_penalty=0.01, initial_remove_penalty=0.05,genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,signature_subgroups=None,exome=False,vcf_opts=None):
11+
decomp.spa_analyze(samples=samples, output=output, signatures=signatures, signature_database=signature_database,nnls_add_penalty=nnls_add_penalty, nnls_remove_penalty=nnls_remove_penalty, initial_remove_penalty=initial_remove_penalty,genome_build=genome_build, cosmic_version=cosmic_version, make_plots=make_plots, collapse_to_SBS96=collapse_to_SBS96,connected_sigs=connected_sigs, verbose=verbose,decompose_fit_option= False,denovo_refit_option=False,cosmic_fit_option=True,devopts=devopts,signature_subgroups=signature_subgroups,exome=exome,vcf_opts=vcf_opts)

SigProfilerAssignment/decomposition.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from SigProfilerAssignment import decompose_sub_routines as sub
1515
import numpy as np
1616
import pandas as pd
17+
import SigProfilerMatrixGenerator
18+
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as datadump
1719

1820
#import SigProfilerExtractor as cosmic
1921
import os,sys
@@ -22,7 +24,7 @@
2224
def spa_analyze( samples, output, signatures=None, signature_database=None,decompose_fit_option= True,denovo_refit_option=True,cosmic_fit_option=True, nnls_add_penalty=0.05,
2325
nnls_remove_penalty=0.01, initial_remove_penalty=0.05, de_novo_fit_penalty=0.02,
2426
genome_build="GRCh37", cosmic_version=3.3, make_plots=True, collapse_to_SBS96=True,connected_sigs=True, verbose=False,devopts=None,new_signature_thresh_hold=0.8,
25-
signature_subgroups=None, exome=False):
27+
signature_subgroups=None, exome=False,vcf_opts=None):
2628

2729

2830
"""
@@ -74,11 +76,25 @@ def spa_analyze( samples, output, signatures=None, signature_database=None,dec
7476
if (denovo_refit_option == True or decompose_fit_option ==True) and signatures is None:
7577
raise Exception("If denovo_refit or decompose_fit is True, signatures cannot be empty")
7678

77-
try:
78-
genomes = pd.read_csv(samples, sep = "\t", index_col = 0)
79-
except:
80-
genomes = samples
81-
genomes = pd.DataFrame(genomes)
79+
if vcf_opts is not None:
80+
if 'project_name' in vcf_opts:
81+
project_name = vcf_opts['project_name']
82+
else:
83+
project_name = 'Input_vcffiles'
84+
85+
if 'vcf_context' in vcf_opts:
86+
vcf_context = vcf_opts['vcf_context']
87+
else:
88+
vcf_context ='96'
89+
90+
data = datadump.SigProfilerMatrixGeneratorFunc(project_name, genome_build, samples, exome=exome, bed_file=None, chrom_based=False, plot=False, gs=False)
91+
genomes = data[vcf_context]
92+
else:
93+
try:
94+
genomes = pd.read_csv(samples, sep = "\t", index_col = 0)
95+
except:
96+
genomes = samples
97+
genomes = pd.DataFrame(genomes)
8298

8399
# if signatures is None:
84100
# processAvg = sub.getProcessAvg(genomes, genome_build=genome_build, cosmic_version=cosmic_version)[0]

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
if os.path.exists("dist"):
77
shutil.rmtree("dist")
88

9-
VERSION = '0.0.10'
10-
9+
VERSION = '0.0.11'
1110

1211
with open('README.md') as f:
1312
long_description = f.read()
@@ -18,7 +17,7 @@ def write_version_py(filename='SigProfilerAssignment/version.py'):
1817
# THIS FILE IS GENERATED FROM SigProfilerAssignment SETUP.PY
1918
short_version = '%(version)s'
2019
version = '%(version)s'
21-
Update = '1. Reset reference genome to GRCh37 if a not supported genome is selected. 2. Add support for COSMIC exome reference signatures'
20+
Update = 'Integration of VCF files as input'
2221
2322
"""
2423
fh = open(filename, 'w')

test.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@ def main():
1313

1414
signatures = dir_inp+"Results_scenario_8/SBS96/All_Solutions/SBS96_3_Signatures/Signatures/SBS96_S3_Signatures.txt"
1515
samples = dir_inp+"Input_scenario_8/Samples.txt"
16+
# samples = spa.__path__[0]+'/data/vcftest/' If input is a directory of vcf files.
1617
output="output_example/"
1718
sigs= "COSMIC_v3_SBS_GRCh37_noSBS84-85.txt"
18-
19+
20+
21+
# vcf_opts={'project_name': 'test_sample','vcf_context': '288' } # Uncomment this If vcf files are provided as input.
1922

2023
# signature_subgroups = ['remove_MMR_deficiency_signatures',
2124
# 'remove_POL_deficiency_signatures',
@@ -39,14 +42,18 @@ def main():
3942
genome_build="GRCh37",
4043
verbose=False,
4144
new_signature_thresh_hold=0.8,
42-
signature_subgroups=signature_subgroups)
45+
signature_subgroups=signature_subgroups,
46+
# vcf_opts=vcf_opts
47+
)
4348

4449
Analyze.denovo_fit( samples,
4550
output,
4651
signatures=signatures,
4752
signature_database=None,
4853
genome_build="GRCh37",
49-
verbose=False)
54+
verbose=False,
55+
# vcf_opts=vcf_opts
56+
)
5057

5158
Analyze.cosmic_fit( samples,
5259
output,
@@ -56,6 +63,7 @@ def main():
5663
verbose=False,
5764
collapse_to_SBS96=True,
5865
signature_subgroups=signature_subgroups,
66+
# vcf_opts=vcf_opts
5967
)
6068

6169
if __name__ == '__main__':

0 commit comments

Comments
 (0)