Skip to content

Commit 8bcdf81

Browse files
author
Philipp Rentzsch
authored
Merge pull request #16 from kircherlab/development
Intergrate v1.6 scripts
2 parents f75f8f6 + 0a31de3 commit 8bcdf81

40 files changed

+22965
-34324
lines changed

.gitignore

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# data folders
2-
data/*
2+
data/prescored/*
3+
data/annotations/*
34
input/*
45
output/*
5-
config/*
66

77
# compiled files
88
*.pyc
@@ -12,12 +12,14 @@ config/*
1212

1313
# snakemake
1414
.snakemake*
15+
envs/*
1516

1617
# data files
1718
*.gz
1819
*.vcf
1920
*.tsv
2021
*.zip
22+
*.tbi
2123

2224
# temp files
2325
.*

CADD.sh

Lines changed: 33 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#!/bin/bash
22

3-
usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile> -- CADD version 1.5
3+
usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile> -- CADD version 1.6
44
55
where:
66
-h show this help text
77
-o out tsv.gz file (generated from input file name if not set)
88
-g genome build (supported are GRCh37 and GRCh38 [default: GRCh38])
9-
-v CADD version (either v1.4 or v1.5 [default: v1.5])
9+
-v CADD version (only v1.6 possible with this set of scripts [default: v1.6])
1010
-a include annotation in output
11-
input vcf of vcf.gz file (required)"
11+
input vcf of vcf.gz file (required)
12+
-q print basic information about snakemake run
13+
-p print full information about the snakemake run
14+
-c number of cores that snakemake is allowed to use [default: 1]
15+
"
1216

1317
unset OPTARG
1418
unset OPTIND
@@ -17,8 +21,10 @@ export LC_ALL=C
1721
GENOMEBUILD="GRCh38"
1822
ANNOTATION=false
1923
OUTFILE=""
20-
VERSION="v1.5"
21-
while getopts ':ho:g:v:a' option; do
24+
VERSION="v1.6"
25+
VERBOSE="-q"
26+
CORES="1"
27+
while getopts ':ho:g:v:c:aqp' option; do
2228
case "$option" in
2329
h) echo "$usage"
2430
exit
@@ -29,8 +35,14 @@ while getopts ':ho:g:v:a' option; do
2935
;;
3036
v) VERSION=$OPTARG
3137
;;
38+
c) CORES=$OPTARG  # -c <n>: core count forwarded to snakemake via --cores (arm was mislabelled 'v', so -c was ignored)
39+
;;
3240
a) ANNOTATION=true
3341
;;
42+
q) VERBOSE=""
43+
;;
44+
p) VERBOSE="-p"
45+
;;
3446
\?) printf "illegal option: -%s\n" "$OPTARG" >&2
3547
echo "$usage" >&2
3648
exit 1
@@ -41,7 +53,7 @@ shift $((OPTIND-1))
4153

4254
INFILE=$1
4355

44-
echo "CADD-v1.5 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved."
56+
echo "CADD-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved."
4557

4658
set -ueo pipefail
4759

@@ -72,37 +84,18 @@ then
7284
exit 1
7385
fi
7486

75-
if [ "$VERSION" != "v1.4" ] && [ "$VERSION" != "v1.5" ]
87+
if [ "$VERSION" != "v1.6" ]
7688
then
77-
echo "Unknown/Unsupported CADD version $VERSION. This script currently only supports v1.4 and v1.5."
89+
echo "Unknown/Unsupported CADD version $VERSION. This set of script currently only supports v1.6."
90+
echo "If you want to score another version of CADD, please download the accordingly tagged version of the scripts"
7891
exit 1
7992
fi
8093

81-
if [ "$VERSION" == "v1.5" ] && [ "$GENOMEBUILD" == "GRCh37" ]
82-
then
83-
echo "Please note that CADD scores for GRCh37 version v1.5 are the same as in v1.4."
84-
VERSION="v1.4"
85-
fi
86-
8794
if [ "$ANNOTATION" = 'true' ]
8895
then
89-
ANNO_FOLDER="incl_anno"
96+
CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}.yml
9097
else
91-
ANNO_FOLDER="no_anno"
92-
fi
93-
94-
# Pipeline configuration
95-
PRESCORED_FOLDER=$CADD/data/prescored/${GENOMEBUILD}_${VERSION}/$ANNO_FOLDER
96-
REFERENCE_CONFIG=$CADD/config/references_${GENOMEBUILD}_${VERSION}.cfg
97-
IMPUTE_CONFIG=$CADD/config/impute_${GENOMEBUILD}_${VERSION}.cfg
98-
MODEL=$CADD/data/models/$GENOMEBUILD/CADD${VERSION}-$GENOMEBUILD.mod
99-
CONVERSION_TABLE=$CADD/data/models/$GENOMEBUILD/conversionTable_CADD${VERSION}-$GENOMEBUILD.txt
100-
101-
# determine VEP database version
102-
DBVERSION=92
103-
if [ "$GENOMEBUILD" == "GRCh38" ] && [ "$VERSION" == "v1.5" ]
104-
then
105-
DBVERSION=95
98+
CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}_noanno.yml
10699
fi
107100

108101
# Setup temporary folder that is removed reliably on exit and is outside of
@@ -111,101 +104,19 @@ TMP_FOLDER=$(mktemp -d)
111104
trap "rm -rf $TMP_FOLDER" ERR EXIT
112105

113106
# Temp files
114-
TMP_PRE=$TMP_FOLDER/$NAME.pre.tsv.gz
115-
TMP_VCF=$TMP_FOLDER/$NAME.vcf
116-
TMP_ANNO=$TMP_FOLDER/$NAME.anno.tsv.gz
117-
TMP_IMP=$TMP_FOLDER/$NAME.csv.gz
118-
TMP_NOV=$TMP_FOLDER/$NAME.nov.tsv.gz
119-
120-
mkdir -p $TMP_FOLDER
121-
122-
### Pipeline
107+
TMP_INFILE=$TMP_FOLDER/$NAME.$FILEFORMAT
108+
TMP_OUTFILE=$TMP_FOLDER/$NAME.tsv.gz
123109

124-
# Loading the environment
125-
if [ "$VERSION" == "v1.4" ]
126-
then
127-
source activate cadd-env
128-
else
129-
source activate cadd-env-v1.5
130-
fi
131-
132-
# File preparation
133-
if [ "$FILEFORMAT" == "vcf" ]
134-
then
135-
cat $INFILE \
136-
| python $CADD/src/scripts/VCF2vepVCF.py \
137-
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
138-
| uniq > $TMP_VCF
139-
else
140-
zcat $INFILE \
141-
| python $CADD/src/scripts/VCF2vepVCF.py \
142-
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
143-
| uniq > $TMP_VCF
144-
fi
145-
146-
# Prescoring
147-
echo '## Prescored variant file' | gzip -c > $TMP_PRE;
148-
if [ -d $PRESCORED_FOLDER ]
149-
then
150-
for PRESCORED in $(ls $PRESCORED_FOLDER/*.tsv.gz)
151-
do
152-
cat $TMP_VCF \
153-
| python $CADD/src/scripts/extract_scored.py --header \
154-
-p $PRESCORED --found_out=$TMP_PRE.tmp \
155-
> $TMP_VCF.tmp;
156-
gzip -c $TMP_PRE.tmp >> $TMP_PRE
157-
mv $TMP_VCF.tmp $TMP_VCF;
158-
done;
159-
rm $TMP_PRE.tmp
160-
fi
110+
cp $INFILE $TMP_INFILE
161111

162-
# Variant annotation
163-
cat $TMP_VCF \
164-
| vep --quiet --cache --buffer 1000 --no_stats --offline --vcf \
165-
--dir $CADD/data/annotations/${GENOMEBUILD}_${VERSION}/vep \
166-
--species homo_sapiens --db_version=$DBVERSION \
167-
--assembly $GENOMEBUILD --regulatory --sift b \
168-
--polyphen b --per_gene --ccds --domains --numbers --canonical \
169-
--total_length --force_overwrite --format vcf --output_file STDOUT \
170-
--warning_file STDERR \
171-
| python $CADD/src/scripts/annotateVEPvcf.py -c $REFERENCE_CONFIG \
172-
| gzip -c > $TMP_ANNO
173-
rm $TMP_VCF
174-
175-
# Imputation
176-
zcat $TMP_ANNO \
177-
| python $CADD/src/scripts/trackTransformation.py -b \
178-
-c $IMPUTE_CONFIG -o $TMP_IMP --noheader;
179-
180-
# Score prediction
181-
python $CADD/src/scripts/predictSKmodel.py \
182-
-i $TMP_IMP -m $MODEL -a $TMP_ANNO \
183-
| python $CADD/src/scripts/max_line_hierarchy.py --all \
184-
| python $CADD/src/scripts/appendPHREDscore.py -t $CONVERSION_TABLE \
185-
| gzip -c > $TMP_NOV;
186-
rm $TMP_ANNO
187-
rm $TMP_IMP
188-
189-
if [ "$ANNOTATION" = 'false' ]
190-
then
191-
if [ "$GENOMEBUILD" == "GRCh38" ]
192-
then
193-
COLUMNS="1-4,124,125"
194-
else
195-
COLUMNS="1-4,106,107"
196-
fi
197-
zcat $TMP_NOV | cut -f $COLUMNS | uniq | gzip -c > $TMP_NOV.tmp
198-
mv $TMP_NOV.tmp $TMP_NOV
199-
fi
112+
echo "Running snakemake pipeline:"
113+
echo snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES
114+
echo --configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE
115+
snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES \
116+
--configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE
200117

201-
# Join pre and novel scored variants
202-
{
203-
echo "##CADD $GENOMEBUILD-$VERSION (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.";
204-
head -n 1 < <(zcat $TMP_NOV);
205-
zcat $TMP_PRE $TMP_NOV | grep -v "^#" | sort -k1,1 -k2,2n -k3,3 -k4,4 || true;
206-
} | bgzip -c > $OUTFILE;
207-
rm $TMP_NOV
208-
rm $TMP_PRE
118+
mv $TMP_OUTFILE $OUTFILE
119+
rm $TMP_INFILE # is in temp folder, should not be necessary
209120

210121
OUTFILE=$(echo $OUTFILE | sed 's/^\.\///')
211122
echo -e "\nCADD scored variants written to file: $OUTFILE"

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Copyright (c) University of Washington, Hudson-Alpha Institute for
2-
Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
2+
Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.
33

44
Permission is hereby granted, to all non-commercial users and licensees of CADD
55
(Combined Annotation Dependent Framework, licensed by the University of

README.md

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Please check our [website for updates and further information](http://cadd.gs.wa
2424

2525
## Offline Installation
2626

27-
This section describes how users can setup CADD version 1.5 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.
27+
This section describes how users can setup CADD version 1.6 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.
2828

2929
### Prerequisite
3030

@@ -35,10 +35,12 @@ wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
3535
bash Miniconda2-latest-Linux-x86_64.sh -p $HOME/miniconda2 -b
3636
export PATH=$HOME/miniconda2/bin:$PATH
3737
```
38+
- snakemake (installed via conda)
39+
```bash
40+
conda install -c conda-forge -c bioconda snakemake
41+
```
3842

39-
*Note: You can also install CADD without conda by installing the dependencies otherwise. You can find the list of tools (VEP) and (python) libraries in the [`src/environment.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment.yml) file (for CADD GRCh38-v1.5 use [`src/environment_v1.5.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment_v1.5.yml)). In this case, you will also have to disable the line `source activate cadd-env` in `CADD.sh`*
40-
41-
*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200).*
43+
*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200). Also make sure to use snakemake >= 4.0, as some command line parameters are not available in earlier versions.*
4244

4345
### Setup
4446

@@ -58,48 +60,50 @@ This is the easier way of installing CADD, just run:
5860
./install.sh
5961
```
6062

61-
You first state which parts you want to install (the environment as well as at least one genome build including annotation tracks are neccessary for a quick start) and the script should manage loading and unpacking the neccessary files.
63+
You first state which parts you want to install (the environments as well as at least one genome build including annotation tracks are necessary for a quick start) and the script should manage loading and unpacking the necessary files.
6264

6365
#### Manual installation
6466

65-
Running CADD depends on three big building blocks (plus the repository containing this README which we assume you already downloaded):
67+
Running CADD depends on four big building blocks (plus the repository containing this README which we assume you already downloaded):
6668

69+
- snakemake
6770
- dependencies
6871
- genome annotations
6972
- prescored variants
7073

7174
**Installing dependencies**
7275

73-
As stated already in the Prerequisite you can install CADD dependencies without conda, although we heavily recommend doing so. This is because managing the various parts becomes very handy by relying it. To setup the neccessary environment, we only need to run the command:
76+
As of this version, dependencies have to be installed via conda and snakemake. This is because we are using two different environments for python2 and python3.
7477

7578
```bash
76-
conda env create -f src/environment.yml
79+
snakemake test/input.vcf --use-conda --create-envs-only --conda-prefix envs \
80+
--configfile config/config_GRCh38_v1.6.yml --snakefile Snakefile
7781
```
7882

79-
After the installing process (which will take a few minutes), the CADD conda environment will be loaded (via `source activate cadd-env` automatically in the `CADD.sh` script) and CADD can run without further settings.
83+
Please note that we are installing both conda environments in the CADD subdirectory `envs` via `--conda-prefix envs`. If you do not want this behavior (we do this in order to not install the environments in all active directories you run CADD from), adjust or remove this parameter.
8084

8185
**Installing annotations**
8286

8387
Both versions of CADD (for the different genome builds) rely on a large number of genomic annotations. Depending on which genome build you require, you can get them from our website (be careful where you put them, as these are really big files and have identical filenames) via:
8488

8589
```bash
8690
# for GRCh37 / hg19
87-
wget -c http://krishna.gs.washington.edu/download/CADD/v1.4/annotationsGRCh37.tar.gz
91+
wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh37_v1.6.tar.gz
8892
# for GRCh38 / hg38
89-
wget -c http://krishna.gs.washington.edu/download/CADD/v1.5/annotationsGRCh38.tar.gz
93+
wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh38_v1.6.tar.gz
9094
```
9195

9296
As those files are about 100 and 200 GB in size, downloads can take long (depending on your internet connection). We recommend to setup the process in the background and using a tool (like `wget -c` mentioned above) that allows you to continue an interrupted download.
9397

94-
To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget http://krishna.gs.washington.edu/download/CADD/v1.4/GRCh37/MD5SUMs`) and checking for completness (via `md5sum -c`).
98+
To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/MD5SUMs`) and checking for completeness (via `md5sum -c`).
9599

96100
The annotation files are finally put in the folder `data/annotations` and unpacked:
97101

98102
```bash
99103
cd data/annotations
100-
tar -zxvf annotationsGRCh37.tar.gz
104+
tar -zxvf annotationsGRCh37_v1.6.tar.gz
101105
mv GRCh37 GRCh37_v1.4
102-
tar -zxvf annotationsGRCh38.tar.gz
106+
tar -zxvf annotationsGRCh38_v1.6.tar.gz
103107
cd $OLDPWD
104108
```
105109

@@ -109,35 +113,25 @@ At this point you are ready to go, but if you want a faster version of CADD, you
109113

110114
### Running CADD
111115

112-
You run CADD via the script `CADD.sh` which technically only requieres an either vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v`, request a fully annotated output (`-a` flag) and specify a seperate output file via `-o` (else inputfile name `.tsv.gz` is used). I.e:
116+
You run CADD via the script `CADD.sh` which technically only requires either a vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v` (deprecated, the new version of the scripts only supports v1.6), request a fully annotated output (`-a` flag) and specify a separate output file via `-o` (else inputfile name `.tsv.gz` is used). For example:
113117

114118
```bash
115119
./CADD.sh test/input.vcf
116120

117-
./CADD.sh -a -g GRCh37 -v v1.4 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
121+
./CADD.sh -a -g GRCh37 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
118122
```
119123

120124
You can test whether your CADD is set up properly by comparing to the example files in the `test` directory.
121125

122126
### Update
123127

124-
Between versions 1.4 and 1.5, we adjusted the CADD repository slightly. If you used CADD before and obviously do not want to download all v1.4 files again, please proceed as follows:
125-
126-
1. update the repository (just overwriting is fine, however this dublicates some moved files so `git pull` is prefered)
127-
2. rename the annotation (and prescored) folder from `data/annotations/$GENOMEBUILD` to `data/annotations/${GENOMEBUILD}_${VERSION}`
128-
129-
```
130-
mv data/annotations/GRCh37 data/annotations/GRCh37_v1.4
131-
mv data/annotations/GRCh38 data/annotations/GRCh38_v1.4
128+
Version 1.6 includes some changes in comparison to v1.5. Next to the obvious switch of the pipeline into a Snakemake workflow, which became necessary due to the ongoing issues with `conda activate`, the new models for v1.6 are extended by more specialized annotations for splicing variants, as well as a few minor changes in some other annotations (most prominent: fixed gerp for GRCh38) and changes in consequence categories which make these scripts incompatible with CADD v1.4 and v1.5. If you are still using those versions, please use [version 1.5 of this repository](https://github.com/kircherlab/CADD-scripts/archive/CADD1.5.zip).
132129

133-
# if you have prescored files
134-
mv data/prescored/GRCh37 data/prescored/GRCh37_v1.4
135-
mv data/prescored/GRCh38 data/prescored/GRCh38_v1.4
136130
```
137131
138132
## Copyright
139133
Copyright (c) University of Washington, Hudson-Alpha Institute for
140-
Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
134+
Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.
141135
142136
Permission is hereby granted, to all non-commercial users and licensees of CADD
143137
(Combined Annotation Dependent Framework, licensed by the University of

0 commit comments

Comments
 (0)