Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions modules/nf-core/snpeff/build/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment definition for the snpeff/build module.
# Channels are listed with conda-forge ahead of bioconda.
channels:
- conda-forge
- bioconda
# Single dependency: snpEff from bioconda, pinned to an exact version.
# Keep the renovate annotation directly above the package line it describes.
dependencies:
# renovate: datasource=conda depName=bioconda/snpeff
- bioconda::snpeff=5.4.0a
147 changes: 147 additions & 0 deletions modules/nf-core/snpeff/build/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
 * SNPEFF_BUILD
 *
 * Builds a custom snpEff database from a reference FASTA and a GTF/GFF annotation,
 * optionally validating it against CDS and protein FASTA files.
 *
 * Inputs:
 *   - tuple(meta_ref, fasta, annotation, cds, protein): reference metadata and files;
 *     cds and protein may be empty, in which case the corresponding snpEff checks are skipped.
 *   - annotation_format: 'gtf', 'gff' or 'gff3' (case-insensitive); when empty it is derived
 *     from the annotation file name.
 *   - snpeff_config_template: snpEff.config used as the base of the generated config.
 *   - db_name: genome/database name; falls back to meta_ref.id when empty.
 *
 * Outputs:
 *   - db:     snpeff_db/data           (data directory containing <db_name>/)
 *   - config: snpeff_db/snpEff.config  (template plus the appended genome entry)
 *   - versions_snpeff: snpEff version, published on the 'versions' topic.
 *
 * Fails with an error when the annotation format can neither be detected nor is valid.
 */
process SNPEFF_BUILD {
    // Use the same fallback as the script block so the tag is never empty when db_name is not supplied
    tag "${db_name ?: meta_ref.id}"
    label 'process_low'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/30/30669e5208952f30d59d0d559928772f082830d01a140a853fff13a2283a17b0/data'
        : 'community.wave.seqera.io/library/snpeff:5.4.0a--eaf6ce30125b2b17'}"

    input:
    tuple val(meta_ref), path(fasta), path(annotation), path(cds), path(protein) // cds and protein are optional
    // TODO: optionally use , arity: '0..*' or typed inputs? See https://github.com/nextflow-io/nextflow/issues/5111 & https://github.com/nextflow-io/nextflow/issues/1694
    // TODO: optionally use stageAs to change name immediately without relying on the bash script to do it
    val annotation_format // 'gff', 'gtf' or empty (falls back to detection in filename)
    path snpeff_config_template // TODO: instead of requiring users to supply this file, it could also be read from the snpEff install directory
    val db_name

    output:
    // TODO: `tuple val(meta_ref), path("snpeff_db"), emit: db` could serve as an alternative approach => outputs self-contained directory with snpEff config and database, but requires the use of a relative -dataDir option in the snpEff annotate command
    tuple val(meta_ref), path("snpeff_db/data"), emit: db
    tuple val(meta_ref), path("snpeff_db/snpEff.config"), emit: config
    tuple val("${task.process}"), val('snpeff'), eval("snpEff -version 2>&1 | cut -f 2 -d '\t'"), topic: versions, emit: versions_snpeff

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''

    // Resolve the database name into a local instead of overwriting the declared input
    def genome_name = db_name ?: meta_ref.id

    // Normalise an explicitly supplied format so that e.g. 'GTF' is accepted,
    // mirroring the case-insensitive filename detection below
    def anno_format = annotation_format ? annotation_format.toString().toLowerCase() : ''

    // Derive gff/gtf format from the annotation filename if it was not provided
    if (!anno_format) {
        def anno_name = annotation.name.toLowerCase()
        if (anno_name.endsWith('.gtf') || anno_name.endsWith('.gtf.gz')) {
            anno_format = 'gtf'
        } else if (anno_name.endsWith('.gff') || anno_name.endsWith('.gff.gz') ||
                   anno_name.endsWith('.gff3') || anno_name.endsWith('.gff3.gz')) {
            anno_format = 'gff'
        } else {
            error("Could not determine annotation format from filename: ${annotation.name}. " +
                  "Please provide annotation_format parameter ('gtf' or 'gff').")
        }
    }
    if (!(anno_format in ['gtf', 'gff', 'gff3'])) {
        error("Invalid annotation_format: '${annotation_format}'. Must be 'gtf' or 'gff(3)'.")
    }
    // 'gff' and 'gff3' are both staged as genes.gff and parsed with -gff3
    def annotation_file = (anno_format == 'gtf') ? 'genes.gtf' : 'genes.gff'
    def annotation_arg = (anno_format == 'gtf') ? '-gtf22' : '-gff3'

    // add cli arguments to skip checks for cds/protein files when they are not provided
    def no_check_cds_arg = cds ? '' : '-noCheckCds'
    def no_check_protein_arg = protein ? '' : '-noCheckProtein'

    """
    # Create the directory structure snpEff expects
    mkdir -p "snpeff_db/data/${genome_name}"

    # Copy and rename files to match snpEff naming requirements, unzipping them if necessary
    # Note: all paths are quoted so that staged file names containing spaces do not break the commands
    if [[ "${fasta}" == *.gz ]]; then
        gunzip -c "${fasta}" > "snpeff_db/data/${genome_name}/sequences.fa"
    else
        cp "${fasta}" "snpeff_db/data/${genome_name}/sequences.fa"
    fi

    if [[ "${annotation}" == *.gz ]]; then
        gunzip -c "${annotation}" > "snpeff_db/data/${genome_name}/${annotation_file}"
    else
        cp "${annotation}" "snpeff_db/data/${genome_name}/${annotation_file}"
    fi

    # Only copy CDS and proteins files if provided
    # Note: quotes around variables during file existence check are critical,
    # otherwise the tests will default to true when the variable is an empty string
    # e.g., `if [ -f \$undeclared_var ]` = `if [ -f ]` = true
    if [ -f "${cds}" ]; then
        if [[ "${cds}" == *.gz ]]; then
            gunzip -c "${cds}" > "snpeff_db/data/${genome_name}/cds.fa"
        else
            cp "${cds}" "snpeff_db/data/${genome_name}/cds.fa"
        fi
    else
        echo "No CDS file provided, skipping CDS check."
    fi

    if [ -f "${protein}" ]; then
        if [[ "${protein}" == *.gz ]]; then
            gunzip -c "${protein}" > "snpeff_db/data/${genome_name}/protein.fa"
        else
            cp "${protein}" "snpeff_db/data/${genome_name}/protein.fa"
        fi
    else
        echo "No protein file provided, skipping protein check."
    fi

    # Create snpEff config file, starting with the template file
    # Note: dataDir in config file will be ignored since it is overridden by the CLI option
    # Note: written via redirection rather than cp so the new file gets default (writable)
    #       permissions even when the template itself is read-only; it is appended to below
    cat "${snpeff_config_template}" > snpeff_db/snpEff.config

    # Append custom genome configuration to config
    # The leading empty line guarantees the entry starts on its own line even if the
    # template does not end with a newline
    {
        echo ""
        echo "# ${genome_name} genome configuration"
        echo "${genome_name}.genome : ${genome_name}"
    } >> snpeff_db/snpEff.config

    # Build the database - dataDir is relative to the location of snpEff.config when provided as a relative path
    snpEff build \\
        -v \\
        -c snpeff_db/snpEff.config \\
        -dataDir ./data/ \\
        ${annotation_arg} \\
        ${no_check_cds_arg} \\
        ${no_check_protein_arg} \\
        ${args} \\
        ${genome_name}
    """

    stub:
    // Same name/format resolution as the script block, kept in locals so inputs are not overwritten
    def genome_name = db_name ?: meta_ref.id
    def anno_format = annotation_format ? annotation_format.toString().toLowerCase() : ''
    // extract gff/gtf format from filename if not provided
    if (!anno_format) {
        def anno_name = annotation.name.toLowerCase()
        if (anno_name.endsWith('.gtf') || anno_name.endsWith('.gtf.gz')) {
            anno_format = 'gtf'
        } else if (anno_name.endsWith('.gff') || anno_name.endsWith('.gff.gz') ||
                   anno_name.endsWith('.gff3') || anno_name.endsWith('.gff3.gz')) {
            anno_format = 'gff'
        } else {
            // The stub never fails on an unrecognised file name; it defaults to GTF
            anno_format = 'gtf'
        }
    }
    def annotation_file = (anno_format == 'gtf') ? 'genes.gtf' : 'genes.gff'
    """
    # Create the expected directory structure
    mkdir -p "snpeff_db/data/${genome_name}"

    # Create empty files matching the real process outputs
    touch "snpeff_db/data/${genome_name}/sequences.fa"
    touch "snpeff_db/data/${genome_name}/${annotation_file}"
    touch "snpeff_db/data/${genome_name}/cds.fa"
    touch "snpeff_db/data/${genome_name}/protein.fa"

    # Create config file in correct location
    touch snpeff_db/snpEff.config
    """
}
38 changes: 27 additions & 11 deletions modules/nf-core/snpeff/snpeff/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@ process SNPEFF_SNPEFF {

input:
tuple val(meta), path(vcf)
val db
tuple val(meta2), path(cache)
tuple val(meta_db), path(snpeff_db)
tuple val(meta_config), path(snpeff_config)
// tuple val(meta_db), path(snpeff_db) // TODO alternative option: provide the entire snpeff_db folder from SNPEFF_BUILD
val db_name // Optional name of the snpEff genome database, defaults to meta_db.id
// TODO optional db-path? as value/string to make it relative to snpEff.config or absolute path?
// TODO: add interval filter -fi , -filterInterval <file> : Only analyze changes that intersect with the intervals specified in this file (you may use this option many times)

output:
tuple val(meta), path("*.ann.vcf"), emit: vcf
tuple val(meta), val("${task.process}"), val('snpeff'), path("*.csv"), topic: multiqc_files, emit: report
tuple val(meta), val("${task.process}"), val('snpeff'), path("*.html"), topic: multiqc_files, emit: summary_html
tuple val(meta), val("${task.process}"), val('snpeff'), path("*.genes.txt"), topic: multiqc_files, emit: genes_txt
tuple val(meta), path("*.ann.vcf") , emit: vcf
tuple val(meta), path("*.csv") , emit: report
tuple val(meta), path("*.html") , emit: summary
tuple val(meta), path("*.genes.txt") , emit: genes
tuple val("${task.process}"), val('snpeff'), eval("snpEff -version 2>&1 | cut -f 2 -d '\t'"), topic: versions, emit: versions_snpeff

when:
Expand All @@ -32,15 +36,27 @@ process SNPEFF_SNPEFF {
avail_mem = (task.memory.mega * 0.8).intValue()
}
def prefix = task.ext.prefix ?: "${meta.id}"
def cache_command = cache ? "-dataDir \${PWD}/${cache}" : ""
// set db_name to meta_db.id of snpeff_db folder when not supplied
db_name = db_name ?: meta_db.id
// ensure snpeff_db and snpeff_config match
if( meta_config?.id && meta_db?.id && meta_config.id != meta_db.id ) {
error "[snpEff] snpEff database metadata id '${meta_db.id}' does not match config metadata id '${meta_config.id}'."
}
"""
snpEff \\
# The snpeff_db folder contains:
# snpEff.config (generated by BUILD)
# data/${db_name}/ (the actual database)

# NOTE: dataDir is relative to snpEff.config when provided as a relative path. Depends on whether or not self-contained database dir + config is passed as input (relative path) or if they are provided separately (absolute path).
snpEff ann \\
-Xmx${avail_mem}M \\
-XX:-UsePerfData \\
${db} \\
${args} \\
-config ${snpeff_config} \\
-dataDir ${snpeff_db} \\
-csvStats ${prefix}.csv \\
${cache_command} \\
-stats ${prefix}.html \\
${args} \\
${db_name} \\
${vcf} \\
> ${prefix}.ann.vcf
"""
Expand Down
Loading