nf-core · pmoris · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # renovate: datasource=conda depName=bioconda/snpeff
+  - bioconda::snpeff=5.4.0a
@@ -0,0 +1,147 @@
+process SNPEFF_BUILD {
+    tag "$db_name"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/30/30669e5208952f30d59d0d559928772f082830d01a140a853fff13a2283a17b0/data'
+        : 'community.wave.seqera.io/library/snpeff:5.4.0a--eaf6ce30125b2b17'}"
+
+    input:
+    tuple val(meta_ref), path(fasta), path(annotation), path(cds), path(protein)    // cds and protein are optional
+    // TODO: optionally use , arity: '0..*' or typed inputs? See https://github.com/nextflow-io/nextflow/issues/5111 & https://github.com/nextflow-io/nextflow/issues/1694
+    // TODO: optionally use stageAs to change name immediately without relying on the bash script to do it
+    val annotation_format           // 'gff', 'gtf' or empty (falls back to detection in filename)
+    path snpeff_config_template     // TODO: instead of requiring users to supply this file, it could also be read from the snpEff install directory
+    val db_name
+
+    output:
+    // TODO: `tuple val(meta_ref), path("snpeff_db"), emit: db` could serve as an alternative approach => outputs self-contained directory with snpEff config and database, but requires the use of a relative -dataDir option in the snpEff annotate command
+    tuple val(meta_ref), path("snpeff_db/data"),            emit: db
+    tuple val(meta_ref), path("snpeff_db/snpEff.config"),   emit: config
+    tuple val("${task.process}"), val('snpeff'), eval("snpEff -version 2>&1 | cut -f 2 -d '\t'"), topic: versions, emit: versions_snpeff
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // set db_name to meta.id of reference when not supplied
+    db_name = db_name ?: meta_ref.id
+
+    // extract gff/gtf format from filename if not provided
+    if (!annotation_format) {
+        def anno_name = annotation.name.toLowerCase()
+        if (anno_name.endsWith('.gtf') || anno_name.endsWith('.gtf.gz')) {
+            annotation_format = 'gtf'
+        } else if (anno_name.endsWith('.gff') || anno_name.endsWith('.gff.gz') ||
+                anno_name.endsWith('.gff3') || anno_name.endsWith('.gff3.gz')) {
+            annotation_format = 'gff'
+        } else {
+            error "Could not determine annotation format from filename: ${annotation.name}. " +
+                "Please provide annotation_format parameter ('gtf' or 'gff')."
+        }
+    }
+    if (annotation_format != 'gtf' && annotation_format != 'gff' && annotation_format != 'gff3' ) {
+        error "Invalid annotation_format: '${annotation_format}'. Must be 'gtf' or 'gff(3)'."
+    }
+    def annotation_file = (annotation_format == 'gtf') ? 'genes.gtf' : 'genes.gff'
+    def annotation_arg = (annotation_format == 'gtf') ? '-gtf22' : '-gff3'
+
+    // add cli arguments to skip checks for cds/protein files when they are not provided
+    def no_check_cds_arg = cds ? '' : '-noCheckCds'
+    def no_check_protein_arg = protein ? '' : '-noCheckProtein'
+
+    """
+    # Create the directory structure snpEff expects
+    mkdir -p snpeff_db/data/${db_name}
+
+    # Copy and rename files to match snpEff naming requirements, unzipping them if necessary
+    if [[ "${fasta}" == *.gz ]]; then
+        gunzip -c ${fasta} > snpeff_db/data/${db_name}/sequences.fa
+    else
+        cp ${fasta} snpeff_db/data/${db_name}/sequences.fa
+    fi
+
+    if [[ "${annotation}" == *.gz ]]; then
+        gunzip -c ${annotation} > snpeff_db/data/${db_name}/${annotation_file}
+    else
+        cp ${annotation} snpeff_db/data/${db_name}/${annotation_file}
+    fi
+
+    # Only copy CDS and proteins files if provided
+    # Note: quotes around variables during file existence check are critical,
+    # otherwise the tests will default to true when the variable is an empty string
+    # e.g., `if [ -f \$undeclared_var ]` = `if [ -f ]` = true
+    if [ -f "${cds}" ]; then
+        if [[ "${cds}" == *.gz ]]; then
+            gunzip -c ${cds} > snpeff_db/data/${db_name}/cds.fa
+        else
+            cp ${cds} snpeff_db/data/${db_name}/cds.fa
+        fi
+    else
+        echo "No CDS file provided, skipping CDS check."
+    fi
+
+    if [ -f "${protein}" ]; then
+        if [[ "${protein}" == *.gz ]]; then
+            gunzip -c ${protein} > snpeff_db/data/${db_name}/protein.fa
+        else
+            cp "${protein}" "snpeff_db/data/${db_name}/protein.fa"
+        fi
+    else
+        echo "No protein file provided, skipping protein check."
+    fi
+
+    # Create snpEff config file, starting with the template file
+    # Note: dataDir in config file will be ignored since it is overridden by the CLI option
+    cp ${snpeff_config_template} snpeff_db/snpEff.config
+
+    # Append custom genome configuration to config
+    cat >> snpeff_db/snpEff.config << EOF
+# ${db_name} genome configuration
+${db_name}.genome : ${db_name}
+EOF
+
+    # Build the database - dataDir is relative to the location of snpEff.config when provided as a relative path
+    snpEff build \\
+        -v \\
+        -c snpeff_db/snpEff.config \\
+        -dataDir ./data/ \\
+        ${annotation_arg} \\
+        ${no_check_cds_arg} \\
+        ${no_check_protein_arg} \\
+        ${args} \\
+        ${db_name}
+    """
+
+    stub:
+    db_name = db_name ?: meta_ref.id
+    // extract gff/gtf format from filename if not provided
+    if (!annotation_format) {
+        def anno_name = annotation.name.toLowerCase()
+        if (anno_name.endsWith('.gtf') || anno_name.endsWith('.gtf.gz')) {
+            annotation_format = 'gtf'
+        } else if (anno_name.endsWith('.gff') || anno_name.endsWith('.gff.gz') ||
+                anno_name.endsWith('.gff3') || anno_name.endsWith('.gff3.gz')) {
+            annotation_format = 'gff'
+        } else {
+            annotation_format = 'gtf'
+        }
+    }
+    def annotation_file = (annotation_format == 'gtf') ? 'genes.gtf' : 'genes.gff'
+    """
+    # Create the expected directory structure
+    mkdir -p snpeff_db/data/${db_name}
+
+    # Create empty files matching the real process outputs
+    touch snpeff_db/data/${db_name}/sequences.fa
+    touch snpeff_db/data/${db_name}/${annotation_file}
+    touch snpeff_db/data/${db_name}/cds.fa
+    touch snpeff_db/data/${db_name}/protein.fa
+
+    # Create config file in correct location
+    touch snpeff_db/snpEff.config
+    """
+}
@@ -9,14 +9,18 @@ process SNPEFF_SNPEFF {
 
     input:
     tuple val(meta), path(vcf)
-    val db
-    tuple val(meta2), path(cache)
+    tuple val(meta_db), path(snpeff_db)
+    tuple val(meta_config), path(snpeff_config)
+    // tuple val(meta_db), path(snpeff_db) // TODO alternative option: provide the entire snpeff_db folder from SNPEFF_BUILD
+    val db_name                         // Optional name of the snpEff genome database, defaults to meta_db.id
+    // TODO optional db-path? as value/string to make it relative to snpEff.config or absolute path?
+    // TODO: add interval filter -fi , -filterInterval  <file>   : Only analyze changes that intersect with the intervals specified in this file (you may use this option many times)
 
     output:
-    tuple val(meta), path("*.ann.vcf"), emit: vcf
-    tuple val(meta), val("${task.process}"), val('snpeff'), path("*.csv"), topic: multiqc_files, emit: report
-    tuple val(meta), val("${task.process}"), val('snpeff'), path("*.html"), topic: multiqc_files, emit: summary_html
-    tuple val(meta), val("${task.process}"), val('snpeff'), path("*.genes.txt"), topic: multiqc_files, emit: genes_txt
+    tuple val(meta), path("*.ann.vcf")    , emit: vcf
+    tuple val(meta), path("*.csv")        , emit: report
+    tuple val(meta), path("*.html")       , emit: summary
+    tuple val(meta), path("*.genes.txt")  , emit: genes
     tuple val("${task.process}"), val('snpeff'), eval("snpEff -version 2>&1 | cut -f 2 -d '\t'"), topic: versions, emit: versions_snpeff
 
     when:
@@ -32,15 +36,27 @@ process SNPEFF_SNPEFF {
         avail_mem = (task.memory.mega * 0.8).intValue()
     }
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def cache_command = cache ? "-dataDir \${PWD}/${cache}" : ""
+    // set db_name to meta_db.id of snpeff_db folder when not supplied
+    db_name = db_name ?: meta_db.id
+    // ensure snpeff_db and snpeff_config match
+    if( meta_config?.id && meta_db?.id && meta_config.id != meta_db.id ) {
+        error "[snpEff] snpEff database metadata id '${meta_db.id}' does not match config metadata id '${meta_config.id}'."
+    }
     """
-    snpEff \\
+    # The snpeff_db folder contains:
+    #   snpEff.config (generated by BUILD)
+    #   data/${db_name}/ (the actual database)
+
+    # NOTE: dataDir is relative to snpEff.config when provided as a relative path. Depends on whether or not self-contained database dir + config is passed as input (relative path) or if they are provided separately (absolute path).
+    snpEff ann \\
         -Xmx${avail_mem}M \\
         -XX:-UsePerfData \\
-        ${db} \\
-        ${args} \\
+        -config ${snpeff_config} \\
+        -dataDir ${snpeff_db} \\
         -csvStats ${prefix}.csv \\
-        ${cache_command} \\
+        -stats ${prefix}.html \\
+        ${args} \\
+        ${db_name} \\
         ${vcf} \\
         > ${prefix}.ann.vcf
     """