diff --git a/modules/nf-core/lsa/cosine/environment.yml b/modules/nf-core/lsa/cosine/environment.yml
new file mode 100644
index 000000000000..67fda59b5823
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/environment.yml
@@ -0,0 +1,10 @@
+---
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::r-base=4.5.2
+  - conda-forge::r-lsa=0.73.4
+  - conda-forge::r-pheatmap=1.0.13
+  - conda-forge::r-optparse=1.7.5
+  - conda-forge::r-readr=2.1.6
diff --git a/modules/nf-core/lsa/cosine/main.nf b/modules/nf-core/lsa/cosine/main.nf
new file mode 100644
index 000000000000..adff29bf8021
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/main.nf
@@ -0,0 +1,40 @@
+process LSA_COSINE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/4d/4d94f159b95315adf8bf54fdc9db88db10a5aef72dca6245dd163b91e9e0437e/data' :
+        'community.wave.seqera.io/library/r-base_r-lsa_r-pheatmap_r-optparse_pruned:901156bc11e60b28' }"
+
+    input:
+    tuple val(meta), path(expression_matrix)
+
+    output:
+    tuple val(meta), path("*_matrix.csv") , emit: matrix
+    tuple val(meta), path("*_heatmap.png"), emit: heatmap
+    path "versions.yml"                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    template 'cosine.R'
+
+    stub:
+    args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "dummy matrix data" > ${prefix}_matrix.csv
+    echo "dummy matrix data" > ${prefix}_heatmap.png
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(R --version | sed -n 1p | sed 's/R version //g' | sed 's/ (.*//g')
+        r-lsa: \$(Rscript -e "library(lsa); cat(as.character(packageVersion('lsa')))")
+        r-pheatmap: \$(Rscript -e "library(pheatmap); cat(as.character(packageVersion('pheatmap')))")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/lsa/cosine/meta.yml b/modules/nf-core/lsa/cosine/meta.yml
new file mode 100644
index 000000000000..964cac6cd7de
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/meta.yml
@@ -0,0 +1,71 @@
+# name matches directory structure
+name: "lsa_cosine"
+description: Calculates the cosine similarity matrix between samples based on a gene
+  expression matrix.
+keywords:
+  - similarity
+  - cosine
+  - clustering
+  - rnaseq
+  - heatmap
+tools:
+  - "lsa":
+      description: "Latent Semantic Analysis (LSA) package for R."
+      homepage: "https://cran.r-project.org/web/packages/lsa/index.html"
+      documentation: "https://cran.r-project.org/web/packages/lsa/lsa.pdf"
+      licence: ["GPL-2"]
+      identifier: ""
+  - "pheatmap":
+      description: "Pretty Heatmaps package for R."
+      homepage: "https://cran.r-project.org/web/packages/pheatmap/index.html"
+      documentation: "https://cran.r-project.org/web/packages/pheatmap/pheatmap.pdf"
+      licence: ["GPL-2"]
+      identifier: biotools:pheatmap
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information, e.g. [ id:'test' ].
+    - expression_matrix:
+        type: file
+        description: |
+          CSV file containing the expression matrix.
+          Rows should be features (genes) and columns should be samples.
+        pattern: "*.csv"
+        ontologies:
+          - edam: http://edamontology.org/format_3752 # CSV
+output:
+  matrix:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information, e.g. [ id:'test' ].
+      - "*_matrix.csv":
+          type: file
+          description: A square matrix (CSV) containing pairwise similarity scores.
+          pattern: "*_matrix.csv"
+          ontologies:
+            - edam: http://edamontology.org/format_3752 # CSV
+  heatmap:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information, e.g. [ id:'test' ].
+      - "*_heatmap.png":
+          type: file
+          description: A PNG image visualizing the similarity matrix as a heatmap.
+          pattern: "*_heatmap.png"
+          ontologies: []
+  versions:
+    - versions.yml:
+        type: file
+        description: File containing software versions.
+        pattern: "versions.yml"
+
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
+authors:
+  - "@miguelrosell"
+maintainers:
+  - "@miguelrosell"
diff --git a/modules/nf-core/lsa/cosine/templates/cosine.R b/modules/nf-core/lsa/cosine/templates/cosine.R
new file mode 100644
index 000000000000..3f3f46fcb421
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/templates/cosine.R
@@ -0,0 +1,99 @@
+#!/usr/bin/env Rscript
+
+# Nextflow template: builds a sample-by-sample similarity matrix from an
+# expression table (features in rows, samples in columns), then writes the
+# matrix as CSV, a heatmap as PNG and the tool versions as YAML.
+
+suppressPackageStartupMessages({
+    library(optparse)
+    library(readr)
+    library(lsa)
+    library(pheatmap)
+})
+
+# ---- Configurable options via task.ext.args ----
+option_list <- list(
+    make_option(c("-m", "--method"),
+        type = "character", default = "cosine",
+        help = "Similarity method: cosine, pearson, spearman"),
+    make_option(c("-g", "--min_gene_mean"),
+        type = "double", default = 0.0,
+        help = "Minimum gene mean expression filter")
+)
+
+# Trim before splitting and drop empty tokens, so that stray whitespace
+# around ext.args never reaches the parser as an empty argument.
+cli_args <- strsplit(trimws("$args"), "[[:space:]]+")[[1]]
+cli_args <- cli_args[nzchar(cli_args)]
+
+opt <- parse_args(
+    OptionParser(option_list = option_list),
+    args = cli_args
+)
+
+# ---- Load table ----
+df <- as.data.frame(
+    read_csv("${expression_matrix}",
+        col_types = cols(.default = col_guess()))
+)
+
+# ---- Drop the identifier column if the first column is text ----
+# The identifiers are not installed as row names: no output uses them, and
+# duplicated or missing identifiers would make that assignment fail.
+has_id_col <- ncol(df) > 0L &&
+    (is.character(df[[1]]) || is.factor(df[[1]]))
+if (has_id_col) {
+    df[[1]] <- NULL
+}
+
+# ---- Select numeric columns as the expression matrix ----
+# vapply keeps the mask logical even when the table has no columns left.
+numeric_cols <- vapply(df, is.numeric, logical(1))
+mat <- as.matrix(df[, numeric_cols, drop = FALSE])
+if (ncol(mat) == 0L) {
+    stop("No numeric sample columns found in ${expression_matrix}",
+        call. = FALSE)
+}
+
+# ---- Filter low-expression genes ----
+if (opt\$min_gene_mean > 0) {
+    keep <- rowMeans(mat, na.rm = TRUE) >= opt\$min_gene_mean
+    mat <- mat[keep, , drop = FALSE]
+}
+
+# ---- Compute similarity ----
+# Any method other than "cosine" is passed straight to stats::cor().
+if (opt\$method == "cosine") {
+    sim <- lsa::cosine(mat)
+} else {
+    sim <- cor(mat,
+        method = opt\$method, use = "pairwise.complete.obs")
+}
+colnames(sim) <- colnames(mat)
+rownames(sim) <- colnames(mat)
+
+# ---- Save matrix ----
+write.csv(sim, "${prefix}_matrix.csv",
+    quote = FALSE, row.names = TRUE)
+
+# ---- Save heatmap ----
+png("${prefix}_heatmap.png", width = 900, height = 700)
+pheatmap(sim,
+    display_numbers = TRUE,
+    cluster_rows = FALSE,
+    cluster_cols = FALSE)
+dev.off()
+
+# ---- Versions ----
+writeLines(
+    c(
+        '"${task.process}":',
+        paste("    r-base:",
+            paste0(R.version\$major, ".", R.version\$minor)),
+        paste("    r-lsa:",
+            as.character(packageVersion("lsa"))),
+        paste("    r-pheatmap:",
+            as.character(packageVersion("pheatmap")))
+    ),
+    "versions.yml"
+)
diff --git a/modules/nf-core/lsa/cosine/tests/main.nf.test b/modules/nf-core/lsa/cosine/tests/main.nf.test
new file mode 100644
index 000000000000..5210aa68513a
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/tests/main.nf.test
@@ -0,0 +1,63 @@
+nextflow_process {
+
+    name "Test Process LSA_COSINE"
+    script "../main.nf"
+    process "LSA_COSINE"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "lsa"
+    tag "lsa/cosine"
+
+    // Test 1: Real run
+    test("Should run successfully with a simple CSV") {
+
+        when {
+            process {
+                """
+                // Create dummy CSV
+                def test_csv = file("test_data.csv")
+                test_csv.text = '''gene_id,sample_A,sample_B,sample_C
+                GENE_1,10.0,10.5,0.0
+                GENE_2,50.0,50.5,2.0
+                GENE_3,0.0,0.0,100.0
+                GENE_4,100.0,99.5,10.0'''
+
+                input[0] = [ [id:'test_sample'], test_csv ]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            // Snapshot versions and the matrix content (heatmap is binary, so we check name only)
+            assert snapshot(
+                process.out.matrix,
+                process.out.versions,
+                file(process.out.heatmap[0][1]).name
+            ).match()
+        }
+
+    }
+
+    // Test 2: Stub run (fast check)
+    test("Stub run") {
+        options "-stub"
+
+        when {
+            process {
+                """
+                def test_csv = file("test_data.csv")
+                test_csv.text = '''gene_id,sample_A,sample_B''' // content doesn't matter for stub
+                input[0] = [ [id:'test_sample'], test_csv ]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+
+}
diff --git a/modules/nf-core/lsa/cosine/tests/main.nf.test.snap b/modules/nf-core/lsa/cosine/tests/main.nf.test.snap
new file mode 100644
index 000000000000..cb48832156a0
--- /dev/null
+++ b/modules/nf-core/lsa/cosine/tests/main.nf.test.snap
@@ -0,0 +1,72 @@
+{
+    "Stub run": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test_sample"
+                        },
+                        "test_sample_matrix.csv:md5,3932b231a8a14016bc1e7a245f05246c"
+                    ]
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test_sample"
+                        },
+                        "test_sample_heatmap.png:md5,3932b231a8a14016bc1e7a245f05246c"
+                    ]
+                ],
+                "2": [
+                    "versions.yml:md5,b77351a76d91db12e57dd88f7e2e0184"
+                ],
+                "heatmap": [
+                    [
+                        {
+                            "id": "test_sample"
+                        },
+                        "test_sample_heatmap.png:md5,3932b231a8a14016bc1e7a245f05246c"
+                    ]
+                ],
+                "matrix": [
+                    [
+                        {
+                            "id": "test_sample"
+                        },
+                        "test_sample_matrix.csv:md5,3932b231a8a14016bc1e7a245f05246c"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,b77351a76d91db12e57dd88f7e2e0184"
+                ]
+            }
+        ],
+        "timestamp": "2026-02-20T15:51:35.612855607",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.04.7"
+        }
+    },
+    "Should run successfully with a simple CSV": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test_sample"
+                    },
+                    "test_sample_matrix.csv:md5,d84209a90c22f35ef4e6ea89a9d11770"
+                ]
+            ],
+            [
+                "versions.yml:md5,b77351a76d91db12e57dd88f7e2e0184"
+            ],
+            "test_sample_heatmap.png"
+        ],
+        "timestamp": "2026-02-13T02:24:53.88495333",
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.04.7"
+        }
+    }
+}
\ No newline at end of file