|
| 1 | +#' Select high-quality control replicates via TMMwsp log-CPM correlation |
| 2 | +#' |
| 3 | +#' @description |
| 4 | +#' For a given control group (e.g., DMSO) on a specific plate/batch, this function |
| 5 | +#' ranks samples by their average correlation (Fisher z–averaged) to all *other* |
| 6 | +#' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) |
| 7 | +#' plots per-sample expression distributions and sample–sample correlation heatmaps. |
| 8 | +#' |
| 9 | +#' @param data A tidyseurat object containing an RNA assay with a **counts** layer. |
| 10 | +#' @param samples the control/treatment label to keep in column samples |
| 11 | +#' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered. |
| 12 | +#' @param orig_ident Character scalar: the plate/batch identifier to keep |
| 13 | +#' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered. |
| 14 | +#' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to |
| 15 | +#' normalization (default `1`). |
| 16 | +#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least |
| 17 | +#' this many samples to be retained (default `16`). |
| 18 | +#' @param corr_method Correlation type used for ranking; one of |
| 19 | +#' `c("spearman","pearson")` (default `"spearman"`). |
| 20 | +#' @param top_n Integer; the number of top-ranked samples to report in `topN`. |
| 21 | +#' Ties at the cutoff are kept (default `5`). |
| 22 | +#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman |
| 23 | +#' correlation heatmaps (default `TRUE`). |
| 24 | +#' |
| 25 | +#' @details |
| 26 | +#' Workflow: |
| 27 | +#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch). |
| 28 | +#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`. |
| 29 | +#' 3) Normalize with **TMMwsp** and compute log2-CPM. |
| 30 | +#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples |
| 31 | +#' (according to `corr_method`). |
| 32 | +#' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally) |
| 33 | +#' plots for QC. |
| 34 | +#' |
| 35 | +#' Column names of the counts matrix are rewritten to `"<orig.ident>_<Well_ID>"` |
| 36 | +#' for easier visual inspection in plots. |
| 37 | +#' |
| 38 | +#' @return A list with elements: |
| 39 | +#' \item{subset_obj}{The Seurat object subset used for analysis.} |
| 40 | +#' \item{dge}{The filtered `edgeR::DGEList`.} |
| 41 | +#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} |
| 42 | +#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).} |
| 43 | +#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.} |
| 44 | +#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.} |
| 45 | +#' \item{ranking_method}{The correlation method used for ranking.} |
| 46 | +#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed |
| 47 | +#' correlations (higher = better), sorted decreasing.} |
| 48 | +#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} |
| 49 | +#' |
| 50 | +#' @examples |
| 51 | +#' data(mini_mac) |
| 52 | +#' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") |
| 53 | +#' |
| 54 | +#' |
| 55 | +#' |
| 56 | +#' @importFrom edgeR DGEList calcNormFactors cpm |
| 57 | +#' @importFrom tibble rownames_to_column |
| 58 | +#' @importFrom tidyr pivot_longer |
| 59 | +#' @export |
| 60 | + |
| 61 | +select_robust_controls <- function( |
| 62 | + data, |
| 63 | + samples, # e.g. "CB_43_EP73_0" |
| 64 | + orig_ident, # e.g. "VH02012942" |
| 65 | + cpm_filter = 1, # CPM threshold for gene filtering |
| 66 | + min_samps = 16, # number of samples a gene must be expressed in |
| 67 | + corr_method = c("spearman","pearson"), |
| 68 | + top_n = 5, |
| 69 | + make_plots = TRUE |
| 70 | +){ |
| 71 | + validate_inputs <- function(data, samples, orig_ident) { |
| 72 | + if (!inherits(data, "Seurat")) { |
| 73 | + stop("argument 'data' must be a Seurat or TidySeurat object.") |
| 74 | + } |
| 75 | + |
| 76 | + # check samples and orig_ident columns |
| 77 | + if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) { |
| 78 | + stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.") |
| 79 | + } |
| 80 | + # check samples in samples column |
| 81 | + if (is.null(samples)){ |
| 82 | + stop("Please provide a value for 'samples'.") |
| 83 | + } else if (!all(samples %in% unique(data$combined_id))) { |
| 84 | + stop("Some values in 'samples' are not found in the 'combined_id' column of 'data'.") |
| 85 | + } |
| 86 | + # check orig.ident in the orig.ident column |
| 87 | + if (is.null(orig_ident)){ |
| 88 | + stop("Please provide a value for 'orig_ident'.") |
| 89 | + } else if (!orig_ident %in% unique(data$orig.ident)) { |
| 90 | + stop("The value of 'orig_ident' is not found in the 'orig.ident' column of 'data'.") |
| 91 | + } |
| 92 | + return(list(data = data, samples = samples, orig_ident = orig_ident)) |
| 93 | + } |
| 94 | + validated <- validate_inputs(data = data, samples = samples, orig_ident = orig_ident) |
| 95 | + data <- validated$data |
| 96 | + group_by <- validated$orig_ident |
| 97 | + samples <- validated$samples |
| 98 | + |
| 99 | + corr_method <- match.arg(corr_method) |
| 100 | + sel_cells <- colnames(data)[data$combined_id == samples & |
| 101 | + data$orig.ident == orig_ident] |
| 102 | + if (length(sel_cells) == 0L) { |
| 103 | + stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.") |
| 104 | + } |
| 105 | + subgroup <- subset(data, cells = sel_cells) |
| 106 | + # Counts and human-friendly column names |
| 107 | + counts_d <- Seurat::GetAssayData(subgroup, assay = "RNA", layer = "counts") |
| 108 | + well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) |
| 109 | + names(well_colnames) <- rownames(subgroup@meta.data) |
| 110 | + colnames(counts_d) <- well_colnames[colnames(counts_d)] |
| 111 | + # edgeR container + gene filtering |
| 112 | + y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) |
| 113 | + keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps |
| 114 | + y <- y[keep, , keep.lib.sizes = FALSE] |
| 115 | + y <- edgeR::calcNormFactors(y, method = "TMMwsp") |
| 116 | + log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) |
| 117 | + # Long data for boxplot |
| 118 | + df_long <- as.data.frame(log_cpm_tmm) |> |
| 119 | + tibble::rownames_to_column(var = "gene") |> |
| 120 | + tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") |
| 121 | + if (make_plots) { |
| 122 | + if (!requireNamespace("ggplot2", quietly = TRUE)) { |
| 123 | + warning("Package 'ggplot2' not available; skipping boxplot.") |
| 124 | + } else { |
| 125 | + print( |
| 126 | + ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + |
| 127 | + ggplot2::geom_boxplot(outlier.size = 0.5) + |
| 128 | + ggplot2::labs(x = "Sample", y = "log2 CPM", |
| 129 | + title = "Boxplot of log2-CPM (TMMwsp)") + |
| 130 | + ggplot2::theme_classic() + |
| 131 | + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) |
| 132 | + ) |
| 133 | + } |
| 134 | + } |
| 135 | + # Correlation matrices |
| 136 | + cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") |
| 137 | + cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") |
| 138 | + if (make_plots) { |
| 139 | + if (!requireNamespace("pheatmap", quietly = TRUE)) { |
| 140 | + warning("Package 'pheatmap' not available; skipping heatmaps.") |
| 141 | + } else { |
| 142 | + pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") |
| 143 | + pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") |
| 144 | + } |
| 145 | + } |
| 146 | + # Ranking by mean Fisher-z correlation to all *other* samples |
| 147 | + R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") |
| 148 | + diag(R) <- NA_real_ |
| 149 | + # Clip to (-1,1), Fisher z-transform, average, back-transform |
| 150 | + Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) |
| 151 | + score_z <- rowMeans(Z, na.rm = TRUE) |
| 152 | + score_r <- tanh(score_z) |
| 153 | + # Top-N names and scores (keep ties at the cutoff) |
| 154 | + ord <- order(score_r, decreasing = TRUE, na.last = NA) |
| 155 | + srt <- score_r[ord] |
| 156 | + if (length(srt) == 0L) { |
| 157 | + stop("No samples available after filtering; adjust 'cpm_filter'/'min_samps'.") |
| 158 | + } |
| 159 | + k <- min(top_n, length(srt)) |
| 160 | + cutoff <- srt[k] |
| 161 | + keepN <- srt >= cutoff |
| 162 | + topN <- srt[keepN] |
| 163 | + # Return everything useful |
| 164 | + list( |
| 165 | + subset_obj = subgroup, |
| 166 | + dge = y, |
| 167 | + log_cpm_tmm = log_cpm_tmm, |
| 168 | + boxplot_df = df_long, |
| 169 | + cor_pearson = cors_pear, |
| 170 | + cor_spearman = cors_spear, |
| 171 | + ranking_method = corr_method, |
| 172 | + scores_mean_to_others = sort(score_r, decreasing = TRUE), |
| 173 | + topN = topN |
| 174 | + ) |
| 175 | +} |
0 commit comments