Skip to content

Commit 5b0313e

Browse files
authored
Merge pull request #141 from PMCC-BioinformaticsCore/zeroinflation_xl
Zeroinflation xl
2 parents c48aad8 + 8e48b4c commit 5b0313e

11 files changed

+1566
-667
lines changed

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ export(plot_qc_metrics_heatmap)
3535
export(plot_rle)
3636
export(plot_volcano)
3737
export(read_metadata)
38+
export(select_robust_controls)
3839
export(subsample_genes)
3940
export(summarise_de)
4041
export(validate_metadata)
@@ -66,6 +67,9 @@ importFrom(dplyr,reframe)
6667
importFrom(dplyr,rename)
6768
importFrom(dplyr,select)
6869
importFrom(dplyr,where)
70+
importFrom(edgeR,DGEList)
71+
importFrom(edgeR,calcNormFactors)
72+
importFrom(edgeR,cpm)
6973
importFrom(ggiraph,geom_point_interactive)
7074
importFrom(ggiraph,girafe)
7175
importFrom(ggplot2,aes)
@@ -124,6 +128,7 @@ importFrom(stringr,str_trim)
124128
importFrom(tibble,column_to_rownames)
125129
importFrom(tibble,rownames_to_column)
126130
importFrom(tidyr,drop_na)
131+
importFrom(tidyr,pivot_longer)
127132
importFrom(tidyr,pivot_wider)
128133
importFrom(tidyr,unnest)
129134
importFrom(utils,capture.output)

R/check_zeroinflation.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,9 @@ check_zeroinflation <- function(data = NULL,
157157
group = unique(df$group),
158158
n_genes = nrow(df),
159159
n_wells = sum(combined_id == unique(df$group)),
160-
median_p0_obs = median(df$p0_obs),
161-
median_p0_nb = median(df$p0_nb),
162-
median_ZI = median(df$ZI),
160+
mean_p0_obs = mean(df$p0_obs),
161+
mean_p0_nb = mean(df$p0_nb),
162+
mean_ZI = mean(df$ZI),
163163
observed_zeros_num = sum(df$obs_zeros_num),
164164
expected_zeros_num = sum(df$expected_zeros_num)
165165
)

R/compute_single_de.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@ compute_single_de <- function(data = NULL,
157157
combined_id <- data$combined_id
158158
dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts,
159159
colData = pheno_data,
160-
design = ~ condition)
160+
# design = ~ condition)
161+
design = if (length(batch) == 1) ~ condition else ~ condition + batch)
161162
dds <- DESeq(dds)
162163
res <- results(dds, contrast = c("condition", treatment_samples, control_samples))
163164
top_table <- as.data.frame(res) %>%

R/select_robust_controls.R

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#' Select high-quality control replicates via TMMwsp log-CPM correlation
2+
#'
3+
#' @description
4+
#' For a given control group (e.g., DMSO) on a specific plate/batch, this function
5+
#' ranks samples by their average correlation (Fisher z–averaged) to all *other*
6+
#' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally)
7+
#' plots per-sample expression distributions and sample–sample correlation heatmaps.
8+
#'
9+
#' @param data A tidyseurat object containing an RNA assay with a **counts** layer.
10+
#' @param samples the control/treatment label to keep in column samples
11+
#' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered.
12+
#' @param orig_ident Character scalar: the plate/batch identifier to keep
13+
#' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered.
14+
#' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to
15+
#' normalization (default `1`).
16+
#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least
17+
#' this many samples to be retained (default `16`).
18+
#' @param corr_method Correlation type used for ranking; one of
19+
#' `c("spearman","pearson")` (default `"spearman"`).
20+
#' @param top_n Integer; the number of top-ranked samples to report in `topN`.
21+
#' Ties at the cutoff are kept (default `5`).
22+
#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman
23+
#' correlation heatmaps (default `TRUE`).
24+
#'
25+
#' @details
26+
#' Workflow:
27+
#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch).
28+
#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`.
29+
#' 3) Normalize with **TMMwsp** and compute log2-CPM.
30+
#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples
31+
#' (according to `corr_method`).
32+
#' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally)
33+
#' plots for QC.
34+
#'
35+
#' Column names of the counts matrix are rewritten to `"<orig.ident>_<Well_ID>"`
36+
#' for easier visual inspection in plots.
37+
#'
38+
#' @return A list with elements:
39+
#' \item{subset_obj}{The Seurat object subset used for analysis.}
40+
#' \item{dge}{The filtered `edgeR::DGEList`.}
41+
#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).}
42+
#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).}
43+
#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.}
44+
#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.}
45+
#' \item{ranking_method}{The correlation method used for ranking.}
46+
#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed
47+
#' correlations (higher = better), sorted decreasing.}
48+
#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).}
49+
#'
50+
#' @examples
51+
#' data(mini_mac)
52+
#' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini")
53+
#'
54+
#'
55+
#'
56+
#' @importFrom edgeR DGEList calcNormFactors cpm
57+
#' @importFrom tibble rownames_to_column
58+
#' @importFrom tidyr pivot_longer
59+
#' @export
60+
61+
select_robust_controls <- function(
62+
data,
63+
samples, # e.g. "CB_43_EP73_0"
64+
orig_ident, # e.g. "VH02012942"
65+
cpm_filter = 1, # CPM threshold for gene filtering
66+
min_samps = 16, # number of samples a gene must be expressed in
67+
corr_method = c("spearman","pearson"),
68+
top_n = 5,
69+
make_plots = TRUE
70+
){
71+
validate_inputs <- function(data, samples, orig_ident) {
72+
if (!inherits(data, "Seurat")) {
73+
stop("argument 'data' must be a Seurat or TidySeurat object.")
74+
}
75+
76+
# check samples and orig_ident columns
77+
if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) {
78+
stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.")
79+
}
80+
# check samples in samples column
81+
if (is.null(samples)){
82+
stop("Please provide a value for 'samples'.")
83+
} else if (!all(samples %in% unique(data$combined_id))) {
84+
stop("Some values in 'samples' are not found in the 'combined_id' column of 'data'.")
85+
}
86+
# check orig.ident in the orig.ident column
87+
if (is.null(orig_ident)){
88+
stop("Please provide a value for 'orig_ident'.")
89+
} else if (!orig_ident %in% unique(data$orig.ident)) {
90+
stop("The value of 'orig_ident' is not found in the 'orig.ident' column of 'data'.")
91+
}
92+
return(list(data = data, samples = samples, orig_ident = orig_ident))
93+
}
94+
validated <- validate_inputs(data = data, samples = samples, orig_ident = orig_ident)
95+
data <- validated$data
96+
group_by <- validated$orig_ident
97+
samples <- validated$samples
98+
99+
corr_method <- match.arg(corr_method)
100+
sel_cells <- colnames(data)[data$combined_id == samples &
101+
data$orig.ident == orig_ident]
102+
if (length(sel_cells) == 0L) {
103+
stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.")
104+
}
105+
subgroup <- subset(data, cells = sel_cells)
106+
# Counts and human-friendly column names
107+
counts_d <- Seurat::GetAssayData(subgroup, assay = "RNA", layer = "counts")
108+
well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
109+
names(well_colnames) <- rownames(subgroup@meta.data)
110+
colnames(counts_d) <- well_colnames[colnames(counts_d)]
111+
# edgeR container + gene filtering
112+
y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
113+
keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
114+
y <- y[keep, , keep.lib.sizes = FALSE]
115+
y <- edgeR::calcNormFactors(y, method = "TMMwsp")
116+
log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
117+
# Long data for boxplot
118+
df_long <- as.data.frame(log_cpm_tmm) |>
119+
tibble::rownames_to_column(var = "gene") |>
120+
tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
121+
if (make_plots) {
122+
if (!requireNamespace("ggplot2", quietly = TRUE)) {
123+
warning("Package 'ggplot2' not available; skipping boxplot.")
124+
} else {
125+
print(
126+
ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
127+
ggplot2::geom_boxplot(outlier.size = 0.5) +
128+
ggplot2::labs(x = "Sample", y = "log2 CPM",
129+
title = "Boxplot of log2-CPM (TMMwsp)") +
130+
ggplot2::theme_classic() +
131+
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
132+
)
133+
}
134+
}
135+
# Correlation matrices
136+
cors_pear <- stats::cor(log_cpm_tmm, method = "pearson")
137+
cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
138+
if (make_plots) {
139+
if (!requireNamespace("pheatmap", quietly = TRUE)) {
140+
warning("Package 'pheatmap' not available; skipping heatmaps.")
141+
} else {
142+
pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)")
143+
pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
144+
}
145+
}
146+
# Ranking by mean Fisher-z correlation to all *other* samples
147+
R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
148+
diag(R) <- NA_real_
149+
# Clip to (-1,1), Fisher z-transform, average, back-transform
150+
Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
151+
score_z <- rowMeans(Z, na.rm = TRUE)
152+
score_r <- tanh(score_z)
153+
# Top-N names and scores (keep ties at the cutoff)
154+
ord <- order(score_r, decreasing = TRUE, na.last = NA)
155+
srt <- score_r[ord]
156+
if (length(srt) == 0L) {
157+
stop("No samples available after filtering; adjust 'cpm_filter'/'min_samps'.")
158+
}
159+
k <- min(top_n, length(srt))
160+
cutoff <- srt[k]
161+
keepN <- srt >= cutoff
162+
topN <- srt[keepN]
163+
# Return everything useful
164+
list(
165+
subset_obj = subgroup,
166+
dge = y,
167+
log_cpm_tmm = log_cpm_tmm,
168+
boxplot_df = df_long,
169+
cor_pearson = cors_pear,
170+
cor_spearman = cors_spear,
171+
ranking_method = corr_method,
172+
scores_mean_to_others = sort(score_r, decreasing = TRUE),
173+
topN = topN
174+
)
175+
}

_pkgdown.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ navbar:
3131
href: articles/integration_across_modalities.html
3232
- text: "Working with Bioconductor classes"
3333
href: articles/macpie_bioc.html
34-
- text: "Chck zero-inflation"
35-
href: articles/check_zero_inflation.html
34+
- text: "Assessing zero-inflation"
35+
href: articles/assessing_zero_inflation.html
3636
- text: "Reference"
3737
href: reference/index.html # Links to your function reference docs.
3838
right:

man/select_robust_controls.Rd

Lines changed: 82 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
test_that("multiplication works", {
2+
# Load test dataset from tests/testthat/testdata
3+
testdata <- get_testdata()
4+
expect_error(select_robust_controls(testdata, samples=NULL, orig_ident="PLATE"))
5+
res <- select_robust_controls(testdata, samples="DMSO_0", orig_ident="testdata", make_plots = FALSE)
6+
expect_true(is.list(res))
7+
})

0 commit comments

Comments
 (0)