From da05fa86421bc2b5155ae5e5a7f60354932d2a5e Mon Sep 17 00:00:00 2001 From: SuiYue-2308 Date: Fri, 9 May 2025 16:20:20 +0800 Subject: [PATCH 1/4] split rc by first and end exon group y --- R/bambu-extendAnnotations-utilityCombine.R | 10 +-- ...processReads_utilityConstructReadClasses.R | 65 +++++++++++++++++-- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/R/bambu-extendAnnotations-utilityCombine.R b/R/bambu-extendAnnotations-utilityCombine.R index 4a644451..238fe4cc 100644 --- a/R/bambu-extendAnnotations-utilityCombine.R +++ b/R/bambu-extendAnnotations-utilityCombine.R @@ -116,7 +116,7 @@ updateStartEndReadCount <- function(combinedFeatureTibble){ filter(row_number()==1) combinedFeatureTibble <- combinedFeatureTibble %>% - dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore, + dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore, firstExonGroup, lastExonGroup, maxTxScore.noFit, NSampleReadCount, NSampleReadProp, NSampleTxScore, rowID) %>% full_join(select(startTibble, rowID, start), by = "rowID") %>% @@ -134,7 +134,7 @@ combineFeatureTibble <- function(combinedFeatureTibble, featureTibbleSummarised, index=1, intraGroup = TRUE){ if (is.null(combinedFeatureTibble)) { combinedTable <- featureTibbleSummarised %>% - select(intronStarts, intronEnds, chr, strand, maxTxScore, + select(intronStarts, intronEnds, chr, strand, maxTxScore, firstExonGroup, lastExonGroup, maxTxScore.noFit, NSampleReadCount, NSampleReadProp,NSampleTxScore, starts_with('start'), starts_with('end'), starts_with('readCount')) } else { @@ -154,7 +154,7 @@ combineFeatureTibble <- function(combinedFeatureTibble, select(intronStarts, intronEnds, chr, strand, NSampleReadCount, NSampleReadProp, NSampleTxScore, maxTxScore, maxTxScore.noFit, starts_with('start'), starts_with('end'), - starts_with('readCount')) + starts_with('readCount'), firstExonGroup, lastExonGroup) } if(intraGroup) combinedTable <- @@ -186,12 +186,12 @@ extractFeaturesFromReadClassSE <- function(readClassSe, sample_id, rowData <- as_tibble(rowData(readClassSe)) %>% mutate(start = unname(min(start(rowRangesSe))), end= unname(max(end(rowRangesSe)))) - group_var <- c("intronStarts", "intronEnds", "chr", "strand") + group_var <- c("intronStarts", "intronEnds", "chr", "strand", "firstExonGroup", "lastExonGroup") sum_var <- c("start","end","NSampleReadCount", "maxTxScore", "maxTxScore.noFit", "readCount","NSampleReadProp", "NSampleTxScore") featureTibble <- rowData %>% - dplyr::select(chr = chr.rc, start, end, strand = strand.rc, + dplyr::select(chr = chr.rc, start, end, strand = strand.rc, firstExonGroup, lastExonGroup, intronStarts, intronEnds, confidenceType, readCount, geneReadProp, txScore, txScore.noFit, numExons) %>% filter(readCount >= 1, # only use readCount>1 and highconfidence reads diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index fec9adbf..2e2f9b99 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -29,7 +29,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, uniqueJunctions = uniqueJunctions, unlisted_junctions = unlisted_junctions, readGrgList = readGrgList, - stranded = stranded)} + stranded = stranded, annotations)} else{exonsByRC.spliced = GRangesList()} end.ptm <- proc.time() rm(readGrgList, unlisted_junctions, uniqueJunctions) @@ -57,7 +57,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, #' @importFrom GenomicRanges match #' @noRd constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions, - readGrgList, stranded = FALSE) { + readGrgList, annotations, stranded = FALSE) { options(scipen = 999) allToUniqueJunctionMatch <- GenomicRanges::match(unlisted_junctions, uniqueJunctions, ignore.strand = TRUE) @@ -91,10 +91,11 @@ constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions, rm(lowConfidenceReads, uniqueJunctions, allToUniqueJunctionMatch) readTable <- createReadTable(start(unlisted_junctions), end(unlisted_junctions), mcols(unlisted_junctions)$id, readGrgList, - readStrand, readConfidence) + readStrand, readConfidence, annotations) exonsByReadClass <- createExonsByReadClass(readTable) readTable <- readTable %>% dplyr::select(chr.rc = chr, strand.rc = strand, startSD = startSD, endSD = endSD, + firstExonGroup = firstExonGroup, lastExonGroup = lastExonGroup, readCount.posStrand = readCount.posStrand, intronStarts, intronEnds, confidenceType, readCount, readIds, sampleIDs) mcols(exonsByReadClass) <- readTable @@ -159,7 +160,7 @@ correctReadStrandById <- function(strand, id, stranded = FALSE){ #' row_number .groups #' @noRd createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, - unlisted_junctions_id, readGrgList,readStrand, readConfidence) { + unlisted_junctions_id, readGrgList,readStrand, readConfidence, annotations) { readRanges <- unlist(range(ranges(readGrgList)), use.names = FALSE) intronStartCoordinatesInt <- as.integer(min(splitAsList(unlisted_junctions_start, @@ -180,15 +181,26 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, alignmentStrand = as.character(getStrandFromGrList(readGrgList))=='+', readId = mcols(readGrgList)$id, sampleID = mcols(readGrgList)$sampleID) + readTable <- readTable %>% + mutate(intronStartCoordinatesInt = intronStartCoordinatesInt, + intronEndCoordinatesInt = intronEndCoordinatesInt, + firstExon5prime = ifelse(strand != "-", start, end), #assume * is + + firstExon3prime = ifelse(strand != "-", intronStartCoordinatesInt+1, intronEndCoordinatesInt-1), + lastExon5prime = ifelse(strand != "-", intronEndCoordinatesInt-1, intronStartCoordinatesInt+1), + lastExon3prime = ifelse(strand != "-", end, start) + ) %>% + select(-intronStartCoordinatesInt, -intronEndCoordinatesInt) rm(readRanges, readStrand, unlisted_junctions_start, unlisted_junctions_end, unlisted_junctions_id, readConfidence, intronStartCoordinatesInt, intronEndCoordinatesInt) + readTable <- splitReadClassByStartEnd(readTable, annotations) ## currently 80%/20% quantile of reads is used to identify start/end sites readTable <- readTable %>% - group_by(chr, strand, intronEnds, intronStarts, confidenceType) %>% + group_by(chr, strand, intronEnds, intronStarts, confidenceType, firstExonGroup, lastExonGroup) %>% summarise(readCount = n(), startSD = sd(start), endSD = sd(end), start = nth(x = start, n = ceiling(readCount / 5), order_by = start), end = nth(x = end, n = ceiling(readCount / 1.25), order_by = end), + firstExonGroup = unique(firstExonGroup), lastExonGroup = unique(lastExonGroup), readCount.posStrand = sum(alignmentStrand, na.rm = TRUE), readIds = list(readId), sampleIDs = list(sampleID), .groups = 'drop') %>% @@ -197,6 +209,49 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, return(readTable) } +splitReadClassByStartEnd <- function(readTable, annotations){ + exons <- unlist(annotations) + mcols(exons) <- cbind(mcols(exons), + mcols(annotations)[rep(seq_along(annotations), elementNROWS(annotations)), ]) + annoTable <- tibble(TXNAME = names(exons), + GENEID = mcols(exons)$GENEID, + exonRank = mcols(exons)$exon_rank, + chr = as.character(seqnames(exons)), + start = start(exons), + end = end(exons), + strand = as.character(strand(exons)), + firstExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is + + firstExon3prime = ifelse(strand != "-", end(exons), start(exons)), + lastExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is + + lastExon3prime = ifelse(strand != "-", end(exons), start(exons))) + annoTable <- annoTable %>% + group_by(firstExon3prime) %>% + mutate(exonGroupId = cur_group_id()) %>% + ungroup() + readTable = bind_rows(readTable, annoTable) + #add gene id id for mapped reads + readTable <- readTable %>% + #filter(strand != "*") %>% + group_by(chr, strand, firstExon3prime) %>% + mutate(exonGroupId = ifelse(is.na(exonGroupId), exonGroupId[!is.na(exonGroupId)][1], exonGroupId)) %>% # is it possible that two tx from annotation have same exon + ungroup() %>% + group_by(chr, strand, lastExon5prime) %>% + mutate(exonGroupId = ifelse(is.na(exonGroupId), exonGroupId[!is.na(exonGroupId)][1], exonGroupId)) %>% # is it possible that two tx from annotation have same exon + ungroup() + #add first exon group for reads + readTable <- readTable %>% + group_by(exonGroupId, firstExon3prime) %>% + mutate(firstExonGroup = ifelse(strand != "-", + findInterval(start,sort(start[is.na(readId)])), + findInterval(end,sort(end[is.na(readId)]), left.open = T))) %>% ungroup() %>% + group_by(exonGroupId, lastExon5prime) %>% + mutate(lastExonGroup = ifelse(strand != "-", + findInterval(end,sort(end[is.na(readId)]), left.open = T), + findInterval(start,sort(start[is.na(readId)])))) %>% ungroup() %>% + filter(!is.na(readId)) + return(readTable) +} + #' @noRd getChrFromGrList <- function(grl) { return(unlist(seqnames(grl), use.names = FALSE)[cumsum(elementNROWS(grl))]) From 67938655daca5d8fb43444465ad60199021f2538 Mon Sep 17 00:00:00 2001 From: SuiYue-2308 Date: Fri, 13 Jun 2025 13:25:15 +0800 Subject: [PATCH 2/4] unique read class by firstExonGroup and lastExonGroup to avoid full_join error --- R/bambu-extendAnnotations-utilityCombine.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bambu-extendAnnotations-utilityCombine.R b/R/bambu-extendAnnotations-utilityCombine.R index 238fe4cc..0e080615 100644 --- a/R/bambu-extendAnnotations-utilityCombine.R +++ b/R/bambu-extendAnnotations-utilityCombine.R @@ -140,7 +140,7 @@ combineFeatureTibble <- function(combinedFeatureTibble, } else { combinedTable <- full_join(combinedFeatureTibble, featureTibbleSummarised, by = c('intronStarts', 'intronEnds', 'chr', - 'strand'), suffix=c('.combined','.new')) %>% + 'strand', 'firstExonGroup', 'lastExonGroup'), suffix=c('.combined','.new')) %>% mutate(NSampleReadCount=pmax0NA(NSampleReadCount.combined) + pmax0NA(NSampleReadCount.new), NSampleReadProp = pmax0NA(NSampleReadProp.combined) + From 893bc68d94d473bf3369f84f73d4c77548ff418f Mon Sep 17 00:00:00 2001 From: SuiYue-2308 Date: Fri, 13 Jun 2025 14:27:44 +0800 Subject: [PATCH 3/4] remove exonGroupId and introduce startEndWindowSize to separate alternative TSS/TES from annotated sites --- ...processReads_utilityConstructReadClasses.R | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index 2e2f9b99..3da71d08 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -209,7 +209,7 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, return(readTable) } -splitReadClassByStartEnd <- function(readTable, annotations){ +splitReadClassByStartEnd <- function(readTable, annotations, startEndWindowSize = 35 ){ exons <- unlist(annotations) mcols(exons) <- cbind(mcols(exons), mcols(annotations)[rep(seq_along(annotations), elementNROWS(annotations)), ]) @@ -224,30 +224,16 @@ splitReadClassByStartEnd <- function(readTable, annotations){ firstExon3prime = ifelse(strand != "-", end(exons), start(exons)), lastExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is + lastExon3prime = ifelse(strand != "-", end(exons), start(exons))) - annoTable <- annoTable %>% - group_by(firstExon3prime) %>% - mutate(exonGroupId = cur_group_id()) %>% - ungroup() readTable = bind_rows(readTable, annoTable) - #add gene id id for mapped reads readTable <- readTable %>% - #filter(strand != "*") %>% - group_by(chr, strand, firstExon3prime) %>% - mutate(exonGroupId = ifelse(is.na(exonGroupId), exonGroupId[!is.na(exonGroupId)][1], exonGroupId)) %>% # is it possible that two tx from annotation have same exon - ungroup() %>% - group_by(chr, strand, lastExon5prime) %>% - mutate(exonGroupId = ifelse(is.na(exonGroupId), exonGroupId[!is.na(exonGroupId)][1], exonGroupId)) %>% # is it possible that two tx from annotation have same exon - ungroup() - #add first exon group for reads - readTable <- readTable %>% - group_by(exonGroupId, firstExon3prime) %>% + group_by(firstExon3prime) %>% mutate(firstExonGroup = ifelse(strand != "-", - findInterval(start,sort(start[is.na(readId)])), - findInterval(end,sort(end[is.na(readId)]), left.open = T))) %>% ungroup() %>% - group_by(exonGroupId, lastExon5prime) %>% + findInterval(start,sort(start[is.na(readId)]) - startEndWindowSize), + findInterval(-end,sort(-end[is.na(readId)]) - startEndWindowSize))) %>% ungroup() %>% + group_by(lastExon5prime) %>% mutate(lastExonGroup = ifelse(strand != "-", - findInterval(end,sort(end[is.na(readId)]), left.open = T), - findInterval(start,sort(start[is.na(readId)])))) %>% ungroup() %>% + findInterval(-end,sort(-end[is.na(readId)]) - startEndWindowSize), + findInterval(start,sort(start[is.na(readId)]) - startEndWindowSize))) %>% ungroup() %>% filter(!is.na(readId)) return(readTable) } From f47d54a64cd8e1b2a8e01cccb317b4d414db1ee2 Mon Sep 17 00:00:00 2001 From: SuiYue-2308 Date: Mon, 16 Jun 2025 09:28:00 +0800 Subject: [PATCH 4/4] remove TXNAME, GENEID, equal and compatible of read class with extended TSS/TES --- R/bambu-extendAnnotations-utilityExtend.R | 28 +++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/R/bambu-extendAnnotations-utilityExtend.R b/R/bambu-extendAnnotations-utilityExtend.R index d65d70ea..86b5db49 100644 --- a/R/bambu-extendAnnotations-utilityExtend.R +++ b/R/bambu-extendAnnotations-utilityExtend.R @@ -10,7 +10,7 @@ isore.extendAnnotations <- function(combinedTranscripts, annotationGrangesList, combinedTranscripts <- filterTranscripts(combinedTranscripts, min.sampleNumber) if (nrow(combinedTranscripts) > 0) { group_var <- c("intronStarts","intronEnds","chr","strand","start","end", - "confidenceType","readCount", "maxTxScore", "maxTxScore.noFit") + "confidenceType","readCount", "maxTxScore", "maxTxScore.noFit", "firstExonGroup", "lastExonGroup") rowDataTibble <- select(combinedTranscripts,all_of(group_var)) annotationSeqLevels <- seqlevels(annotationGrangesList) rowDataSplicedTibble <- filter(rowDataTibble, @@ -343,6 +343,16 @@ addNewSplicedReadClasses <- function(combinedTranscriptRanges, # annotate with compatible gene id, rowDataFilteredSpliced$GENEID[equalQhits[!duplicated(equalQhits)]] <- mcols(annotationGrangesList[equalSubHits[!duplicated(equalQhits)]])$GENEID + + # remove TXNAME, GENEID, equal and compatible for + idx_startEnd <- which(rowDataFilteredSpliced$firstExonGroup == 0 | + rowDataFilteredSpliced$lastExonGroup == 0) + if (length(idx_startEnd) > 0) { + classificationTable$compatible[idx_startEnd] <- "" + classificationTable$equal[idx_startEnd] <- "" + rowDataFilteredSpliced$GENEID[idx_startEnd] <- NA + rowDataFilteredSpliced$TXNAME[idx_startEnd] <- NA + } # annotate as identical, using intron matches unlistedIntrons <- unlist(intronsByReadClass, use.names = TRUE) partitioning <- PartitioningByEnd(cumsum(elementNROWS(intronsByReadClass)), @@ -355,7 +365,8 @@ addNewSplicedReadClasses <- function(combinedTranscriptRanges, updateWIntronMatches(unlistedIntrons, unlistedIntronsAnnotations, partitioning, classificationTable, annotationGrangesList, rowDataFilteredSpliced, exonsByReadClass, min.exonDistance, - min.primarySecondaryDist, min.primarySecondaryDistStartEnd) + min.primarySecondaryDist, min.primarySecondaryDistStartEnd) + classificationTable <- updateWStartEnd(rowDataFilteredSpliced, classificationTable) rowDataFilteredSpliced$readClassType <- apply(classificationTable, 1, function(x){paste(x[x!=""], collapse = ":")}) rowDataFilteredSpliced$novelTranscript = TRUE @@ -420,6 +431,19 @@ updateWIntronMatches <- function(unlistedIntrons, unlistedIntronsAnnotations, } +#' update classificationTable by start and end +#' @importFrom GenomicRanges match +#' @noRd +updateWStartEnd <- function(rowDataSplicedTibble, classificationTable) { + idx <- which(rowDataSplicedTibble$firstExonGroup == 0 | + rowDataSplicedTibble$lastExonGroup == 0) + if (length(idx) > 0) { + classificationTable$compatible[idx] <- "" + classificationTable$equal[idx] <- "" + } + return(classificationTable) +} + #' assign gene id by maximum match #' @importFrom dplyr as_tibble %>% group_by summarise filter ungroup