still testing

SuiYue-2308 · SuiYue-2308 · commit b22ab7534465 · 2025-05-15T15:26:07.000+08:00
diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R
@@ -82,33 +82,36 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){
 #' Generate non-unique counts
 #' @noRd
 generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){
-    #fuse multi align RCs by gene
-    x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match))
-    x <- x %>% distinct(eqClassId, .keep_all = TRUE)
-    nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE]
-    if(nrow(x)>1 & length(unique(x$gene_sid))>1){
-        nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1)
-        nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
-        #covert ids into gene ids
-        geneids <- as.numeric(levels(factor(x$gene_sid)))
-        geneids <- x$txid[match(geneids, x$gene_sid)]
-        geneids <- mcols(annotations)$GENEID[as.numeric(geneids)]
-        rownames(nonuniqueCounts) <- geneids
-    } else{
-        warning("The factor variable 'gene_sid' has only one level. Adjusting output.")
-        nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE)
-        nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
-        #covert ids into gene ids
-        geneids <- as.numeric(levels(factor(x$gene_sid)))
-        geneids <- x$txid[match(geneids, x$gene_sid)]
-        geneids <- mcols(annotations)$GENEID[as.numeric(geneids)]
-        rownames(nonuniqueCounts) <- rownames(geneMat)[1:nrow(nonuniqueCounts)]
-
-    }
-    #create matrix for all annotated genes
-    genes <- levels(factor(unique(mcols(annotations)$GENEID)))
-    geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0)
-    rownames(geneMat) <- genes
-    geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts
-    return(geneMat)
+  # Fuse multi-align read classes by gene: keep rows flagged multi_align
+  # that matched a countMatrix row, one per equivalence class.
+  x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match))
+  x <- x %>% distinct(eqClassId, .keep_all = TRUE)
+  nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE]
+  # Create the output matrix covering all annotated genes. Built up front
+  # so the empty case below can return it untouched.
+  genes <- levels(factor(unique(mcols(annotations)$GENEID)))
+  geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0)
+  rownames(geneMat) <- genes
+  # No multi-align read classes: nothing to aggregate, and there would be
+  # no gene ids to name rows with, so return the all-zero matrix.
+  if(nrow(x) == 0) return(geneMat)
+  # Scalar condition, so use the short-circuit && rather than elementwise &.
+  if(nrow(x)>1 && length(unique(x$gene_sid))>1){
+    # One indicator column per gene_sid level; t(indicator) %*% counts sums
+    # the read-class rows belonging to each gene.
+    nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1)
+    nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
+  } else{
+    # Only one distinct gene_sid here: an all-ones column sums every row.
+    warning("The factor variable 'gene_sid' has only one level. Adjusting output.")
+    nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE)
+    nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
+  }
+  # Convert gene_sid levels into gene ids via a representative txid
+  geneids <- as.numeric(levels(factor(x$gene_sid)))
+  geneids <- x$txid[match(geneids, x$gene_sid)]
+  geneids <- mcols(annotations)$GENEID[as.numeric(geneids)]
+  rownames(nonuniqueCounts) <- geneids
+  geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts
+  return(geneMat)
 }
diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R
@@ -16,7 +16,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence,
     readClass.outputDir=NULL, yieldSize=1000000, bpParameters, 
     stranded=FALSE, verbose=FALSE, isoreParameters = setIsoreParameters(NULL),
     processByChromosome = FALSE, processByBam = TRUE, trackReads = trackReads, fusionMode = fusionMode, 
-    demultiplexed = FALSE, cleanReads = FALSE, dedupUMI = FALSE, sampleNames = NULL, barcodesToFilter = NULL, trustReadStartEnd = FALSE) {
+    demultiplexed = FALSE, cleanReads = FALSE, dedupUMI = FALSE, sampleNames = NULL, barcodesToFilter = NULL) {
     genomeSequence <- checkInputSequence(genomeSequence)
     # ===# create BamFileList object from character #===#
     if (is(reads, "BamFile")) {
@@ -125,7 +125,7 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations,
     yieldSize = NULL, stranded = FALSE, min.readCount = 2, 
     fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, 
     verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, demultiplexed = FALSE, 
-    cleanReads = FALSE, dedupUMI = FALSE, index = 0, barcodesToFilter = NULL, trustReadStartEnd = FALSE) {
+    cleanReads = FALSE, dedupUMI = FALSE, index = 0, barcodesToFilter = NULL) {
     if(verbose) message(names(bam.file)[1])
     readGrgList <- prepareDataFromBam(bam.file[[1]], verbose = verbose, yieldSize = yieldSize, use.names = trackReads, demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI)
     if(verbose) message(paste0("Number of alignments/reads: ",length(readGrgList)))
@@ -196,7 +196,7 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations,
                                                          annotations,genomeSequence, stranded = stranded, verbose = verbose)
         se <- isore.constructReadClasses(readGrgList, 
                                               unlisted_junctions, uniqueJunctions, runName = "TODO",
-                                              annotations, stranded, verbose,  trustReadStartEnd = FALSE)
+                                              annotations, stranded, verbose)
 
     }
 
diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R
@@ -10,7 +10,7 @@
 #' @noRd
 isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
                                        uniqueJunctions, runName = "sample1",
-                                       annotations, stranded = FALSE, verbose = FALSE, trustReadStartEnd = FALSE) {
+                                       annotations, stranded = FALSE, verbose = FALSE) {
     #split reads into single exon and multi exon reads
     reads.singleExon <- unlist(readGrgList[elementNROWS(readGrgList) == 1],
                                use.names = FALSE)
@@ -29,7 +29,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
             uniqueJunctions = uniqueJunctions,
             unlisted_junctions = unlisted_junctions,
             readGrgList = readGrgList,
-            stranded = stranded, annotations, trustReadStartEnd = FALSE)}
+            stranded = stranded, annotations)}
     else{exonsByRC.spliced = GRangesList()}
     end.ptm <- proc.time()
     rm(readGrgList, unlisted_junctions, uniqueJunctions)
@@ -57,7 +57,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions,
 #' @importFrom GenomicRanges match
 #' @noRd
 constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions, 
-                                        readGrgList, annotations, stranded = FALSE, trustReadStartEnd = FALSE) {
+                                        readGrgList, annotations, stranded = FALSE) {
     options(scipen = 999)
     allToUniqueJunctionMatch <- GenomicRanges::match(unlisted_junctions,
                                                      uniqueJunctions, ignore.strand = TRUE)
@@ -91,7 +91,7 @@ constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions,
     rm(lowConfidenceReads, uniqueJunctions, allToUniqueJunctionMatch)
     readTable <- createReadTable(start(unlisted_junctions), 
         end(unlisted_junctions), mcols(unlisted_junctions)$id, readGrgList,
-        readStrand, readConfidence, annotations, trustReadStartEnd = FALSE)
+        readStrand, readConfidence, annotations)
     exonsByReadClass <- createExonsByReadClass(readTable)
     readTable <- readTable %>% dplyr::select(chr.rc = chr, strand.rc = strand,
         startSD = startSD, endSD = endSD, firstExonGroup = firstExonGroup,
@@ -159,8 +159,7 @@ correctReadStrandById <- function(strand, id, stranded = FALSE){
 #'     row_number .groups
 #' @noRd
 createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, 
-    unlisted_junctions_id, readGrgList,readStrand, readConfidence, annotations, trustReadStartEnd = FALSE) {
-    firstExons <- selectFirstExonFromRead(readGrgList)
+    unlisted_junctions_id, readGrgList,readStrand, readConfidence, annotations) {
     readRanges <- unlist(range(ranges(readGrgList)), use.names = FALSE)
     intronStartCoordinatesInt <- 
         as.integer(min(splitAsList(unlisted_junctions_start,
@@ -178,22 +177,29 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end,
         start = pmin(start(readRanges), intronStartCoordinatesInt),
         end = pmax(end(readRanges), intronEndCoordinatesInt),
         strand = readStrand, confidenceType = readConfidence,
-        firstExon5prime = ifelse(strand != "-", start(firstExons), end(firstExons)), #assume * is +
-        firstExon3prime = ifelse(strand != "-", end(firstExons), start(firstExons)), 
         alignmentStrand = as.character(getStrandFromGrList(readGrgList))=='+',
         readId = mcols(readGrgList)$id,
         sampleID = mcols(readGrgList)$sampleID)
+    readTable <- readTable %>%
+      mutate(intronStartCoordinatesInt = intronStartCoordinatesInt,
+             intronEndCoordinatesInt = intronEndCoordinatesInt,
+             firstExon5prime = ifelse(strand != "-", start, end), #assume * is +
+             firstExon3prime = ifelse(strand != "-", intronStartCoordinatesInt+1, intronEndCoordinatesInt-1),
+             lastExon5prime = ifelse(strand != "-", intronEndCoordinatesInt-1, intronStartCoordinatesInt+1),
+             lastExon3prime = ifelse(strand != "-", end, start)
+             ) %>%
+      select(-intronStartCoordinatesInt, -intronEndCoordinatesInt)
     rm(readRanges, readStrand, unlisted_junctions_start, 
         unlisted_junctions_end, unlisted_junctions_id, readConfidence, 
         intronStartCoordinatesInt, intronEndCoordinatesInt)
-    readTable <- readsPotentialTss(readTable, annotations, trustReadStartEnd = FALSE)
+    readTable <- splitReadClassByStartEnd(readTable, annotations)
     ## currently 80%/20% quantile of reads is used to identify start/end sites
     readTable <- readTable %>% 
-        group_by(chr, strand, intronEnds, intronStarts, confidenceType, firstExonGroup) %>% 
+        group_by(chr, strand, intronEnds, intronStarts, confidenceType, firstExonGroup, lastExonGroup) %>% 
         summarise(readCount = n(), startSD = sd(start), endSD = sd(end),
                 start = nth(x = start, n = ceiling(readCount / 5), order_by = start),
                 end = nth(x = end, n = ceiling(readCount / 1.25), order_by = end), 
-                firstExonGroup = unique(firstExonGroup),
+                firstExonGroup = unique(firstExonGroup), lastExonGroup =  unique(lastExonGroup),
                 readCount.posStrand = sum(alignmentStrand, na.rm = TRUE), 
                 readIds = list(readId), sampleIDs = list(sampleID),
                 .groups = 'drop') %>% 
@@ -202,42 +208,47 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end,
     return(readTable)
 }
 
-readsPotentialTss <- function(readTable, annotations, trustReadStartEnd = TRUE){
+# Assign firstExonGroup / lastExonGroup to every read row.
+# Annotated exons are appended to readTable as reference rows (identified
+# by an NA readId), reads inherit a GENEID from reference rows sharing their
+# exon boundary, and within each GENEID a read's start / end is ranked
+# against the sorted annotated starts / ends. Rows with strand "*" are
+# dropped, and only read rows are returned.
+splitReadClassByStartEnd <- function(readTable, annotations){
   exons <- unlist(annotations)
-  annoTable <- tibble(Tx = names(exons), 
+  # repeat each transcript's metadata row once per exon so GENEID is
+  # available at exon level
+  mcols(exons) <- cbind(mcols(exons),
+                        mcols(annotations)[rep(seq_along(annotations), elementNROWS(annotations)), ])
+  annoTable <- tibble(TXNAME = names(exons), 
+                      GENEID = mcols(exons)$GENEID, 
                       exonRank = mcols(exons)$exon_rank,
                       chr = as.character(seqnames(exons)), 
                       start = start(exons),
                       end = end(exons),
                       strand = as.character(strand(exons)), 
                       firstExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is +
-                      firstExon3prime = ifelse(strand != "-", end(exons), start(exons)))
+                      firstExon3prime = ifelse(strand != "-", end(exons), start(exons)),
+                      lastExon5prime = ifelse(strand != "-", start(exons), end(exons)), #assume * is +
+                      lastExon3prime = ifelse(strand != "-", end(exons), start(exons)))
   readTable = bind_rows(readTable, annoTable)
-  #add Tx id for mapped reads
+  #add gene id for mapped reads: first via a shared first-exon 3' end,
+  #then via a shared last-exon 5' end
   readTable <- readTable %>% 
     filter(strand != "*") %>%
     group_by(chr, strand, firstExon3prime) %>% 
-    mutate(Tx = ifelse(is.na(Tx), Tx[!is.na(Tx)][1], Tx)) %>% # is it possible that two tx from annotation have same exon
+    mutate(GENEID = ifelse(is.na(GENEID), GENEID[!is.na(GENEID)][1], GENEID)) %>% # is it possible that two tx from annotation have same exon
+    ungroup() %>% 
+    group_by(chr, strand, lastExon5prime) %>% 
+    mutate(GENEID = ifelse(is.na(GENEID), GENEID[!is.na(GENEID)][1], GENEID)) %>% # is it possible that two tx from annotation have same exon
     ungroup()
   #add first exon group for reads
   readTable <- readTable %>% 
-    group_by(Tx)  %>% 
-    arrange(firstExon5prime, .by_group = TRUE) %>%
-    mutate(firstExonGroup = findInterval(start,sort(start[is.na(readId)]), left.open = F)) %>%
-    ungroup()
-  
-  if(trustReadStartEnd == TRUE){
-    readTable <- readTable %>% 
-      group_by(Tx, firstExon3prime, firstExonGroup) %>%
-      #mutate(potentialTss = ifelse(strand != "-", min(start[!is.na(readId)]), max(end[!is.na(readId)]))) %>%
-      ungroup() %>% filter(!is.na(readId))
-  } 
-  else{
-    readTable <- readTable %>% 
-      group_by(Tx, firstExon3prime, firstExonGroup) %>%
-      #mutate(potentialTss = ifelse(strand != "-", start[is.na(readId)], end[is.na(readId)])) %>%
-      ungroup() %>% filter(!is.na(readId))
-  }
+    group_by(GENEID)  %>% 
+    mutate(firstExonGroup = findInterval(start,sort(start[is.na(readId)]))) %>% 
+    # spell out TRUE: T is an ordinary variable and can be reassigned
+    mutate(lastExonGroup = findInterval(end,sort(end[is.na(readId)]),  left.open = TRUE)) %>%
+    ungroup() %>% filter(!is.na(readId))
   return(readTable)
 }
 
diff --git a/R/bambu.R b/R/bambu.R
@@ -141,7 +141,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
     trackReads = FALSE, returnDistTable = FALSE, lowMemory = FALSE,
     fusionMode = FALSE, verbose = FALSE, demultiplexed = FALSE, spatial = NULL, quantData = NULL,
     sampleNames = NULL, cleanReads = FALSE, dedupUMI = FALSE, barcodesToFilter = NULL, clusters = NULL,
-    processByChromosome = FALSE, processByBam = TRUE, trustReadStartEnd = FALSE) {
+    processByChromosome = FALSE, processByBam = TRUE) {
     message(paste0("Running Bambu-v", "3.9.0"))
     if(!is.null(mode)){
         if(mode == "bulk"){
@@ -209,7 +209,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL,
                                                 processByChromosome = processByChromosome, processByBam = processByBam, 
                                                 demultiplexed = demultiplexed,
                                                 sampleNames = sampleNames, cleanReads = cleanReads, 
-                                                dedupUMI = dedupUMI,barcodesToFilter = barcodesToFilter, trustReadStartEnd = FALSE)
+                                                dedupUMI = dedupUMI,barcodesToFilter = barcodesToFilter)
         }
         
         #warnings = handleWarnings(readClassList, verbose)