This repository provides the steps to create mini BAM files for the 11q23 genomic region, perform approximate pattern matching, and identify the number of sequences matching the EBV-like sequence within the 11q23 region.
- System Tools:
samtools(≥1.9) for BAM file manipulation.
- Programming Languages:
- Python 3.x with the following libraries:
pysamnumpypandas
- R (≥4.0) with the following libraries:
stringrkaryoploteRGenomicRangesggplot2dplyrggbreakpatchworkplyrggforce
- Python 3.x with the following libraries:
- BAM File: A whole-genome BAM file for the region of interest (e.g.,
chr11). - Gap File: A
.txtfile with gap intervals in the reference genome (e.g., UCSC gap file for GRCh38).
This step creates smaller BAM files (miniBAMs) by isolating the 11q23 region from the main BAM file.
# Load samtools module on the cluster
module load samtools
# Define paths for input and output BAM files
INPUT_BAM=/path/to/HG007_PacBio_GRCh38.bam
OUTPUT_BAM=/path/to/HG007_11q23_PacBio_GRCh38.bam
# Extract the 11q23 region and save it as a miniBAM file
samtools view -b $INPUT_BAM chr11:110600000-121300000 > $OUTPUT_BAM
# Index the miniBAM file for querying
samtools index $OUTPUT_BAM
# Print confirmation
echo "Extraction and indexing complete. Output saved to: $OUTPUT_BAM"Using the extracted miniBAM, run the approximate pattern matching Python script to identify patterns in the 11q23 region.
- BAM: Path to the miniBAM file.
- CHR: Chromosome (e.g.,
chr11). - PATTERN: The target sequence to match.
- MISMATCH_THRESHOLD: Maximum allowed mismatches (e.g., 6).
- GAPS_FILE: Path to the
.txtfile containing genomic gap intervals. (GRCh38 Gap file provided in the repo) - OUTFILE: Path to save the output.
# Define variables
BAM=/path/to/HG007_11q23_PacBio_GRCh38.bam
CHR='chr11'
PATTERN='GGGTAGCATATGCTACCC'
MISMATCH_THRESHOLD=6
GAPS_FILE=/path/to/grch38.gaps.txt
OUTFILE=/path/to/mismatch.thresh.6.ebv.sequence.txt
# Run the Python script
python3 /path/to/approx_pattern_matching.py $BAM $CHR $PATTERN $MISMATCH_THRESHOLD $OUTFILE $GAPS_FILEStep 3: Identify Palindromic Sequences, Remove Duplicate Reads, and Plot EBV-Like Repeats in the 11q23 Region
This step processes the output data to:
- Identify palindromic sequences within a mismatch threshold of 2.
- Remove duplicate reads based on unique start and end coordinates.
- Bin repeat counts into 10kb intervals across the 11q23 region.
- Visualize the distribution of repeats using a karyotype plot.
#--------------------------------------------------#
# Load Required Libraries #
#--------------------------------------------------#
library(stringr)
library(reticulate)
library(karyoploteR)
library(GenomicRanges)
library(ggplot2)
library(dplyr)
library(ggseqlogo)
library(ggbreak)
library(patchwork)
library(plyr)
library(ggforce)
#--------------------------------------------------#
# Helper Functions #
#--------------------------------------------------#
# Check if a sequence is a palindrome within mismatch threshold
ispalindrome <- function(sequence, mismatch_thresh) {
distance <- hammingdistance(
str_sub(sequence, 1, 7),
rev_comp(str_sub(sequence, 12, 18))
)
return(as.integer(distance <= mismatch_thresh))
}
# Compute Hamming distance between two sequences
hammingdistance <- function(seq1, seq2) {
sequence1 <- str_split(seq1, '')[[1]]
sequence2 <- str_split(seq2, '')[[1]]
sum(sequence1 != sequence2)
}
# Compute the reverse complement of a sequence
rev_comp <- function(seq) {
complements <- c(A = "T", C = "G", G = "C", T = "A")
rev_complement <- paste0(complements[str_split(seq, '')[[1]]], collapse = "")
stringi::stri_reverse(rev_complement)
}
#--------------------------------------------------#
# Load and Process Data #
#--------------------------------------------------#
# Define the input file path {REPLACE YOUR FILE PATH HERE}
file <- "/path/to/mismatch.thresh.6.ebv.sequence.txt"
# Load repeat counts data
df <- read.delim(file, sep = '\t', stringsAsFactors = FALSE)
# Adjust positions
df$readpos <- df$readpos + 1
df$pos <- df$pos + 1
df$start <- as.numeric(df$pos)
df$end <- df$start + 17 # First base is inclusive
# Identify palindromes and filter data
df$ispalindrome.2 <- sapply(
df$matched_sequences,
ispalindrome,
mismatch_thresh = 2
)
df <- df[df$hamming.distance <= 5 & df$ispalindrome.2 == 1, ]
df$start_end <- paste0(df$start, "_", df$end)
df <- df[!duplicated(df$start_end), ]
# generate intervals of 10kb for 11q23
bins = as.numeric(seq(110600000,121300000,1e+04) )
counts=list(); bins1=list(); bins2=list();
index=1;
for(i in (1:(length(bins)-1))){
temp = df[((df$pos>=bins[i]) & (df$pos<bins[i+1])),]
counts[[index]]=nrow(temp)
index=index+1;
bins1[[index]] = bins[i]
bins2[[index]] = bins[i+1]
}
data = as.data.frame(cbind(unlist(bins1), unlist(bins2), unlist(counts)))
colnames(data) = c('start', 'end', 'score')
data$chr = rep('chr11', nrow(data))
print(paste0(" Total Repeat Counts: " , sum(data[(data$start>=114600000 & data$end<=114630000 & data$chr=="chr11"),]$score))) #11q23.3 region with the repeat cluster
#--------------------------------------------------#
# Plot the 11q23 Region #
#--------------------------------------------------#
# Define output PDF file
output_file <- "11q23.repeat.counts.pdf"
# Create the plot
pdf(file = output_file, width = 10, height = 3)
zoom <- toGRanges(data.frame("chr11", 110600000, 121300000))
kp <- plotKaryotype(genome = "hg38", plot.type = 1, chromosomes = c('chr11'), zoom = zoom)
kpDataBackground(kp, color = "white", data.panel = 1)
kpAxis(kp, data.panel = 1, ymin = 0, ymax = 500, numticks = 5, cex = 0.5)
kpBars(
kp,
chr = data$chr,
x0 = as.numeric(data$start),
x1 = as.numeric(data$end),
y1 = as.numeric(data$score),
col = "black",
border = "black",
ymax = 500,
data.panel = 1
)
kpAddBaseNumbers(kp, tick.dist = 10e5, minor.tick.dist = 5e5)
dev.off()
# Print confirmation
cat("Plot saved to", output_file, "\n")