Skip to content

Commit 61058c8

Browse files
authored
Merge pull request #1 from YuheCheng62/cram_fix
Cram fix
2 parents fdb24af + 8dadb55 commit 61058c8

File tree

6 files changed

+46
-40
lines changed

6 files changed

+46
-40
lines changed

src/DupCaller.egg-info/SOURCES.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ src/subcommands/funcs/depth.py
2020
src/subcommands/funcs/indels.py
2121
src/subcommands/funcs/learn.py
2222
src/subcommands/funcs/misc.py
23-
src/subcommands/funcs/prob.py
23+
src/subcommands/funcs/prob.py
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

src/subcommands/Caller.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,12 @@ def do_call(args):
3737
"threads": args.threads,
3838
# "amperr": args.amperrs,
3939
# "amperri": args.amperri,
40-
"amperr_file": args.output + "/" + args.output + ".amp.tn.txt",
41-
"amperri_file": args.output + "/" + args.output + ".amp.id.txt",
40+
"amperr_file": args.output + ".amp.tn.txt",
41+
"amperri_file": args.output + ".amp.id.txt",
4242
# "dmgerr": args.dmgerrs,
4343
# "dmgerri": args.dmgerri,
44-
"dmgerr_file": args.output + "/" + args.output + ".dmg.tn.txt",
45-
"dmgerri_file": args.output + "/" + args.output + ".dmg.id.txt",
44+
"dmgerr_file": args.output + ".dmg.tn.txt",
45+
"dmgerri_file": args.output + ".dmg.id.txt",
4646
"mutRate": args.mutRate,
4747
"pcutoff": args.threshold,
4848
"mapq": args.mapq,
@@ -133,7 +133,7 @@ def do_call(args):
133133
except OSError as e:
134134
if e.errno != errno.EEXIST:
135135
raise
136-
bamObject = BAM(args.bam, "rb")
136+
bamObject = BAM(args.bam, "rb", args.reference)
137137

138138
"""
139139
Execulte variant calling
@@ -184,7 +184,7 @@ def do_call(args):
184184
# print(args.threads)
185185
# if args.normalBam:
186186
cutSites, chunkSize, contigs = splitBamRegions(
187-
[args.bam], args.threads, contigs, args.windowSize
187+
[args.bam], args.threads, contigs, args.windowSize, args.reference
188188
)
189189
# else:
190190
# cutSites, chunkSize, contigs = splitBamRegions(
@@ -284,17 +284,17 @@ def do_call(args):
284284
mismatch_dmg_profile, columns=["A", "T", "C", "G"], index=num2trinuc
285285
)
286286
# np.savetxt(params["output"] + "/" + args.output + ".amp.tn.txt",np.hstack([trinuc_cols[0:32],mismatch_profile]),delimiter="\t",header=" \tA\tT\tC\tG\n")
287-
amp_tn_pd.to_csv(params["output"] + "/" + args.output + ".amp.tn.txt", sep="\t")
287+
amp_tn_pd.to_csv(args.output + ".amp.tn.txt", sep="\t")
288288
np.savetxt(
289-
params["output"] + "/" + args.output + ".amp.id.txt",
289+
args.output + ".amp.id.txt",
290290
indelerr_profile,
291291
delimiter="\t",
292292
fmt="%d",
293293
)
294-
dmg_tn_pd.to_csv(params["output"] + "/" + args.output + ".dmg.tn.txt", sep="\t")
294+
dmg_tn_pd.to_csv(args.output + ".dmg.tn.txt", sep="\t")
295295
# np.savetxt(params["output"] + "/" + args.output + ".dmg.tn.txt",np.hstack([trinuc_cols,mismatch_dmg_profile]),delimiter="\t",header=" \tA\tT\tC\tG\n")
296296
np.savetxt(
297-
params["output"] + "/" + args.output + ".dmg.id.txt",
297+
args.output + ".dmg.id.txt",
298298
indelerr_dmg_profile,
299299
delimiter="\t",
300300
fmt="%d",
@@ -368,11 +368,11 @@ def do_call(args):
368368
print("....Splitting genomic regions for parallel execution.....")
369369
if args.normalBams:
370370
cutSites, chunkSize, contigs = splitBamRegions(
371-
[args.bam], args.threads, contigs, args.windowSize
371+
[args.bam], args.threads, contigs, args.windowSize, args.reference
372372
)
373373
else:
374374
cutSites, chunkSize, contigs = splitBamRegions(
375-
[args.bam], args.threads, contigs, args.windowSize
375+
[args.bam], args.threads, contigs, args.windowSize, args.reference
376376
)
377377
currentContigIndex = 0
378378
usedTime = (time.time() - startTime) / 60
@@ -491,7 +491,7 @@ def do_call(args):
491491
FPAll = sum(FPs, [])
492492
RPAll = sum(RPs, [])
493493

494-
tBam = BAM(args.bam, "rb")
494+
tBam = BAM(args.bam, "rb", args.reference)
495495
contigs = tBam.references
496496
# print(contigs)
497497
chromDict = {contig: tBam.get_reference_length(contig) for contig in contigs}
@@ -518,11 +518,11 @@ def do_call(args):
518518
}
519519
filterDict = {"PASS": "All filter Passed"}
520520
vcfLines = createVcfStrings(chromDict, infoDict, formatDict, filterDict, mutsAll)
521-
with open(args.output + "/" + args.output + "_snv.vcf", "w") as vcf:
521+
with open(args.output + "_snv.vcf", "w") as vcf:
522522
vcf.write(vcfLines)
523523

524524
vcfLines = createVcfStrings(chromDict, infoDict, formatDict, filterDict, indelsAll)
525-
with open(args.output + "/" + args.output + "_indel.vcf", "w") as vcf:
525+
with open(args.output + "_indel.vcf", "w") as vcf:
526526
vcf.write(vcfLines)
527527

528528
burden_naive = muts_num / (coverage)
@@ -531,7 +531,7 @@ def do_call(args):
531531
pass_duprate = unique_read_num / pass_read_num
532532

533533
with open(
534-
params["output"] + "/" + args.output + "_duplex_group_stats.txt", "w"
534+
args.output + "_duplex_group_stats.txt", "w"
535535
) as f:
536536
f.write(
537537
"duplex_group_strand_composition\tduplex_group_number\t\
@@ -570,13 +570,13 @@ def do_call(args):
570570
trinuc_by_duplex_group = pd.DataFrame(duplex_read_num_trinuc)
571571
trinuc_by_duplex_group.insert(0, "", trinuc_list)
572572
trinuc_by_duplex_group.to_csv(
573-
params["output"] + "/" + args.output + "_trinuc_by_duplex_group.txt",
573+
args.output + "_trinuc_by_duplex_group.txt",
574574
sep="\t",
575575
index=False,
576576
)
577577

578578
muts_by_group = np.loadtxt(
579-
params["output"] + "/" + args.output + "_duplex_group_stats.txt",
579+
args.output + "_duplex_group_stats.txt",
580580
skiprows=1,
581581
dtype=float,
582582
delimiter="\t",
@@ -610,7 +610,7 @@ def do_call(args):
610610
lgd2 = mpatches.Patch(color="blue", label="Naive")
611611
plt.legend(handles=[lgd1, lgd2])
612612
plt.savefig(
613-
params["output"] + "/" + args.output + "_burden_by_duplex_group_size.png"
613+
args.output + "_burden_by_duplex_group_size.png"
614614
)
615615
"""
616616
if len(FPAll + RPAll) != 0:
@@ -633,7 +633,7 @@ def do_call(args):
633633
clonal_num = 0
634634
"""
635635

636-
with open(params["output"] + "/" + args.output + "_stats.txt", "w") as f:
636+
with open(args.output + "_stats.txt", "w") as f:
637637
f.write(f"Number of Read Families\t{unique_read_num}\n")
638638
f.write(f"Number of Pass-filter Reads\t{pass_read_num}\n")
639639
f.write(f"Number of Effective Read Families\t{duplex_num}\n")

src/subcommands/Learn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def do_learn(args):
6868
except OSError as e:
6969
if e.errno != errno.EEXIST:
7070
raise
71-
bamObject = BAM(args.bam, "rb")
71+
bamObject = BAM(args.bam, args.reference, "rb")
7272

7373
"""
7474
Execulte variant calling

src/subcommands/funcs/call.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pysam import TabixFile as BED
1212
from pysam import VariantFile as VCF
1313
import pysam
14+
import os
1415
import re
1516
import h5py
1617

@@ -207,8 +208,8 @@ def nums2str(nums, num2base="ATCG"):
207208
return "".join(bases)
208209

209210

210-
def bamIterateMultipleRegion(bam, regions):
211-
bamObject = BAM(bam, "rb")
211+
def bamIterateMultipleRegion(bam, regions, ref):
212+
bamObject = BAM(bam, "rb", ref)
212213
for region in regions:
213214
for rec in bamObject.fetch(*region):
214215
if len(region) >= 2:
@@ -223,7 +224,7 @@ def callBam(params, processNo):
223224
nbams = params["normalBams"]
224225
regions = params["regions"]
225226
if params["germline"]:
226-
germline = VCF(params["germline"], "rb")
227+
germline = VCF(params["germline"], mode = "rb")
227228
else:
228229
germline = None
229230
all_chroms = [_[0] for _ in regions]
@@ -235,7 +236,9 @@ def callBam(params, processNo):
235236
pcut = params["pcutoff"]
236237
isLearn = params.get("isLearn", False)
237238
nn = processNo
238-
output = "tmp/" + params["output"] + "_" + str(nn)
239+
output = os.path.join("tmp", params["output"].lstrip('/') + "_" + str(nn))
240+
if not os.path.exists(os.path.dirname(output)):
241+
os.makedirs(os.path.dirname(output))
239242
if params["noise"]:
240243
noise = BED(params["noise"])
241244
else:
@@ -252,6 +255,7 @@ def callBam(params, processNo):
252255
feature_beds = [BED(_) for _ in params["feature_files"]]
253256
else:
254257
feature_beds = None
258+
255259
base2num = {"A": 0, "T": 1, "C": 2, "G": 3}
256260
num2base = "ATCG"
257261
muts = []
@@ -302,7 +306,7 @@ def callBam(params, processNo):
302306
params["trinuc_convert"] = trinuc_convert_np
303307
params["trinuc2num_dict"] = trinuc2num
304308
params["num2trinuc_list"] = num2trinuc
305-
309+
print(params["amperr_file"])
306310
### Load amp error matrix
307311
if not params["amperr_file"]:
308312
prob_amp = params["amperr"]
@@ -412,16 +416,16 @@ def callBam(params, processNo):
412416

413417
params["dmgmat_indel_mean"] = np.mean(dmgmat_indel, axis=1)
414418
params["dmgmat_indel_rev_mean"] = np.mean(dmgmat_indel_rev, axis=1)
415-
416419
# Initialize
420+
417421
total_coverage = 0
418422
total_coverage_indel = 0
419423
starttime = time.time()
420-
tumorBam = BAM(bam, "rb")
424+
tumorBam = BAM(bam, "rb", params.get("reference"))
421425
if nbams:
422426
normalBams = list()
423427
for nbam in nbams:
424-
normalBams.append(BAM(nbam, "rb"))
428+
normalBams.append(BAM(nbam, "rb", params.get("reference")))
425429
else:
426430
normalBams = None
427431
currentStart = -1
@@ -452,7 +456,7 @@ def callBam(params, processNo):
452456
)
453457
read_blacklist = set()
454458
rec_num = 0
455-
for rec, region in bamIterateMultipleRegion(bam, regions):
459+
for rec, region in bamIterateMultipleRegion(bam, regions, params.get("reference")):
456460
rec_num += 1
457461
if rec.query_name in read_blacklist or rec.is_unmapped:
458462
continue
@@ -472,7 +476,7 @@ def callBam(params, processNo):
472476
print(
473477
f"Process {str(processNo)}: finished screening highly damaged reads in {currentTime: .2f} minutes. Blacklisted {len(read_blacklist)} ({percent_blocked: .2f}%) possible highly damaged read and started variant calling."
474478
)
475-
for rec, region in bamIterateMultipleRegion(bam, regions):
479+
for rec, region in bamIterateMultipleRegion(bam, regions, params.get("reference")):
476480
recCount += 1
477481
if recCount == currentCheckPoint:
478482
currentTime = (time.time() - starttime) / 60

src/subcommands/funcs/misc.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,13 @@ def createVcfStrings(chromDict, infoDict, formatDict, filterDict, recs):
4848
return "\n".join(lines) + "\n"
4949

5050

51-
def bamReadCount(bam, chrom, start, end):
52-
count = getAlignmentObject(bam, "rb").count(chrom, start, end)
51+
def bamReadCount(bam, chrom, start, end, ref):
52+
count = getAlignmentObject(bam, "rb", ref).count(chrom, start, end)
5353
return count
5454

5555

56-
def splitBamRegions(bams, num, contigs, step):
57-
bamObject = getAlignmentObject(bams[0], "rb")
56+
def splitBamRegions(bams, num, contigs, step, ref):
57+
bamObject = getAlignmentObject(bams[0], "rb", ref)
5858
contigs_set = set(contigs)
5959
contigs_sorted = [_ for _ in bamObject.references if _ in contigs]
6060

@@ -88,6 +88,7 @@ def splitBamRegions(bams, num, contigs, step):
8888
contig,
8989
current_start + k * step,
9090
current_start + k * step + step,
91+
ref,
9192
)
9293
for k in range(num)
9394
]
@@ -97,7 +98,7 @@ def splitBamRegions(bams, num, contigs, step):
9798
current_start += num * step
9899
current_window += num
99100
current_arguments = [
100-
(bam, contig, current_start + k * step, current_start + k * step + step)
101+
(bam, contig, current_start + k * step, current_start + k * step + step, ref)
101102
for k in range(window_nums_cumulative[c] - current_window)
102103
]
103104
pool = Pool()
@@ -126,11 +127,11 @@ def splitBamRegions(bams, num, contigs, step):
126127

127128

128129
## Allow reading either bam or cram as bamObject
129-
def getAlignmentObject(bam, refpath=None):
130+
def getAlignmentObject(bam, mode, refpath=None):
130131
if bam.endswith(".bam"):
131-
bamObject = BAM(bam, "rb")
132+
bamObject = BAM(bam, mode)
132133
elif bam.endswith(".cram"):
133-
bamObject = BAM(bam, "rc", reference_filename=refpath)
134+
bamObject = BAM(bam, mode, reference_filename=refpath)
134135
else:
135136
raise NameError(f"{bam} should have .bam or .cram as file extension")
136137
return bamObject

0 commit comments

Comments
 (0)