Skip to content

Commit 8b3aeb4

Browse files
committed
Added merging of headers when using the sfm command with a directory
containing multiple files as input. The merging respects the requirements outlined in the SAM spec: - order of sequence dictionaries is kept - read group identifiers are made unique if needed + optional rg tags of reads are updated accordingly - program line identifiers are made unique if needed + optional pg tags of reads are updated accordingly - comment lines are merged - the sorting order is set to Unknown If elprep cannot merge while guaranteeing the above constraints, an error is produced.
1 parent 97a6c0a commit 8b3aeb4

File tree

4 files changed

+37
-4
lines changed

4 files changed

+37
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ file is created for each entry in the sequence dictionary.
772772

773773
The elprep split command requires two arguments: 1) the input file or a path to multiple input files and 2) a path to a directory where elPrep can store the split files. The input file(s) can be .sam or .bam. It is also possible to use /dev/stdin as the input for using Unix pipes. There are no structural requirements on the input file(s) for using elprep split. For example, it is not necessary to sort the input file, nor is it necessary to convert to .bam or index the input file.
774774

775-
Warning: If you pass a path to multiple input files to the elprep split command, elprep assumes that they all have the same (or compatible) headers, and just picks the first header it finds as the header for all input files. elprep currently does not make an attempt to resolve potential conflicts between headers, especially with regard to the @SQ, @RG, or @PG header records. We will include proper merging of different SAM/BAM files in a future version of elprep. In the meantime, if you need proper merging of SAM/BAM files, please use samtools merge, Picard MergeSamFiles, or a similar tool. (If such a tool produces SAM file as output, it can be piped into elprep using Unix pipes.)
775+
If you pass a path to multiple input files to the elprep split command, elprep attempts to merge the headers, resolving potential conflicts by adhering to the SAM specification. Specifically, while merging headers: 1) the order of sequence dictionaries must be kept (@sq); 2) read group identifiers must be unique (@rg) and in case of collisions elprep makes the IDs unique and updates optional @rg tags in alignments accordingly; 3) program identifiers must be unique (@pg) and elprep changes IDs to be unique in case of collisions and updates optional @pg tags in alignments accordingly; 4) comment lines are all merged (any order); 5) the order of the header is updated (GO entry). In case the headers are incompatible and merging violates any of the SAM specification requirements, elPrep produces an error and aborts the execution of the command.
776776

777777
elPrep creates the output directory denoted by the output path, unless the directory already exists, in which case elPrep may override the existing files in that directory. Please make sure elPrep has the correct permissions for writing that directory.
778778

sam/sam-types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,11 @@ func (aln *Alignment) SetRG(rg interface{}) {
358358
aln.TAGS.Set(RG, rg)
359359
}
360360

361+
// SetPG sets the PG optional field.
362+
func (aln *Alignment) SetPG(pg interface{}) {
363+
aln.TAGS.Set(PG, pg)
364+
}
365+
361366
// REFID returns the REFID temporary field.
362367
//
363368
// If REFID field is not set, this will panic with a log message. The

sam/split-merge.go

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,16 @@ func (hdr *Header) Contigs() (contigs []string, ok bool) {
229229
// requirements on the input file for splitting.
230230
func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) {
231231
inputPath, files := internal.Directory(input)
232+
var header *Header
233+
var filters []AlignmentFilter
232234
firstFile := filepath.Join(inputPath, files[0])
233235
firstIn := Open(firstFile)
234-
header := firstIn.ParseHeader()
236+
if len(files) > 1 {
237+
header, filters = MergeInputs(inputPath, files)
238+
firstIn.SkipHeader() // skip header because it won't be used as it is replaced by the merged header
239+
} else {
240+
header = firstIn.ParseHeader()
241+
}
235242
groups, contigToGroup, groupToContigs := computeContigGroups(header.SQ, contigGroupSize)
236243
splitsPath := filepath.Join(outputPath, "splits")
237244
internal.MkdirAll(splitsPath, 0700)
@@ -266,8 +273,15 @@ func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension str
266273
pipeline.LimitedPar(0, BytesToAlignment(in)),
267274
pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} {
268275
for _, aln := range data.([]*Alignment) {
276+
//execute the filters that may update optional rg and pg tags in reads
277+
for _, filter := range filters {
278+
filter(aln)
279+
}
269280
group := contigToGroup[aln.RNAME]
270281
out := groupFiles[group]
282+
if out == nil {
283+
log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, "RNAME: ", aln.RNAME)
284+
}
271285
buf = out.FormatAlignment(aln, buf[:0])
272286
if aln.RNEXT == "=" || aln.RNAME == "*" || contigToGroup[aln.RNEXT] == group {
273287
out.Write(buf) // untagged
@@ -649,9 +663,16 @@ func MergeSortedFilesSplitPerChromosomeWithoutSpreadFile(inputPath, output, inpu
649663
// are no requirements on the input file for splitting.
650664
func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) {
651665
inputPath, files := internal.Directory(input)
666+
var header *Header
667+
var filters []AlignmentFilter
652668
firstFile := filepath.Join(inputPath, files[0])
653669
firstIn := Open(firstFile)
654-
header := firstIn.ParseHeader()
670+
if len(files) > 1 {
671+
header, filters = MergeInputs(inputPath, files)
672+
firstIn.SkipHeader()
673+
} else {
674+
header = firstIn.ParseHeader()
675+
}
655676
groups, contigToGroup, _ := computeContigGroups(header.SQ, contigGroupSize)
656677
groupFiles := make(map[string]*OutputFile)
657678
header.AddUserRecord("@sr", utils.StringMap{"co": "This file was created using elprep split --single-end."})
@@ -673,7 +694,14 @@ func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExte
673694
pipeline.LimitedPar(0, BytesToAlignment(in)),
674695
pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} {
675696
for _, aln := range data.([]*Alignment) {
697+
//execute the filters that may update optional rg and pg tags in reads
698+
for _, filter := range filters {
699+
filter(aln)
700+
}
676701
out := groupFiles[contigToGroup[aln.RNAME]]
702+
if out == nil {
703+
log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, " RNAME: ", aln.RNAME)
704+
}
677705
buf = out.FormatAlignment(aln, buf[:0])
678706
out.Write(buf) // untagged
679707
}

utils/programinfo.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ const (
2323
ProgramName = "elprep"
2424

2525
// ProgramVersion is the version of the elprep binary
26-
ProgramVersion = "5.0.2"
26+
ProgramVersion = "5.1.0"
2727

2828
// ProgramURL is the repository for the elprep source code
2929
ProgramURL = "http://github.com/exascience/elprep"

0 commit comments

Comments
 (0)