Added merging of headers when using the sfm command with a directory

caherzee · caherzee · commit 8b3aeb4d503b · 2021-10-01T11:45:16.000+02:00
containing multiple files as input.
The merging respects the requirements outlined in the SAM spec:
- order of sequence dictionaries is kept
- read group identifiers are made unique if needed + optional rg tags of
  reads are updated accordingly
- program line identifiers are made unique if needed + optional pg tags
  of reads are updated accordingly
- comment lines are merged
- the sorting order is set to Unknown
If elprep cannot merge while guaranteeing the above constraints, an
error is produced.
diff --git a/README.md b/README.md
@@ -772,7 +772,7 @@ file is created for each entry in the sequence dictionary.
 
 The elprep split command requires two arguments: 1) the input file or a path to multiple input files and 2) a path to a directory where elPrep can store the split files. The input file(s) can be .sam or .bam. It is also possible to use /dev/stdin as the input for using Unix pipes. There are no structural requirements on the input file(s) for using elprep split. For example, it is not necessary to sort the input file, nor is it necessary to convert to .bam or index the input file.
 
-Warning: If you pass a path to multiple input files to the elprep split command, elprep assumes that they all have the same (or compatible) headers, and just picks the first header it finds as the header for all input files. elprep currently does not make an attempt to resolve potential conflicts between headers, especially with regard to the @SQ, @RG, or @PG header records. We will include proper merging of different SAM/BAM files in a future version of elprep. In the meantime, if you need proper merging of SAM/BAM files, please use samtools merge, Picard MergeSamFiles, or a similar tool. (If such a tool produces SAM file as output, it can be piped into elprep using Unix pipes.)
+If you pass a path to multiple input files to the elprep split command, elprep attempts to merge the headers, resolving potential conflicts by adhering to the SAM specification. Specifically, while merging headers: 1) the order of sequence dictionaries must be kept (@sq); 2) read group identifiers must be unique (@rg) and in case of collisions elprep makes the IDs unique and updates optional @rg tags in alignments accordingly; 3) program identifiers must be unique (@pg) and elprep changes IDs to be unique in case of collisions and updates optional @pg tags in alignments accordingly; 4) comment lines are all merged (any order); 5) the order of the header is updated (GO entry). In case the headers are incompatible and merging violates any of the SAM specification requirements, elPrep produces an error and aborts the execution of the command.
 
 elPrep creates the output directory denoted by the output path, unless the directory already exists, in which case elPrep may override the existing files in that directory. Please make sure elPrep has the correct permissions for writing that directory.
 
diff --git a/sam/sam-types.go b/sam/sam-types.go
@@ -358,6 +358,11 @@ func (aln *Alignment) SetRG(rg interface{}) {
 	aln.TAGS.Set(RG, rg)
 }
 
+// SetPG sets the PG optional field.
+func (aln *Alignment) SetPG(pg interface{}) {
+	aln.TAGS.Set(PG, pg)
+}
+
 // REFID returns the REFID temporary field.
 //
 // If REFID field is not set, this will panic with a log message. The
diff --git a/sam/split-merge.go b/sam/split-merge.go
@@ -229,9 +229,16 @@ func (hdr *Header) Contigs() (contigs []string, ok bool) {
 // requirements on the input file for splitting.
 func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) {
 	inputPath, files := internal.Directory(input)
+	var header *Header
+	var filters []AlignmentFilter
 	firstFile := filepath.Join(inputPath, files[0])
 	firstIn := Open(firstFile)
-	header := firstIn.ParseHeader()
+	if len(files) > 1 {
+		header, filters = MergeInputs(inputPath, files)
+		firstIn.SkipHeader() // skip header because it won't be used as it is replaced by the merged header
+	} else {
+		header = firstIn.ParseHeader()
+	}
 	groups, contigToGroup, groupToContigs := computeContigGroups(header.SQ, contigGroupSize)
 	splitsPath := filepath.Join(outputPath, "splits")
 	internal.MkdirAll(splitsPath, 0700)
@@ -266,8 +273,15 @@ func SplitFilePerChromosome(input, outputPath, outputPrefix, outputExtension str
 			pipeline.LimitedPar(0, BytesToAlignment(in)),
 			pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} {
 				for _, aln := range data.([]*Alignment) {
+					//execute the filters that may update optional rg and pg tags in reads
+					for _, filter := range filters {
+						filter(aln)
+					}
 					group := contigToGroup[aln.RNAME]
 					out := groupFiles[group]
+					if out == nil {
+						log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, "RNAME: ", aln.RNAME)
+					}
 					buf = out.FormatAlignment(aln, buf[:0])
 					if aln.RNEXT == "=" || aln.RNAME == "*" || contigToGroup[aln.RNEXT] == group {
 						out.Write(buf) // untagged
@@ -649,9 +663,16 @@ func MergeSortedFilesSplitPerChromosomeWithoutSpreadFile(inputPath, output, inpu
 // are no requirements on the input file for splitting.
 func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExtension string, contigGroupSize int) {
 	inputPath, files := internal.Directory(input)
+	var header *Header
+	var filters []AlignmentFilter
 	firstFile := filepath.Join(inputPath, files[0])
 	firstIn := Open(firstFile)
-	header := firstIn.ParseHeader()
+	if len(files) > 1 {
+		header, filters = MergeInputs(inputPath, files)
+		firstIn.SkipHeader()
+	} else {
+		header = firstIn.ParseHeader()
+	}
 	groups, contigToGroup, _ := computeContigGroups(header.SQ, contigGroupSize)
 	groupFiles := make(map[string]*OutputFile)
 	header.AddUserRecord("@sr", utils.StringMap{"co": "This file was created using elprep split --single-end."})
@@ -673,7 +694,14 @@ func SplitSingleEndFilePerChromosome(input, outputPath, outputPrefix, outputExte
 			pipeline.LimitedPar(0, BytesToAlignment(in)),
 			pipeline.StrictOrd(pipeline.Receive(func(_ int, data interface{}) interface{} {
 				for _, aln := range data.([]*Alignment) {
+					//execute the filters that may update optional rg and pg tags in reads
+					for _, filter := range filters {
+						filter(aln)
+					}
 					out := groupFiles[contigToGroup[aln.RNAME]]
+					if out == nil {
+						log.Panic("Attempting to output a read mapped to a region not present in the header: QNAME: ", aln.QNAME, " RNAME: ", aln.RNAME)
+					}
 					buf = out.FormatAlignment(aln, buf[:0])
 					out.Write(buf) // untagged
 				}
diff --git a/utils/programinfo.go b/utils/programinfo.go
@@ -23,7 +23,7 @@ const (
 	ProgramName = "elprep"
 
 	// ProgramVersion is the version of the elprep binary
-	ProgramVersion = "5.0.2"
+	ProgramVersion = "5.1.0"
 
 	// ProgramURL is the repository for the elprep source code
 	ProgramURL = "http://github.com/exascience/elprep"