Draft VWF to BED7 conversion

bede · bede · commit fcafa577902d · 2025-01-31T11:34:30.000Z
diff --git a/src/primaschema/cli.py b/src/primaschema/cli.py
@@ -137,6 +137,17 @@ def six_to_seven(bed_path: Path, fasta_path: Path):
     print(bed_str)
 
 
+def vwf_to_bed(vwf_path: Path, chrom: str = "chrom"):
+    """
+    Convert a Viridian VWF scheme TSV to a 7 column primer.bed
+
+    :arg vwf_path: path of scheme.bed file
+    :arg chrom: name of reference chromosome
+    """
+    bed_str = lib.convert_vwf_to_primer_bed(vwf_path=vwf_path, chrom=chrom)
+    print(bed_str)
+
+
 def diff(bed1_path: Path, bed2_path: Path, only_positions: bool = False):
     """
     Show the symmetric difference of records in two bed files
@@ -215,6 +226,7 @@ def main():
             "diff": diff,
             "6to7": six_to_seven,
             "7to6": seven_to_six,
+            "vwftobed": vwf_to_bed,
             "plot": plot,
             "show-intervals": amplicon_intervals,
             "show-discordant-primers": discordant_primers,
diff --git a/src/primaschema/lib.py b/src/primaschema/lib.py
@@ -170,6 +170,38 @@ def convert_scheme_bed_to_primer_bed(bed_path: Path, fasta_path: Path) -> str:
     return df.to_csv(sep="\t", header=False, index=False)
 
 
+def convert_vwf_to_primer_bed(vwf_path: Path, chrom: str = "chrom") -> str:
+    vwf_df = pd.read_csv(vwf_path, sep="\t")
+    bed_records = []
+    pool_counter = {}
+
+    for r in vwf_df.to_records("dict"):
+        amplicon_name = r["Amplicon_name"]
+        primer_name = r["Primer_name"]
+        orientation = r["Left_or_right"]
+        amplicon_number = int(amplicon_name.split("_")[-1])
+        pool_name = 1 if amplicon_number % 2 != 0 else 2
+        if amplicon_name not in pool_counter:
+            pool_counter[amplicon_name] = 1
+        else:
+            pool_counter[amplicon_name] += 1
+        strand = "+" if orientation == "left" else "-"
+        sequence = r["Sequence"]
+        start_pos = r["Position"]
+        bed_record = {}
+        bed_record["chrom"] = chrom
+        bed_record["chromStart"] = start_pos
+        bed_record["chromEnd"] = start_pos + len(sequence)
+        bed_record["name"] = primer_name
+        bed_record["poolName"] = str(pool_name)
+        bed_record["strand"] = strand
+        bed_record["sequence"] = sequence
+        bed_records.append(bed_record)
+
+    bed_df = pd.DataFrame(bed_records)
+    return bed_df.to_csv(sep="\t", header=False, index=False)
+
+
 def hash_bed(bed_path: Path) -> str:
     bed_type = infer_bed_type(bed_path)
     if bed_type == "primer":