wav2vec support is added

Jemoka · Jemoka · commit 769c55bf70de · 2025-02-06T12:08:59.000-08:00
diff --git a/batchalign/cli/cli.py b/batchalign/cli/cli.py
@@ -107,10 +107,12 @@ def batchalign(ctx, verbose):
 @common_options
 @click.option("--whisper/--rev",
               default=False, help="For utterance timing recovery, OpenAI Whisper (ASR) instead of Rev.AI (default).")
+@click.option("--wav2vec/--whisper_fa",
+              default=False, help="Use Whisper instead of Wav2Vec for English (defaults for Whisper for non-English)")
 @click.option("--pauses", type=bool, default=False, help="Should we try to bullet each word or should we try to add pauses in between words by grouping them? Default: no pauses.", is_flag=True)
 
 @click.pass_context
-def align(ctx, in_dir, out_dir, whisper, **kwargs):
+def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
     """Align transcripts against corresponding media files."""
     def loader(file):
         return (
@@ -121,12 +123,22 @@ def loader(file):
     def writer(doc, output):
         CHATFile(doc=doc).write(output)
 
-    _dispatch("align", "eng", 1,
-              ["cha"], ctx,
-              in_dir, out_dir,
-              loader, writer, C,
-              utr="whisper_utr" if whisper else "rev_utr",
-              **kwargs)
+    if not wav2vec:
+        _dispatch("align", "eng", 1,
+                  ["cha"], ctx,
+                  in_dir, out_dir,
+                  loader, writer, C,
+                  fa="whisper_fa",
+                  utr="whisper_utr" if whisper else "rev_utr",
+                  **kwargs)
+    else:
+        _dispatch("align", "eng", 1,
+                  ["cha"], ctx,
+                  in_dir, out_dir,
+                  loader, writer, C,
+                  fa="wav2vec_fa",
+                  utr="whisper_utr" if whisper else "rev_utr",
+                  **kwargs)
 
 #################### TRANSCRIBE ################################
 
diff --git a/batchalign/pipelines/__init__.py b/batchalign/pipelines/__init__.py
@@ -6,7 +6,7 @@
 from .cleanup import NgramRetraceEngine, DisfluencyReplacementEngine
 from .speaker import NemoSpeakerEngine
 
-from .fa import WhisperFAEngine
+from .fa import WhisperFAEngine, Wave2VecFAEngine
 from .utr import WhisperUTREngine, RevUTREngine
 
 from .analysis import EvaluationEngine
diff --git a/batchalign/pipelines/dispatch.py b/batchalign/pipelines/dispatch.py
@@ -6,7 +6,7 @@
 from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
                         NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
                         RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
-                        StanzaUtteranceEngine, CorefEngine)
+                        StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
 from batchalign import BatchalignPipeline
 from batchalign.models import resolve
 
@@ -127,7 +127,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
             engines.append(StanzaUtteranceEngine())
         elif engine == "stanza_coref":
             engines.append(CorefEngine())
-
+        elif engine == "wav2vec_fa":
+            engines.append(Wave2VecFAEngine())
 
     L.debug(f"Done initalizing packages.")
     return BatchalignPipeline(*engines)
diff --git a/batchalign/pipelines/fa/__init__.py b/batchalign/pipelines/fa/__init__.py
@@ -1 +1,2 @@
 from .whisper_fa import WhisperFAEngine
+from .wave2vec_fa import Wave2VecFAEngine
diff --git a/batchalign/pipelines/fa/wave2vec_fa.py b/batchalign/pipelines/fa/wave2vec_fa.py
@@ -27,18 +27,21 @@ def process(self, doc:Document, **kwargs):
         # check that the document has a media path to align to
         assert doc.media != None and doc.media.url != None, f"We cannot forced-align something that doesn't have a media path! Provided media tier='{doc.media}'"
 
+        if doc.langs[0] != "eng":
+            warnings.warn("Looks like you are not aligning English with wav2vec; this works for a lot of Roman languages, but outside of that your milage may vary.")
+
         # load the audio file
         L.debug(f"Wave2Vec FA is loading url {doc.media.url}...")
         f = self.__wav2vec.load(doc.media.url)
-        L.debug(f"Wave2Vec FA finished loading media.")
+        L.debug(f"Wav2Vec FA finished loading media.")
 
         # collect utterances 30 secondish segments to be aligned for whisper
         # we have to do this because whisper does poorly with very short segments
         groups = []
         group = []
         seg_start = 0
 
-        L.debug(f"Wave2Vec FA finished loading media.")
+        L.debug(f"Wav2Vec FA finished loading media.")
 
         for i in doc.content:
             if not isinstance(i, Utterance):
@@ -59,7 +62,7 @@ def process(self, doc:Document, **kwargs):
 
         groups.append(group)
 
-        L.debug(f"Begin Whisper Inference...")
+        L.debug(f"Begin Wav2Vec Inference...")
 
         for indx, grp in enumerate(groups):
             L.info(f"Wave2Vec FA processing segment {indx+1}/{len(groups)}...")
diff --git a/batchalign/version b/batchalign/version
@@ -1,3 +1,3 @@
-0.7.11-beta.3
-Feburary 2nd, 2025
-Incorporate additional pauses
+0.7.11-beta.4
+February 6th, 2025
+Wav2vec support!
diff --git a/scratchpad.py b/scratchpad.py
@@ -48,7 +48,9 @@
 
 # print(str(CHATFile(doc=doc)))
 
-# doc = CHATFile(path="../talkbank-alignment/input/barry.cha").doc
+# doc = CHATFile(path="../talkbank-alignment/testing_playground_2/input/test.cha").doc
+# pipe = Wave2
+
 # doc[3][0]
 # て
 # print(str(CHATFile(doc=res)))
@@ -99,6 +101,8 @@
 # ppe = pipeline
 # cha = CHATFile(path="../talkbank-alignment/testing_playground_2/input/test.cha")
 # doc = cha.doc
+# engine = Wave2VecFAEngine()
+# res = engine(doc)
 
 # # print(str(CHATFile(doc=doc)))
 # result = ppe(doc)
@@ -263,15 +267,15 @@
 ########### The Batchalign String Test Harness ###########
 # from batchalign.formats.chat.parser import chat_parse_utterance
  
-file = "/Users/houjun/Documents/Projects/talkbank-alignment/input/spanish.mp3"
-function = "asr"
-lang = "spa"
-num_speakers = 1
-
-ut = Document.new(media_path=file, lang=lang)
-pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers, asr="rev")
-doc = pipeline(ut)
-doc
+# file = "/Users/houjun/Documents/Projects/talkbank-alignment/input/spanish.mp3"
+# function = "asr"
+# lang = "spa"
+# num_speakers = 1
+
+# ut = Document.new(media_path=file, lang=lang)
+# pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers, asr="rev")
+# doc = pipeline(ut)
+# doc
 # doc.content
 # # doc[0][-1]
 # # doc[0][-2].model_dump()

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`from .whisper_fa import WhisperFAEngine`
	`2`	`+from .wave2vec_fa import Wave2VecFAEngine`