Merge branch 'master' of github.com:TalkBank/batchalign2

Jemoka · Jemoka · commit 2347fc5f3edc · 2025-01-31T12:44:35.000-08:00
diff --git a/batchalign/cli/cli.py b/batchalign/cli/cli.py
@@ -298,6 +298,7 @@ def writer(doc, output):
             df.write(str(doc["wer"]))
         with open(Path(output).with_suffix(".diff"), 'w') as df:
             df.write(str(doc["diff"]))
+        CHATFile(doc=doc["doc"]).write(str(Path(output).with_suffix(".asr.cha")))
 
 
     _dispatch("benchmark", lang, num_speakers, ["mp3", "mp4", "wav"], ctx,
diff --git a/batchalign/formats/chat/parser.py b/batchalign/formats/chat/parser.py
@@ -120,7 +120,7 @@ def chat_parse_utterance(text, mor, gra, wor, additional):
     if wor == None:
         wor = [None for i in range(len(phonated_words))]
     else:
-        words = re.findall(rf"[^{''.join(ENDING_PUNCT)} ]+ ?(\x15\d+_\d+\x15)?", wor)
+        words = re.findall(rf"[^{''.join([i for i in ENDING_PUNCT if len(i) == 1])} ]+ ?(\x15\d+_\d+\x15)?", wor)
         wor = []
         for i in words:
             if i.strip() == "":
diff --git a/batchalign/pipelines/analysis/eval.py b/batchalign/pipelines/analysis/eval.py
@@ -148,7 +148,8 @@ def analyze(self, doc, **kwargs):
 
         return {
             "wer": wer,
-            "diff": diff
+            "diff": diff,
+            "doc": doc
         }
 
 
diff --git a/batchalign/pipelines/asr/utils.py b/batchalign/pipelines/asr/utils.py
@@ -4,6 +4,10 @@
 
 from batchalign.constants import ENDING_PUNCT
 
+from num2words import num2words
+import pycountry
+
+
 def retokenize(intermediate_output):
     """Retokenize the output of the ASR system from one giant blob to utterances
 
@@ -153,6 +157,17 @@ def process_generation(output, lang="eng", utterance_engine=None):
             for part in word_parts:
                 final_words.append([part.strip(), [cur, cur+div]])
                 cur += div
+
+        lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
+        def catched_num2words(i):
+            if not i.isdigit():
+                return i
+            try:
+                return num2words(i, lang=lang_2)
+            except NotImplementedError:
+                return i
+        final_words = [[catched_num2words(i), j] for i,j in final_words]
+
         # if the final words is > 300, split into n parts
         if len(final_words) > 300:
             # for each group, append
diff --git a/batchalign/pipelines/morphosyntax/ud.py b/batchalign/pipelines/morphosyntax/ud.py
@@ -990,7 +990,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
                     content.dependency = form.dependency
 
         except Exception as e:
-            warnings.warn(f"Utterance failed parsing, skipping ud tagging... line='{line}', error='{e}'.\n")
+            pass
+            # warnings.warn(f"Utterance failed parsing, skipping ud tagging... line='{line}', error='{e}'.\n")
 
     L.debug("Stanza done.")
     return doc
diff --git a/scratchpad.py b/scratchpad.py
@@ -67,7 +67,6 @@
 
 
 
-
 # # ng = NgramRetraceEngine()
 # # # disf = DisfluencyReplacementEngine()
 
@@ -264,16 +263,15 @@
 ########### The Batchalign String Test Harness ###########
 # from batchalign.formats.chat.parser import chat_parse_utterance
  
-# file = "/Users/houjun/Documents/Projects/talkbank-alignment/test_harness/input/Untitled.wav"
-
-# function = "asr"
-# lang = "spa"
-# num_speakers = 1
-
-# ut = Document.new(media_path=file, lang=lang)
-
-# pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers)
-# doc = pipeline(ut)
+file = "/Users/houjun/Documents/Projects/talkbank-alignment/input/spanish.mp3"
+function = "asr"
+lang = "spa"
+num_speakers = 1
+
+ut = Document.new(media_path=file, lang=lang)
+pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers, asr="rev")
+doc = pipeline(ut)
+doc
 # doc.content
 # # doc[0][-1]
 # # doc[0][-2].model_dump()
diff --git a/setup.py b/setup.py
@@ -51,6 +51,7 @@ def read(fname):
         "soundfile~=0.12.0",
         "rich-click>=1.7.0",
         "typing-extensions",
+        "num2words",
     ],
     extras_require={
         'dev': [

Original file line number	Diff line number	Diff line change
`@@ -148,7 +148,8 @@ def analyze(self, doc, **kwargs):`
`148`	`148`
`149`	`149`	`return {`
`150`	`150`	`"wer": wer,`
`151`		`- "diff": diff`
	`151`	`+ "diff": diff,`
	`152`	`+ "doc": doc`
`152`	`153`	`}`
`153`	`154`
`154`	`155`