Fix bug in syllable stress enrichment from phones (#250)

mmcauliffe · web-flow · commit a51502a7e301 · 2025-11-18T13:27:08.000-08:00
diff --git a/polyglotdb/corpus/syllabic.py b/polyglotdb/corpus/syllabic.py
@@ -456,7 +456,7 @@ def enrich_syllables(self, syllable_data, type_data=None):
         self.hierarchy.add_type_properties(self, 'syllable', type_data.items())
         self.encode_hierarchy()
 
-    def _generate_stress_enrichment(self, pattern):
+    def _generate_stress_enrichment(self, pattern, clean_phone_label=True):
         syllable = self.syllable
         all_syls = self.query_graph(syllable).all()
         enrich_dict = {}
@@ -473,11 +473,12 @@ def _generate_stress_enrichment(self, pattern):
                 end = nucleus[r.start(0):r.end(0)].replace("_", "")
                 nucleus = re.sub(pattern, "", nucleus)
                 fullpatt = str(nucleus) + str(pattern).replace("$", "")
-                syl = re.sub(fullpatt, nucleus, syl)
+                if clean_phone_label:
+                    syl = re.sub(fullpatt, nucleus, syl)
                 enrich_dict.update({syl: {'stress': end}})
         return enrich_dict
 
-    def _generate_tone_enrichment(self, pattern):
+    def _generate_tone_enrichment(self, pattern, clean_phone_label=True):
         syllable = self.syllable
         all_syls = self.query_graph(syllable).all()
         enrich_dict = {}
@@ -494,7 +495,8 @@ def _generate_tone_enrichment(self, pattern):
                     end = nucleus[r.start(0):r.end(0)].replace("_", "")
                     nucleus = re.sub(pattern, "", nucleus)
                     fullpatt = str(nucleus) + str(pattern).replace("$", "")
-                    syl = re.sub(fullpatt, nucleus, syl)
+                    if clean_phone_label:
+                        syl = re.sub(fullpatt, nucleus, syl)
 
                     enrich_dict.update({syl: {'tone': end}})
         return enrich_dict
@@ -514,8 +516,7 @@ def encode_stress_to_syllables(self, regex=None, clean_phone_label=True):
         if regex is None:
             regex = '[0-9]'
 
-        enrich_dict = self._generate_stress_enrichment(regex)
-
+        enrich_dict = self._generate_stress_enrichment(regex, clean_phone_label)
         if clean_phone_label:
             self.remove_pattern(regex)
         self.enrich_syllables(enrich_dict)
@@ -536,7 +537,7 @@ def encode_tone_to_syllables(self, regex=None, clean_phone_label=True):
         if regex is None:
             regex = '[0-9]'
 
-        enrich_dict = self._generate_tone_enrichment(regex)
+        enrich_dict = self._generate_tone_enrichment(regex, clean_phone_label)
 
         if clean_phone_label:
             self.remove_pattern(regex)
diff --git a/polyglotdb/io/importer/from_csv.py b/polyglotdb/io/importer/from_csv.py
@@ -4,8 +4,7 @@
 import neo4j
 import re
 import numpy as np
-import csv 
-from ...acoustics.classes import (Track, TimePoint)
+import csv
 
 
 def make_path_safe(path):
@@ -1398,6 +1397,12 @@ def import_token_csv_with_timestamp(corpus_context, path, annotated_type, timest
     properties : list
         A list of column names to update; if None, assume all columns will be updated (default).
     """
+
+    # If on the Docker version, the files live in /site/proj
+    if os.path.exists('/site/proj') and not path.startswith('/site/proj'):
+        csv_path = 'file:///site/proj/{}'.format(make_path_safe(path))
+    else:
+        csv_path = 'file:///{}'.format(make_path_safe(path))
     if properties is None:
         with open(path, 'r') as f:
             properties = [x.strip() for x in f.readline().split(',') if x.strip() not in [timestamp_column, discourse_column]]
@@ -1434,14 +1439,14 @@ def import_token_csv_with_timestamp(corpus_context, path, annotated_type, timest
 
     statement = '''
     CALL {{
-        LOAD CSV WITH HEADERS FROM "file://{path}" AS csvLine
+        LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
         MATCH (d:Discourse {{name: csvLine.{discourse_column}}})
         MATCH (x:{a_type}:{corpus})-[:spoken_in]->(d)
         WHERE x.begin <= toFloat(csvLine.{timestamp_column}) <= x.end
         SET {property_update}
     }} IN TRANSACTIONS OF 500 ROWS
     '''.format(
-        path=path, a_type=annotated_type, corpus=corpus_context.cypher_safe_name,
+        path=csv_path, a_type=annotated_type, corpus=corpus_context.cypher_safe_name,
         timestamp_column=timestamp_column, discourse_column=discourse_column, property_update=property_update
     )
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -508,7 +508,7 @@ def summarized_config(graph_db, textgrid_test_dir):
     return config
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope='function')
 def stressed_config(graph_db, textgrid_test_dir):
     config = CorpusConfig('stressed', **graph_db)
 
diff --git a/tests/test_enrich.py b/tests/test_enrich.py
@@ -27,6 +27,33 @@ def test_stress_enrichment(stressed_config):
         assert (c.hierarchy.has_type_property("syllable", "stress"))
 
 
+def test_stress_enrichment_no_clean(stressed_config):
+    syllabics = "AA0,AA1,AA2,AH0,AH1,AH2,AE0,AE1,AE2,AY0,AY1,AY2,ER0,ER1,ER2,EH0,EH1,EH2,EY1,EY2,IH0,IH1,IH2,IY0,IY1,IY2,UW0,UW1,UW2".split(
+        ",")
+    with CorpusContext(stressed_config) as c:
+        c.encode_syllabic_segments(syllabics)
+        c.encode_syllables("maxonset")
+        c.encode_stress_to_syllables(clean_phone_label=False)
+
+        assert (c.hierarchy.has_type_property("syllable", "stress"))
+
+        q = c.query_graph(c.syllable)
+        q = q.filter(c.syllable.word.label == "began")
+
+        q = q.columns(
+            c.syllable.label.column_name('syllable'),
+            c.syllable.stress.column_name('syllable_stress'),
+            c.syllable.word.label.column_name('word'),
+            c.syllable.word.begin.column_name('word_begin'),
+            c.syllable.word.end.column_name('word_end'),
+            c.syllable.discourse.name.column_name('file'),
+        )
+        q = q.limit(10)
+        res = q.all()
+        for r in res:
+            assert r['syllable_stress'] is not None
+
+
 def test_relativized_enrichment_syllables(acoustic_config):
     with CorpusContext(acoustic_config) as c:
         # c.encode_measure("word_median")