Skip to content

Commit a51502a

Browse files
authored
Fix bug in syllable stress enrichment from phones (#250)
1 parent 29c4fcd commit a51502a

File tree

4 files changed

+45
-12
lines changed

4 files changed

+45
-12
lines changed

polyglotdb/corpus/syllabic.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ def enrich_syllables(self, syllable_data, type_data=None):
456456
self.hierarchy.add_type_properties(self, 'syllable', type_data.items())
457457
self.encode_hierarchy()
458458

459-
def _generate_stress_enrichment(self, pattern):
459+
def _generate_stress_enrichment(self, pattern, clean_phone_label=True):
460460
syllable = self.syllable
461461
all_syls = self.query_graph(syllable).all()
462462
enrich_dict = {}
@@ -473,11 +473,12 @@ def _generate_stress_enrichment(self, pattern):
473473
end = nucleus[r.start(0):r.end(0)].replace("_", "")
474474
nucleus = re.sub(pattern, "", nucleus)
475475
fullpatt = str(nucleus) + str(pattern).replace("$", "")
476-
syl = re.sub(fullpatt, nucleus, syl)
476+
if clean_phone_label:
477+
syl = re.sub(fullpatt, nucleus, syl)
477478
enrich_dict.update({syl: {'stress': end}})
478479
return enrich_dict
479480

480-
def _generate_tone_enrichment(self, pattern):
481+
def _generate_tone_enrichment(self, pattern, clean_phone_label=True):
481482
syllable = self.syllable
482483
all_syls = self.query_graph(syllable).all()
483484
enrich_dict = {}
@@ -494,7 +495,8 @@ def _generate_tone_enrichment(self, pattern):
494495
end = nucleus[r.start(0):r.end(0)].replace("_", "")
495496
nucleus = re.sub(pattern, "", nucleus)
496497
fullpatt = str(nucleus) + str(pattern).replace("$", "")
497-
syl = re.sub(fullpatt, nucleus, syl)
498+
if clean_phone_label:
499+
syl = re.sub(fullpatt, nucleus, syl)
498500

499501
enrich_dict.update({syl: {'tone': end}})
500502
return enrich_dict
@@ -514,8 +516,7 @@ def encode_stress_to_syllables(self, regex=None, clean_phone_label=True):
514516
if regex is None:
515517
regex = '[0-9]'
516518

517-
enrich_dict = self._generate_stress_enrichment(regex)
518-
519+
enrich_dict = self._generate_stress_enrichment(regex, clean_phone_label)
519520
if clean_phone_label:
520521
self.remove_pattern(regex)
521522
self.enrich_syllables(enrich_dict)
@@ -536,7 +537,7 @@ def encode_tone_to_syllables(self, regex=None, clean_phone_label=True):
536537
if regex is None:
537538
regex = '[0-9]'
538539

539-
enrich_dict = self._generate_tone_enrichment(regex)
540+
enrich_dict = self._generate_tone_enrichment(regex, clean_phone_label)
540541

541542
if clean_phone_label:
542543
self.remove_pattern(regex)

polyglotdb/io/importer/from_csv.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
import neo4j
55
import re
66
import numpy as np
7-
import csv
8-
from ...acoustics.classes import (Track, TimePoint)
7+
import csv
98

109

1110
def make_path_safe(path):
@@ -1398,6 +1397,12 @@ def import_token_csv_with_timestamp(corpus_context, path, annotated_type, timest
13981397
properties : list
13991398
A list of column names to update; if None, assume all columns will be updated (default).
14001399
"""
1400+
1401+
# If on the Docker version, the files live in /site/proj
1402+
if os.path.exists('/site/proj') and not path.startswith('/site/proj'):
1403+
csv_path = 'file:///site/proj/{}'.format(make_path_safe(path))
1404+
else:
1405+
csv_path = 'file:///{}'.format(make_path_safe(path))
14011406
if properties is None:
14021407
with open(path, 'r') as f:
14031408
properties = [x.strip() for x in f.readline().split(',') if x.strip() not in [timestamp_column, discourse_column]]
@@ -1434,14 +1439,14 @@ def import_token_csv_with_timestamp(corpus_context, path, annotated_type, timest
14341439

14351440
statement = '''
14361441
CALL {{
1437-
LOAD CSV WITH HEADERS FROM "file://{path}" AS csvLine
1442+
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
14381443
MATCH (d:Discourse {{name: csvLine.{discourse_column}}})
14391444
MATCH (x:{a_type}:{corpus})-[:spoken_in]->(d)
14401445
WHERE x.begin <= toFloat(csvLine.{timestamp_column}) <= x.end
14411446
SET {property_update}
14421447
}} IN TRANSACTIONS OF 500 ROWS
14431448
'''.format(
1444-
path=path, a_type=annotated_type, corpus=corpus_context.cypher_safe_name,
1449+
path=csv_path, a_type=annotated_type, corpus=corpus_context.cypher_safe_name,
14451450
timestamp_column=timestamp_column, discourse_column=discourse_column, property_update=property_update
14461451
)
14471452

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ def summarized_config(graph_db, textgrid_test_dir):
508508
return config
509509

510510

511-
@pytest.fixture(scope='session')
511+
@pytest.fixture(scope='function')
512512
def stressed_config(graph_db, textgrid_test_dir):
513513
config = CorpusConfig('stressed', **graph_db)
514514

tests/test_enrich.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,33 @@ def test_stress_enrichment(stressed_config):
2727
assert (c.hierarchy.has_type_property("syllable", "stress"))
2828

2929

30+
def test_stress_enrichment_no_clean(stressed_config):
    """Check that stress is encoded on syllables when phone labels are kept.

    Runs ``encode_stress_to_syllables(clean_phone_label=False)`` and then
    queries the syllables of the word "began", asserting that each returned
    syllable carries a non-null ``stress`` value.
    """
    syllabics = "AA0,AA1,AA2,AH0,AH1,AH2,AE0,AE1,AE2,AY0,AY1,AY2,ER0,ER1,ER2,EH0,EH1,EH2,EY1,EY2,IH0,IH1,IH2,IY0,IY1,IY2,UW0,UW1,UW2".split(
        ",")
    with CorpusContext(stressed_config) as c:
        c.encode_syllabic_segments(syllabics)
        c.encode_syllables("maxonset")
        # Leave the stress digits on the phone labels; the enrichment must
        # still attach a stress value to every syllable.
        c.encode_stress_to_syllables(clean_phone_label=False)

        assert c.hierarchy.has_type_property("syllable", "stress")

        q = c.query_graph(c.syllable)
        q = q.filter(c.syllable.word.label == "began")

        q = q.columns(
            c.syllable.label.column_name('syllable'),
            c.syllable.stress.column_name('syllable_stress'),
            c.syllable.word.label.column_name('word'),
            c.syllable.word.begin.column_name('word_begin'),
            c.syllable.word.end.column_name('word_end'),
            c.syllable.discourse.name.column_name('file'),
        )
        q = q.limit(10)
        res = q.all()

        # Guard against a vacuous pass: without this, an empty result set
        # would skip the loop below and the test would succeed without
        # checking anything.
        # NOTE(review): assumes the 'stressed' test corpus contains the word
        # "began" -- confirm against the fixture data.
        assert len(res) > 0

        for r in res:
            assert r['syllable_stress'] is not None
55+
56+
3057
def test_relativized_enrichment_syllables(acoustic_config):
3158
with CorpusContext(acoustic_config) as c:
3259
# c.encode_measure("word_median")

0 commit comments

Comments
 (0)