Skip to content
This repository was archived by the owner on Oct 6, 2025. It is now read-only.

Commit d4cd91f

Browse files
committed
Add script for re-ordering lexicon
1 parent d0d7066 commit d4cd91f

File tree

1 file changed

+73
-0
lines changed

1 file changed

+73
-0
lines changed

bin/reorder_lexicon.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""Re-order the pronunciations of each lexicon word by observed frequency.

The lexicon is read from stdin, one entry per line, with the word in the
first whitespace-separated column and the phonemes starting at column
``phoneme_col``.  Alignments are read from a JSONL file in which every line
is an object with a ``"prons"`` list of ``{"word": ..., "phones": [...]}``
items.  The lexicon lines are written to stdout with words in sorted order
and, within a word, the most frequently aligned pronunciation first.
"""
import json
import sys
from collections import Counter, defaultdict

# Alignment tokens that are never counted as pronunciations.
SKIP_WORDS = {"<eps>", "<unk>"}

USAGE = "Usage: reorder_lexicon.py aligned_phones.jsonl [phoneme_col=1] < lexicon.txt > reordered_lexicon.txt"


def load_lexicon(lines, phoneme_col=1):
    """Parse lexicon lines into ``{word: {phonemes_tuple: original_line}}``.

    Blank lines are skipped.  A pronunciation that is repeated for the same
    word keeps only the last line seen.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    lexicon = defaultdict(dict)
    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            continue

        parts = line.split()
        # Explicit check rather than ``assert`` so it survives ``python -O``.
        if len(parts) < 2:
            raise ValueError(
                f"Lexicon line {line_number} needs a word and at least one more column: {line!r}"
            )

        word = parts[0]
        phonemes = tuple(parts[phoneme_col:])
        lexicon[word][phonemes] = line

    return lexicon


def count_pronunciations(alignment_lines, lexicon):
    """Count how often each pronunciation of a lexicon word was aligned.

    Words in ``SKIP_WORDS`` and words missing from ``lexicon`` are ignored.
    Returns ``{word: Counter({phonemes_tuple: count})}``.
    """
    pron_counts = defaultdict(Counter)
    for raw_line in alignment_lines:
        line = raw_line.strip()
        if not line:
            continue

        alignment = json.loads(line)
        for word_pron in alignment["prons"]:
            word = word_pron["word"]
            if word in SKIP_WORDS or word not in lexicon:
                continue

            pron_counts[word][tuple(word_pron["phones"])] += 1

    return pron_counts


def reorder_lexicon(lexicon, pron_counts):
    """Yield lexicon lines, words sorted, pronunciations by descending count.

    The sort is stable, so pronunciations with equal counts (including those
    never seen in the alignments) keep their original lexicon order.
    """
    for word in sorted(lexicon):
        # ``.get`` avoids inserting empty counters into a defaultdict.
        word_pron_counts = pron_counts.get(word, {})
        ordered = sorted(
            lexicon[word].items(),
            key=lambda kv: word_pron_counts.get(kv[0], 0),
            reverse=True,
        )
        for _, line in ordered:
            yield line


def main(argv=None):
    """Run the command-line tool and return its exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # stdout is normally redirected to the new lexicon file, so the usage
        # text must go to stderr to be seen and to keep the output clean.
        print(USAGE, file=sys.stderr)
        return 1

    aligned_path = argv[1]
    phoneme_col = int(argv[2]) if len(argv) > 2 else 1

    print("Loading lexicon...", file=sys.stderr)
    lexicon = load_lexicon(sys.stdin, phoneme_col)

    print("Loading alignments...", file=sys.stderr)
    # Opened with the default encoding, the same way sys.stdin is decoded, so
    # that words from both inputs compare equal.
    with open(aligned_path, "r") as aligned_file:
        pron_counts = count_pronunciations(aligned_file, lexicon)

    print("Re-ordering lexicon...", file=sys.stderr)
    for line in reorder_lexicon(lexicon, pron_counts):
        print(line)

    print("Done", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)