|
#!/usr/bin/env python3
"""Re-order each word's lexicon entries by how often they occur in alignments.

Usage:
    reorder_lexicon.py aligned_phones.jsonl [phoneme_col=1] < lexicon.txt > reordered_lexicon.txt

The lexicon is read from stdin, one entry per line, as whitespace-separated
tokens: the first token is the word and the tokens from ``phoneme_col``
onward are its phonemes.

The alignment file holds one JSON object per line. Each object has a
``"prons"`` list whose items carry a ``"word"`` and a ``"phones"`` list.

Entries are written to stdout grouped by word (words in sorted order); within
a word, the pronunciations seen most often in the alignments come first.
Progress messages go to stderr so they never mix with the lexicon output.
"""
import json
import sys
from collections import Counter, defaultdict

SKIP_WORDS = {"<eps>", "<unk>"}

USAGE = (
    "Usage: reorder_lexicon.py aligned_phones.jsonl [phoneme_col=1] "
    "< lexicon.txt > reordered_lexicon.txt"
)


def load_lexicon(lines, phoneme_col=1):
    """Parse lexicon lines into ``{word: {phoneme_tuple: stripped_line}}``.

    Blank lines are skipped. Words keep the order in which they first appear,
    and so do the pronunciations of each word. If the same word/pronunciation
    pair appears on several lines, only the last such line is kept.

    Args:
        lines: Iterable of lexicon lines (for example an open text file).
        phoneme_col: Index of the first phoneme token on each line.

    Raises:
        ValueError: If a non-blank line has fewer than two tokens.
    """
    lexicon = {}
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        parts = line.split()
        # Explicit check instead of `assert`: asserts vanish under `python -O`.
        if len(parts) < 2:
            raise ValueError("Malformed lexicon line: %r" % (line,))

        phonemes = tuple(parts[phoneme_col:])
        lexicon.setdefault(parts[0], {})[phonemes] = line

    return lexicon


def count_pronunciations(alignment_lines, lexicon):
    """Count how often each pronunciation of each lexicon word was aligned.

    Words listed in ``SKIP_WORDS`` and words that are not in ``lexicon`` are
    ignored. Blank lines are skipped.

    Args:
        alignment_lines: Iterable of JSON lines, each an object with a
            ``"prons"`` list of ``{"word": ..., "phones": [...]}`` items.
        lexicon: Mapping returned by :func:`load_lexicon`; only its keys are
            consulted.

    Returns:
        A ``defaultdict`` mapping word -> ``Counter`` of phoneme tuples.
    """
    pron_counts = defaultdict(Counter)
    for raw_line in alignment_lines:
        line = raw_line.strip()
        if not line:
            continue

        alignment = json.loads(line)
        for word_pron in alignment["prons"]:
            word = word_pron["word"]
            if word in SKIP_WORDS or word not in lexicon:
                continue

            pron_counts[word][tuple(word_pron["phones"])] += 1

    return pron_counts


def reorder_lexicon(lexicon, pron_counts):
    """Yield lexicon lines, most frequently aligned pronunciation first.

    Words are emitted in sorted order. Pronunciations with equal counts
    (including those never seen in the alignments, which count as 0) keep
    their original lexicon order, because Python's sort is stable even with
    ``reverse=True``.
    """
    for word in sorted(lexicon):
        # `.get` avoids inserting empty counters into a defaultdict.
        counts = pron_counts.get(word, {})
        ranked = sorted(
            lexicon[word].items(),
            key=lambda item: counts.get(item[0], 0),
            reverse=True,
        )
        for _, line in ranked:
            yield line


def main(argv=None):
    """Run the command-line tool and return its exit status.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 if the alignment file argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # stdout is reserved for the reordered lexicon, so usage goes to stderr.
        print(USAGE, file=sys.stderr)
        return 1

    aligned_path = argv[1]
    phoneme_col = int(argv[2]) if len(argv) > 2 else 1

    print("Loading lexicon...", file=sys.stderr)
    lexicon = load_lexicon(sys.stdin, phoneme_col)

    print("Loading alignments...", file=sys.stderr)
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(aligned_path, "r", encoding="utf-8") as aligned_file:
        pron_counts = count_pronunciations(aligned_file, lexicon)

    print("Re-ordering lexicon...", file=sys.stderr)
    for line in reorder_lexicon(lexicon, pron_counts):
        print(line)

    print("Done", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
0 commit comments