Skip to content

Commit 404cc01

Browse files
authored
Fixes cutting lines
1 parent 5da9934 commit 404cc01

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

bin/convert_all.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ def gen_mdict_target(filepath, filebase, output_folder, dataName, dataDescriptio
168168
)
169169

170170
return execute_shell(cmd_line=cmd_line, message=f"generating MDict MDX")
171-
172171
import re
173172

174173
def split_long_text_file(
@@ -177,18 +176,46 @@ def split_long_text_file(
177176
max_len=800
178177
):
179178
item_pattern = re.compile(r'(?=\b\d+\. )')
179+
ascii_word = re.compile(r'[A-Za-z0-9_]')
180+
zh_punct = '。;,'
180181
output_lines = []
181182

182183
def split_by_length(text):
    """Split *text* into chunks of roughly ``max_len`` characters.

    ``max_len``, ``zh_punct`` and ``ascii_word`` are taken from the
    enclosing scope. For each chunk the cut point is chosen in this order:

    1. the last space inside the first ``max_len`` characters;
    2. just after the last ``zh_punct`` character inside that window;
    3. a hard cut at ``max_len``; if that would land between two ASCII
       word characters, the cut is pushed forward to the next whitespace
       found within 100 characters, so such a chunk may exceed
       ``max_len``.

    Chunks are stripped of surrounding whitespace at the cut points and
    empty chunks are never emitted. The final remainder is appended
    unchanged.

    Returns:
        A list of chunk strings, in order.

    Raises:
        ValueError: if ``max_len`` is smaller than 1 (the loop could not
            make progress).
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    chunks = []

    while len(text) > max_len:
        # 1. Prefer the last space inside the window (English text).
        cut = text.rfind(' ', 0, max_len)

        # 2. Otherwise use the right-most Chinese punctuation mark.
        #    Raw indices are compared with each other (not with an
        #    already shifted cut), so adjacent marks are handled correctly.
        if cut == -1:
            last_punct = max(
                (text.rfind(p, 0, max_len) for p in zh_punct),
                default=-1,
            )
            if last_punct != -1:
                cut = last_punct + 1  # keep the punctuation with its chunk

        # 3. Fallback: hard cut at max_len.
        if cut == -1:
            cut = max_len
            # Avoid breaking ASCII words / tags: text[cut] always exists
            # here because len(text) > max_len.
            if ascii_word.match(text[cut - 1]) and ascii_word.match(text[cut]):
                forward = re.search(r'\s', text[cut:cut + 100])
                if forward:
                    cut += forward.start()

        head = text[:cut].rstrip()
        if head:  # a cut at index 0 or an all-whitespace head yields nothing
            chunks.append(head)
        text = text[cut:].lstrip()

    if text:
        chunks.append(text)

    return chunks
193220

194221
with open(input_path, 'r', encoding='utf-8') as f:

0 commit comments

Comments
 (0)