diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py index 4583216..bd478aa 100644 --- a/cutlet/cutlet.py +++ b/cutlet/cutlet.py @@ -248,6 +248,13 @@ def romaji_tokens(self, words, capitalize=True, title=False): out.append(tok) continue + # preserve spaces between ascii tokens + if (word.surface.isascii() and + nw and nw.surface.isascii()): + use_space = bool(nw.white_space) + out.append(Token(word.surface, use_space)) + continue + out.append(tok) # no space sometimes diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py index 35766bd..4fa8287 100644 --- a/cutlet/test/test_basic.py +++ b/cutlet/test/test_basic.py @@ -46,7 +46,7 @@ "Kouso to wa, seitai de okoru kagaku hannou ni taishite shokubai to shite kinou suru bunshi de aru.", ), ("ホッピーは元祖ビアテイスト清涼飲料水です", "Hoppy wa ganso beer taste seiryou inryousui desu"), - ("東京タワーの高さは333mです", "Tokyo tower no takasa wa 333 m desu"), + ("東京タワーの高さは333mです", "Tokyo tower no takasa wa 333m desu"), ( "国立国語研究所(NINJAL)は,日本語学・言語学・日本語教育研究を中心とした研究機関です。", "Kokuritsu kokugo kenkyuusho (NINJAL) wa, Nippon gogaku/gengogaku/Nippon go kyouiku kenkyuu wo chuushin to shita kenkyuu kikan desu.", @@ -56,8 +56,7 @@ ("私はテストです", "Watakushi wa test desu"), # issue #4, 私 -> 代名詞 ("《月》", "(gatsu)"), # issue #7, unfamiliar punctuation ("2 【電子版特典付】", "2 [denshi ban tokutentsuke]"), # issue #7 - # This looks weird but MeCab tokenizes at alpha-num barriers - ("cutlet23", "Cutlet 23"), + ("cutlet23", "Cutlet23"), # Test some kana unks - issue #8 ("アマガミ Sincerely Your S シンシアリーユアーズ", "Amagami Sincerely Your S shinshiariiyuaazu"), ("ケメコデラックス", "Kemekoderakkusu"), @@ -89,14 +88,12 @@ # don't add spaces around apostrophe if it wasn't there ("McDonald's", "McDonald's"), ("Text McDonald's text", "Text McDonald's text"), - # Following are quote weirdness. Not good but hard to fix. - # An issue is that ," or .' is a single token. - ("It's 'delicious.'", "It's ' delicious .'"), - ('"Hello," he said.', '" Hello ," he said.'), + ("It's 'delicious.'", "It's 'delicious.'"), + ('"Hello," he said.', '"Hello," he said.'), # this is a very strange typo ("アトランテッィク", "Atoranteku"), - # odoriji. Note at this point these rarely work properly, they mainly - # don't blow up. + # odoriji. Note at this point these rarely work properly, these mainly test + # that they don't blow up. ("くゞる", "Kuguru"), # note this is actually in unidic-lite ("くヽる", "Ku ru"), ("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word @@ -104,6 +101,9 @@ # prefixes, see #56 ("ビオハザード", "Bio-hazard"), ("イントラワード", "Intra-word"), + # ascii whitespace, see #65 + ("[04:30.748]", "[04:30.748]"), + (".big,bad bog", ".big,bad bog"), ] SENTENCES_KUNREI = [