From ffa706c7d4957a39ddb4ed90a53a1f939aecc1af Mon Sep 17 00:00:00 2001 From: Mohit Tare Date: Wed, 14 Oct 2020 17:32:21 +0530 Subject: [PATCH 1/5] acronym and number filtering --- SymSpellCppPy.cpp | 5 +++-- include/Helpers.h | 15 +++++++++++++++ library.cpp | 29 +++++++++++++++++++++++++---- library.h | 4 +++- tests/CatchMain.cpp | 12 +++++++++++- tests/SymSpellCppPyTest.py | 2 ++ 6 files changed, 59 insertions(+), 8 deletions(-) diff --git a/SymSpellCppPy.cpp b/SymSpellCppPy.cpp index 6ae3e18..39ea245 100644 --- a/SymSpellCppPy.cpp +++ b/SymSpellCppPy.cpp @@ -163,7 +163,7 @@ PYBIND11_MODULE(SymSpellCppPy, m) { " 3. multiple independent input terms with/without spelling errors", py::arg("input"), py::arg("max_edit_distance")) - .def("lookup_compound", py::overload_cast( + .def("lookup_compound", py::overload_cast( &symspellcpppy::SymSpell::LookupCompound), " LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:\n" " 1. mistakenly inserted space into a correct word led to two incorrect terms \n" @@ -171,7 +171,8 @@ PYBIND11_MODULE(SymSpellCppPy, m) { " 3. multiple independent input terms with/without spelling errors", py::arg("input"), py::arg("max_edit_distance"), - py::arg("transfer_casing")) + py::arg("transfer_casing"), + py::arg("ignore_non_words")) .def("word_segmentation", py::overload_cast( &symspellcpppy::SymSpell::WordSegmentation), " WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions\n" diff --git a/include/Helpers.h b/include/Helpers.h index 552dd98..75e1e44 100644 --- a/include/Helpers.h +++ b/include/Helpers.h @@ -181,6 +181,21 @@ class Helpers { return response_string; } + + static bool is_acronym(xstring word, bool match_any_term_with_digit){ + if(match_any_term_with_digit == true){ + for(char i:word){ + if(std::isdigit(i)){ + return true; + } + } + } + std::regex accr_regex("[A-Z0-9]{3,}"); + if(std::regex_match(word,accr_regex)){ + return true; + } + return false; + } }; template diff --git a/library.cpp b/library.cpp index 5e751a3..2620933 100644 --- a/library.cpp +++ b/library.cpp @@ -468,6 +468,18 @@ namespace symspellcpppy { return matches; } + std::vector SymSpell::ParseWordsPreserveCasing(const xstring &text) { + xregex r(XL("['’\\w-\\[_\\]]+")); + xsmatch m; + std::vector matches; + xstring::const_iterator ptr(text.cbegin()); + while (regex_search(ptr, text.cend(), m, r)) { + matches.push_back(m[0]); + ptr = m.suffix().first; + } + return matches; + } + std::shared_ptr> SymSpell::Edits(const xstring &word, int editDistance, std::shared_ptr> deleteWords) { editDistance++; @@ -511,15 +523,15 @@ namespace symspellcpppy { } std::vector SymSpell::LookupCompound(const xstring &input) { - return LookupCompound(input, maxDictionaryEditDistance, false); + return LookupCompound(input, maxDictionaryEditDistance, false,true); } std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax) { - return LookupCompound(input, editDistanceMax, false); + return LookupCompound(input, editDistanceMax, false,true); } - std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing) { - std::vector termList1 = ParseWords(input); + std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax,bool transferCasing,bool ignore_non_words) { + std::vector termList1 = ParseWordsPreserveCasing(input); std::vector suggestions; //suggestions for a single term std::vector suggestionParts; //1 line with separate parts @@ -527,6 +539,15 @@ namespace symspellcpppy { bool lastCombi = false; for (int i = 0; i < termList1.size(); i++) { + if(ignore_non_words == true){ + if(Helpers::is_acronym(termList1[i],true)){ + SuggestItem temp = SuggestItem(termList1[i],0,0); + suggestionParts.push_back(temp); + continue; + } + + } + suggestions = Lookup(termList1[i], Top, editDistanceMax); if ((i > 0) && !lastCombi) { diff --git a/library.h b/library.h index 910e677..49439ca 100644 --- a/library.h +++ b/library.h @@ -229,6 +229,8 @@ namespace symspellcpppy { static std::vector ParseWords(const xstring &text); + static std::vector ParseWordsPreserveCasing(const xstring &text); + std::shared_ptr> Edits(const xstring &word, int editDistance, std::shared_ptr> deleteWords); @@ -259,7 +261,7 @@ namespace symspellcpppy { /// The string being spell checked. /// The maximum edit distance between input and suggested words. /// A List of SuggestItem object representing suggested correct spellings for the input string. - std::vector LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing); + std::vector LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing, bool ignore_non_words); //###### diff --git a/tests/CatchMain.cpp b/tests/CatchMain.cpp index 7c6e3e1..43be6c4 100644 --- a/tests/CatchMain.cpp +++ b/tests/CatchMain.cpp @@ -172,7 +172,7 @@ TEST_CASE("Testing English", "[english]") { auto results = symSpell.LookupCompound(typo, 2); REQUIRE(results[0].term == correction); } - + SECTION("Lookup transfer casing") { SymSpell symSpell(maxEditDistance, prefixLength); symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); @@ -181,4 +181,14 @@ TEST_CASE("Testing English", "[english]") { auto results = symSpell.Lookup(typo, Verbosity::Top, 2, false, true); REQUIRE(results[0].term == correction); } + + SECTION("Lookup compound accronyms and numbers") { + SymSpell symSpell(maxEditDistance, prefixLength); + symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); + xstring typo = "whera is the PNR9 locaited"; + xstring correction = "where is the PNR9 located"; + auto results = symSpell.LookupCompound(typo); + REQUIRE(results[0].term == correction); + } + } \ No newline at end of file diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index ba72cc7..7205e63 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -402,6 +402,8 @@ def test_lookup_compound_only_combi(self): self.assertEqual(1, len(results)) self.assertEqual(correction, results[0].term) + def test_lookup_compound_numbers_accr(self) + def test_lookup_compound_no_suggestion(self): edit_distance_max = 2 prefix_length = 7 From 49e227197bec271f3c7c3775d7326ab56d635eb9 Mon Sep 17 00:00:00 2001 From: Mohit Tare Date: Wed, 14 Oct 2020 19:31:15 +0530 Subject: [PATCH 2/5] testcases and parse word changes --- library.cpp | 27 ++++++++++----------------- library.h | 4 +--- tests/CatchMain.cpp | 9 +++++++++ tests/SymSpellCppPyTest.py | 11 +++++++---- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/library.cpp b/library.cpp index 2620933..261def8 100644 --- a/library.cpp +++ b/library.cpp @@ -252,7 +252,7 @@ namespace symspellcpppy { xstring line; auto staging = std::make_shared(16384); while (getline(corpusStream, line)) { - for (const xstring &key : ParseWords(line)) { + for (const xstring &key : ParseWords(line,true)) { CreateDictionaryEntry(key, 1, staging); } @@ -455,26 +455,19 @@ namespace symspellcpppy { return true; } - std::vector SymSpell::ParseWords(const xstring &text) { + std::vector SymSpell::ParseWords(const xstring &text,bool lower_casing=true) { xregex r(XL("['’\\w-\\[_\\]]+")); xsmatch m; std::vector matches; xstring::const_iterator ptr(text.cbegin()); while (regex_search(ptr, text.cend(), m, r)) { - xstring matchLower = Helpers::string_lower(m[0]); - matches.push_back(matchLower); - ptr = m.suffix().first; - } - return matches; - } - - std::vector SymSpell::ParseWordsPreserveCasing(const xstring &text) { - xregex r(XL("['’\\w-\\[_\\]]+")); - xsmatch m; - std::vector matches; - xstring::const_iterator ptr(text.cbegin()); - while (regex_search(ptr, text.cend(), m, r)) { - matches.push_back(m[0]); + if(lower_casing){ + xstring matchLower = Helpers::string_lower(m[0]); + matches.push_back(matchLower); + } + else{ + matches.push_back(m[0]); + } ptr = m.suffix().first; } return matches; @@ -531,7 +524,7 @@ namespace symspellcpppy { } std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax,bool transferCasing,bool ignore_non_words) { - std::vector termList1 = ParseWordsPreserveCasing(input); + std::vector termList1 = ParseWords(input,false); std::vector suggestions; //suggestions for a single term std::vector suggestionParts; //1 line with separate parts diff --git a/library.h b/library.h index 49439ca..2005486 100644 --- a/library.h +++ b/library.h @@ -227,9 +227,7 @@ namespace symspellcpppy { bool DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen) const; - static std::vector ParseWords(const xstring &text); - - static std::vector ParseWordsPreserveCasing(const xstring &text); + static std::vector ParseWords(const xstring &text, bool lower_casing); std::shared_ptr> Edits(const xstring &word, int editDistance, std::shared_ptr> deleteWords); diff --git a/tests/CatchMain.cpp b/tests/CatchMain.cpp index 43be6c4..cbd48b1 100644 --- a/tests/CatchMain.cpp +++ b/tests/CatchMain.cpp @@ -190,5 +190,14 @@ TEST_CASE("Testing English", "[english]") { auto results = symSpell.LookupCompound(typo); REQUIRE(results[0].term == correction); } + + SECTION("Lookup compound with just numbers") { + SymSpell symSpell(maxEditDistance, prefixLength); + symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); + xstring typo = "whera is the 999 locaited"; + xstring correction = "where is the 999 located"; + auto results = symSpell.LookupCompound(typo); + REQUIRE(results[0].term == correction); + } } \ No newline at end of file diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index 7205e63..069f679 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -401,9 +401,7 @@ def test_lookup_compound_only_combi(self): results = sym_spell.lookup_compound(typo, edit_distance_max) self.assertEqual(1, len(results)) self.assertEqual(correction, results[0].term) - - def test_lookup_compound_numbers_accr(self) - + def test_lookup_compound_no_suggestion(self): edit_distance_max = 2 prefix_length = 7 @@ -720,7 +718,12 @@ def test_lookup_transfer_casing(self): result = sym_spell.lookup("I", Verbosity.TOP, 2, transfer_casing=True) self.assertEqual("I", result[0].term) - + + def test_lookup_compund_acr(self): + symSpell = SymSpell() + symSpell.load_dictionary("resources/frequency_dictionary_en_82_765.txt", 0, 1, " ") + res = symSpell.lookup_compound("Wht is yur PNR numbir") + self.assertEqual("What is your PNR number",res[0].term) if __name__ == '__main__': unittest.main() From a13a60778350388896a759439443c6c844717750 Mon Sep 17 00:00:00 2001 From: Mohit Tare Date: Wed, 14 Oct 2020 19:39:31 +0530 Subject: [PATCH 3/5] changes for new LookupCompound --- tests/SymSpellCppPyTest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index 069f679..9aee195 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -630,7 +630,7 @@ def test_lookup_compound_transfer_casing_no_bigram(self): "who couldn't read in sixth grade AND inspired him") results = sym_spell.lookup_compound(typo, edit_distance_max, - transfer_casing=True) + transfer_casing=True,True) self.assertEqual(correction, results[0].term) # TODO: test_create_dictionary_entry_below_threshold @@ -722,7 +722,7 @@ def test_lookup_transfer_casing(self): def test_lookup_compund_acr(self): symSpell = SymSpell() symSpell.load_dictionary("resources/frequency_dictionary_en_82_765.txt", 0, 1, " ") - res = symSpell.lookup_compound("Wht is yur PNR numbir") + res = symSpell.lookup_compound("Whate is yur PNR numbir") self.assertEqual("What is your PNR number",res[0].term) if __name__ == '__main__': From 9aae24c38d1263c2957653f73720261a95ad31e6 Mon Sep 17 00:00:00 2001 From: Mohit Tare Date: Wed, 14 Oct 2020 19:43:08 +0530 Subject: [PATCH 4/5] changes for new LookupCompound --- tests/SymSpellCppPyTest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index 9aee195..5882b52 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -630,7 +630,7 @@ def test_lookup_compound_transfer_casing_no_bigram(self): "who couldn't read in sixth grade AND inspired him") results = sym_spell.lookup_compound(typo, edit_distance_max, - transfer_casing=True,True) + True,True) self.assertEqual(correction, results[0].term) # TODO: test_create_dictionary_entry_below_threshold From 93e81e511819c4789350fd190786d809f1291863 Mon Sep 17 00:00:00 2001 From: Mohit Tare Date: Wed, 14 Oct 2020 19:52:38 +0530 Subject: [PATCH 5/5] fix for lookup compund test case --- tests/SymSpellCppPyTest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index 5882b52..e563837 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -615,7 +615,7 @@ def test_lookup_compound_transfer_casing(self): "who couldn't read in sixth grade AND inspired him") results = sym_spell.lookup_compound(typo, edit_distance_max, - transfer_casing=True) + transfer_casing=True,ignore_non_words=True) self.assertEqual(correction, results[0].term) def test_lookup_compound_transfer_casing_no_bigram(self):