diff --git a/SymSpellCppPy.cpp b/SymSpellCppPy.cpp index 6ae3e18..39ea245 100644 --- a/SymSpellCppPy.cpp +++ b/SymSpellCppPy.cpp @@ -163,7 +163,7 @@ PYBIND11_MODULE(SymSpellCppPy, m) { " 3. multiple independent input terms with/without spelling errors", py::arg("input"), py::arg("max_edit_distance")) - .def("lookup_compound", py::overload_cast( + .def("lookup_compound", py::overload_cast( &symspellcpppy::SymSpell::LookupCompound), " LookupCompound supports compound aware automatic spelling correction of multi-word input strings with three cases:\n" " 1. mistakenly inserted space into a correct word led to two incorrect terms \n" @@ -171,7 +171,8 @@ PYBIND11_MODULE(SymSpellCppPy, m) { " 3. multiple independent input terms with/without spelling errors", py::arg("input"), py::arg("max_edit_distance"), - py::arg("transfer_casing")) + py::arg("transfer_casing"), + py::arg("ignore_non_words")) .def("word_segmentation", py::overload_cast( &symspellcpppy::SymSpell::WordSegmentation), " WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions\n" diff --git a/include/Helpers.h b/include/Helpers.h index 552dd98..75e1e44 100644 --- a/include/Helpers.h +++ b/include/Helpers.h @@ -181,6 +181,21 @@ class Helpers { return response_string; } + + static bool is_acronym(xstring word, bool match_any_term_with_digit){ + if(match_any_term_with_digit == true){ + for(char i:word){ + if(std::isdigit(i)){ + return true; + } + } + } + std::regex accr_regex("[A-Z0-9]{3,}"); + if(std::regex_match(word,accr_regex)){ + return true; + } + return false; + } }; template diff --git a/library.cpp b/library.cpp index 5e751a3..261def8 100644 --- a/library.cpp +++ b/library.cpp @@ -252,7 +252,7 @@ namespace symspellcpppy { xstring line; auto staging = std::make_shared(16384); while (getline(corpusStream, line)) { - for (const xstring &key : ParseWords(line)) { + for (const xstring &key : ParseWords(line,true)) { CreateDictionaryEntry(key, 1, staging); } @@ -455,14 +455,19 @@ namespace symspellcpppy { return true; } - std::vector SymSpell::ParseWords(const xstring &text) { + std::vector SymSpell::ParseWords(const xstring &text,bool lower_casing=true) { xregex r(XL("['’\\w-\\[_\\]]+")); xsmatch m; std::vector matches; xstring::const_iterator ptr(text.cbegin()); while (regex_search(ptr, text.cend(), m, r)) { - xstring matchLower = Helpers::string_lower(m[0]); - matches.push_back(matchLower); + if(lower_casing){ + xstring matchLower = Helpers::string_lower(m[0]); + matches.push_back(matchLower); + } + else{ + matches.push_back(m[0]); + } ptr = m.suffix().first; } return matches; @@ -511,15 +516,15 @@ namespace symspellcpppy { } std::vector SymSpell::LookupCompound(const xstring &input) { - return LookupCompound(input, maxDictionaryEditDistance, false); + return LookupCompound(input, maxDictionaryEditDistance, false,true); } std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax) { - return LookupCompound(input, editDistanceMax, false); + return LookupCompound(input, editDistanceMax, false,true); } - std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing) { - std::vector termList1 = ParseWords(input); + std::vector SymSpell::LookupCompound(const xstring &input, int editDistanceMax,bool transferCasing,bool ignore_non_words) { + std::vector termList1 = ParseWords(input,false); std::vector suggestions; //suggestions for a single term std::vector suggestionParts; //1 line with separate parts @@ -527,6 +532,15 @@ namespace symspellcpppy { bool lastCombi = false; for (int i = 0; i < termList1.size(); i++) { + if(ignore_non_words == true){ + if(Helpers::is_acronym(termList1[i],true)){ + SuggestItem temp = SuggestItem(termList1[i],0,0); + suggestionParts.push_back(temp); + continue; + } + + } + suggestions = Lookup(termList1[i], Top, editDistanceMax); if ((i > 0) && !lastCombi) { diff --git a/library.h b/library.h index 910e677..2005486 100644 --- a/library.h +++ b/library.h @@ -227,7 +227,7 @@ namespace symspellcpppy { bool DeleteInSuggestionPrefix(xstring deleteSugg, int deleteLen, xstring suggestion, int suggestionLen) const; - static std::vector ParseWords(const xstring &text); + static std::vector ParseWords(const xstring &text, bool lower_casing); std::shared_ptr> Edits(const xstring &word, int editDistance, std::shared_ptr> deleteWords); @@ -259,7 +259,7 @@ namespace symspellcpppy { /// The string being spell checked. /// The maximum edit distance between input and suggested words. /// A List of SuggestItem object representing suggested correct spellings for the input string. - std::vector LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing); + std::vector LookupCompound(const xstring &input, int editDistanceMax, bool transferCasing, bool ignore_non_words); //###### diff --git a/tests/CatchMain.cpp b/tests/CatchMain.cpp index 7c6e3e1..cbd48b1 100644 --- a/tests/CatchMain.cpp +++ b/tests/CatchMain.cpp @@ -172,7 +172,7 @@ TEST_CASE("Testing English", "[english]") { auto results = symSpell.LookupCompound(typo, 2); REQUIRE(results[0].term == correction); } - + SECTION("Lookup transfer casing") { SymSpell symSpell(maxEditDistance, prefixLength); symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); @@ -181,4 +181,23 @@ TEST_CASE("Testing English", "[english]") { auto results = symSpell.Lookup(typo, Verbosity::Top, 2, false, true); REQUIRE(results[0].term == correction); } + + SECTION("Lookup compound accronyms and numbers") { + SymSpell symSpell(maxEditDistance, prefixLength); + symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); + xstring typo = "whera is the PNR9 locaited"; + xstring correction = "where is the PNR9 located"; + auto results = symSpell.LookupCompound(typo); + REQUIRE(results[0].term == correction); + } + + SECTION("Lookup compound with just numbers") { + SymSpell symSpell(maxEditDistance, prefixLength); + symSpell.LoadDictionary("../resources/frequency_dictionary_en_82_765.txt", 0, 1, XL(' ')); + xstring typo = "whera is the 999 locaited"; + xstring correction = "where is the 999 located"; + auto results = symSpell.LookupCompound(typo); + REQUIRE(results[0].term == correction); + } + } \ No newline at end of file diff --git a/tests/SymSpellCppPyTest.py b/tests/SymSpellCppPyTest.py index ba72cc7..e563837 100644 --- a/tests/SymSpellCppPyTest.py +++ b/tests/SymSpellCppPyTest.py @@ -401,7 +401,7 @@ def test_lookup_compound_only_combi(self): results = sym_spell.lookup_compound(typo, edit_distance_max) self.assertEqual(1, len(results)) self.assertEqual(correction, results[0].term) - + def test_lookup_compound_no_suggestion(self): edit_distance_max = 2 prefix_length = 7 @@ -615,7 +615,7 @@ def test_lookup_compound_transfer_casing(self): "who couldn't read in sixth grade AND inspired him") results = sym_spell.lookup_compound(typo, edit_distance_max, - transfer_casing=True) + transfer_casing=True,ignore_non_words=True) self.assertEqual(correction, results[0].term) def test_lookup_compound_transfer_casing_no_bigram(self): @@ -630,7 +630,7 @@ def test_lookup_compound_transfer_casing_no_bigram(self): "who couldn't read in sixth grade AND inspired him") results = sym_spell.lookup_compound(typo, edit_distance_max, - transfer_casing=True) + True,True) self.assertEqual(correction, results[0].term) # TODO: test_create_dictionary_entry_below_threshold @@ -718,7 +718,12 @@ def test_lookup_transfer_casing(self): result = sym_spell.lookup("I", Verbosity.TOP, 2, transfer_casing=True) self.assertEqual("I", result[0].term) - + + def test_lookup_compund_acr(self): + symSpell = SymSpell() + symSpell.load_dictionary("resources/frequency_dictionary_en_82_765.txt", 0, 1, " ") + res = symSpell.lookup_compound("Whate is yur PNR numbir") + self.assertEqual("What is your PNR number",res[0].term) if __name__ == '__main__': unittest.main()