Skip to content
This repository was archived by the owner on Jan 27, 2023. It is now read-only.

Commit c30826a

Browse files
authored
Merge pull request #27 from cipherstash/fm/fix-ngram-tokenization
Fix ngram tokenization
2 parents e5d5a3e + a48288e commit c30826a

File tree

5 files changed

+81
-9
lines changed

lib/cipherstash/analysis/text_processor.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class TextProcessor
1616
# Processor.new({
1717
# "tokenFilters"=>[
1818
# {"kind"=>"downcase"},
19-
# {"kind"=>"ngram", "tokenLength"=>3}
19+
# {"kind"=>"ngram", "minLength"=>3, "maxLength"=>8}
2020
# ],
2121
# "tokenizer"=>{"kind"=>"standard"}
2222
# })

lib/cipherstash/analysis/token_filters.rb

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,24 @@ def perform(str_or_array)
1515

1616
class NGram < Base
1717
def perform(str_or_array)
18-
token_length = @opts["tokenLength"] || 3
18+
min_length = @opts["minLength"] || 3
19+
max_length = @opts["maxLength"] || 8
20+
1921
Array(str_or_array).flat_map do |token|
20-
[].tap do |out|
21-
(token.length - token_length + 1).times do |i|
22-
out << token[i, token_length]
22+
token_length = token.length
23+
24+
ngrams = [].tap do |out|
25+
(min_length..max_length).each do |n|
26+
ngram = token.chars.each_cons(n).map(&:join)
27+
out << ngram
28+
end
29+
30+
if token_length > max_length
31+
out << token
2332
end
2433
end
34+
35+
ngrams.flatten
2536
end
2637
end
2738
end
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
require 'cipherstash/analysis/text_processor'
2+
require "cipherstash/client"
3+
4+
RSpec.describe CipherStash::Analysis::TextProcessor do
5+
describe "Standard text processor" do
6+
it "splits text based on word boundaries" do
7+
tokenizer =
8+
CipherStash::Analysis::TextProcessor.new({
9+
"tokenFilters" => [
10+
{ "kind" => "downcase" }
11+
],
12+
"tokenizer" => { "kind" => "standard" }
13+
})
14+
result = tokenizer.perform("This is an example of a standard tokenizer")
15+
expect(result.length).to eq(8)
16+
expect(result).to eq(["this", "is", "an", "example", "of", "a", "standard", "tokenizer"])
17+
end
18+
end
19+
20+
describe "Standard text processor with an ngram filter" do
21+
it "splits text into ngrams using min length of 3 and max length of 8" do
22+
tokenizer =
23+
CipherStash::Analysis::TextProcessor.new({
24+
"tokenFilters" => [
25+
{ "kind" => "downcase" },
26+
{ "kind" => "ngram", "minLength" => 3, "maxLength" => 8 }
27+
],
28+
"tokenizer" => { "kind" => "standard" }
29+
})
30+
result = tokenizer.perform("Example filter")
31+
32+
expect(result).to eq([
33+
"exa",
34+
"xam",
35+
"amp",
36+
"mpl",
37+
"ple",
38+
"exam",
39+
"xamp",
40+
"ampl",
41+
"mple",
42+
"examp",
43+
"xampl",
44+
"ample",
45+
"exampl",
46+
"xample",
47+
"example",
48+
"fil",
49+
"ilt",
50+
"lte",
51+
"ter",
52+
"filt",
53+
"ilte",
54+
"lter",
55+
"filte",
56+
"ilter",
57+
"filter"
58+
])
59+
end
60+
end
61+
end

spec/cipherstash/client_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def data(profile)
126126
def schema(kind)
127127
mapping = {
128128
"kind" => kind,
129-
"tokenFilters" => [{"kind" => "downcase"}, {"kind" => "ngram", "tokenLength" => 3}],
129+
"tokenFilters" => [{"kind" => "downcase"}, {"kind" => "ngram", "minLength" => 3, "maxLength" => 8}],
130130
"tokenizer" => {"kind" => "standard"}
131131
}
132132

@@ -233,4 +233,4 @@ def schema(kind)
233233
end
234234
end
235235
end
236-
end
236+
end

spec/cipherstash/index_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def match_settings(id, kind)
1616
"mapping" => {
1717
"kind" => kind,
1818
"fields" => ["title"],
19-
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "tokenLength"=>3}],
19+
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "minLength" => 3, "maxLength" => 8}],
2020
"tokenizer" => {"kind"=>"standard"},
2121
"fieldType" => "string",
2222
}
@@ -33,7 +33,7 @@ def dynamic_match_settings(id, kind)
3333
},
3434
"mapping" => {
3535
"kind" => kind,
36-
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "tokenLength"=>3}],
36+
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "minLength" => 3, "maxLength" => 8}],
3737
"tokenizer" => {"kind"=>"standard"},
3838
"fieldType" => "string",
3939
}

0 commit comments

Comments (0)