Merge pull request #29 from cipherstash/fm/add-guards-for-min-max-length

fimac · web-flow · commit f90815fe27b0 · 2022-12-22T15:35:40.000+11:00
Add guards for min and max length
diff --git a/lib/cipherstash/analysis/text_processor.rb b/lib/cipherstash/analysis/text_processor.rb
@@ -46,8 +46,19 @@ def build_token_filters(array)
             TokenFilters::Downcase.new(obj)
 
           when "ngram"
-            TokenFilters::NGram.new(obj)
+            if obj["tokenLength"]
+              raise CipherStash::Client::Error::InvalidSchemaError, "'tokenLength' is deprecated. Use 'minLength' and 'maxLength' for the ngram filter."
+            end
+
+            unless obj["minLength"].instance_of?(Integer) && obj["maxLength"].instance_of?(Integer)
+              raise CipherStash::Client::Error::InvalidSchemaError, "The values provided to the min and max length must be of type Integer."
+            end
 
+            unless obj["maxLength"] >= obj["minLength"]
+                raise CipherStash::Client::Error::InvalidSchemaError, "The ngram filter min length must be less than or equal to the max length"
+            end
+
+            TokenFilters::NGram.new(obj)
           else
             raise "Unknown token filter: '#{obj['kind']}'"
           end
diff --git a/spec/cipherstash/analysis/text_processor_spec.rb b/spec/cipherstash/analysis/text_processor_spec.rb
@@ -18,6 +18,44 @@
   end
 
   describe "Standard text processor with an ngram filter" do
+    ["1", { foo: "bar" }, Object.new].each do |length|
+      it "raises an error if invalid length of #{length.inspect} provided" do
+        expect {
+          CipherStash::Analysis::TextProcessor.new({
+            "tokenFilters" => [
+              { "kind" => "downcase" },
+              { "kind" => "ngram", "minLength" => length, "maxLength" => length }
+              ],
+            "tokenizer" => { "kind" => "standard" }
+          })
+        }.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "The values provided to the min and max length must be of type Integer.")
+      end
+    end
+
+    it "raises an error if the min length is greater than the max length" do
+      expect {
+        CipherStash::Analysis::TextProcessor.new({
+          "tokenFilters" => [
+            { "kind" => "downcase" },
+            { "kind" => "ngram", "minLength" => 4, "maxLength" => 3 }
+          ],
+          "tokenizer" => { "kind" => "standard" }
+        })
+        }.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "The ngram filter min length must be less than or equal to the max length")
+    end
+
+    it "raises an error if tokenLength is provided" do
+      expect {
+        CipherStash::Analysis::TextProcessor.new({
+          "tokenFilters" => [
+            { "kind" => "downcase" },
+            { "kind" => "ngram", "tokenLength" => 3 }
+          ],
+          "tokenizer" => { "kind" => "standard" }
+        })
+      }.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "'tokenLength' is deprecated. Use 'minLength' and 'maxLength' for the ngram filter.")
+    end
+
     it "splits text into ngrams using min length of 3 and max length of 8" do
       tokenizer =
         CipherStash::Analysis::TextProcessor.new({