Skip to content
This repository was archived by the owner on Jan 27, 2023. It is now read-only.

Commit f90815f

Browse files
authored
Merge pull request #29 from cipherstash/fm/add-guards-for-min-max-length
Add guards for min and max length
2 parents c30826a + bae8aed commit f90815f

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

lib/cipherstash/analysis/text_processor.rb

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,19 @@ def build_token_filters(array)
4646
TokenFilters::Downcase.new(obj)
4747

4848
when "ngram"
49-
TokenFilters::NGram.new(obj)
49+
if obj["tokenLength"]
50+
raise CipherStash::Client::Error::InvalidSchemaError, "'tokenLength' is deprecated. Use 'minLength' and 'maxLength' for the ngram filter."
51+
end
52+
53+
unless obj["minLength"].instance_of?(Integer) && obj["maxLength"].instance_of?(Integer)
54+
raise CipherStash::Client::Error::InvalidSchemaError, "The values provided to the min and max length must be of type Integer."
55+
end
5056

57+
unless obj["maxLength"] >= obj["minLength"]
58+
raise CipherStash::Client::Error::InvalidSchemaError, "The ngram filter min length must be less than or equal to the max length"
59+
end
60+
61+
TokenFilters::NGram.new(obj)
5162
else
5263
raise "Unknown token filter: '#{obj['kind']}'"
5364
end

spec/cipherstash/analysis/text_processor_spec.rb

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,44 @@
1818
end
1919

2020
describe "Standard text processor with an ngram filter" do
21+
["1", { foo: "bar" }, Object.new].each do |length|
22+
it "raises an error if invalid length of #{length.inspect} provided" do
23+
expect {
24+
CipherStash::Analysis::TextProcessor.new({
25+
"tokenFilters" => [
26+
{ "kind" => "downcase" },
27+
{ "kind" => "ngram", "minLength" => length, "maxLength" => length }
28+
],
29+
"tokenizer" => { "kind" => "standard" }
30+
})
31+
}.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "The values provided to the min and max length must be of type Integer.")
32+
end
33+
end
34+
35+
it "raises an error if the min length is greater than the max length" do
36+
expect {
37+
CipherStash::Analysis::TextProcessor.new({
38+
"tokenFilters" => [
39+
{ "kind" => "downcase" },
40+
{ "kind" => "ngram", "minLength" => 4, "maxLength" => 3 }
41+
],
42+
"tokenizer" => { "kind" => "standard" }
43+
})
44+
}.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "The ngram filter min length must be less than or equal to the max length")
45+
end
46+
47+
it "raises an error if tokenLength is provided" do
48+
expect {
49+
CipherStash::Analysis::TextProcessor.new({
50+
"tokenFilters" => [
51+
{ "kind" => "downcase" },
52+
{ "kind" => "ngram", "tokenLength" => 3 }
53+
],
54+
"tokenizer" => { "kind" => "standard" }
55+
})
56+
}.to raise_error(CipherStash::Client::Error::InvalidSchemaError, "'tokenLength' is deprecated. Use 'minLength' and 'maxLength' for the ngram filter.")
57+
end
58+
2159
it "splits text into ngrams using min length of 3 and max length of 8" do
2260
tokenizer =
2361
CipherStash::Analysis::TextProcessor.new({

0 commit comments

Comments
 (0)