Skip to content
This repository was archived by the owner on Jan 27, 2023. It is now read-only.

Commit c30826a

Browse files
authored
Merge pull request #27 from cipherstash/fm/fix-ngram-tokenization
Fix ngram tokenization
2 parents e5d5a3e + a48288e commit c30826a

File tree

5 files changed

+81
-9
lines changed

lib/cipherstash/analysis/text_processor.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class TextProcessor
1616
# Processor.new({
1717
# "tokenFilters"=>[
1818
# {"kind"=>"downcase"},
19-
# {"kind"=>"ngram", "tokenLength"=>3}
19+
# {"kind"=>"ngram", "minLength"=>3, "maxLength"=>8}
2020
# ],
2121
# "tokenizer"=>{"kind"=>"standard"}
2222
# })

lib/cipherstash/analysis/token_filters.rb

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,24 @@ def perform(str_or_array)
1515

1616
class NGram < Base
1717
def perform(str_or_array)
18-
token_length = @opts["tokenLength"] || 3
18+
min_length = @opts["minLength"] || 3
19+
max_length = @opts["maxLength"] || 8
20+
1921
Array(str_or_array).flat_map do |token|
20-
[].tap do |out|
21-
(token.length - token_length + 1).times do |i|
22-
out << token[i, token_length]
22+
token_length = token.length
23+
24+
ngrams = [].tap do |out|
25+
(min_length..max_length).each do |n|
26+
ngram = token.chars.each_cons(n).map(&:join)
27+
out << ngram
28+
end
29+
30+
if token_length > max_length
31+
out << token
2332
end
2433
end
34+
35+
ngrams.flatten
2536
end
2637
end
2738
end
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
require 'cipherstash/analysis/text_processor'
2+
require "cipherstash/client"
3+
4+
RSpec.describe CipherStash::Analysis::TextProcessor do
5+
describe "Standard text processor" do
6+
it "splits text based on word boundaries" do
7+
tokenizer =
8+
CipherStash::Analysis::TextProcessor.new({
9+
"tokenFilters" => [
10+
{ "kind" => "downcase" }
11+
],
12+
"tokenizer" => { "kind" => "standard" }
13+
})
14+
result = tokenizer.perform("This is an example of a standard tokenizer")
15+
expect(result.length).to eq(8)
16+
expect(result).to eq(["this", "is", "an", "example", "of", "a", "standard", "tokenizer"])
17+
end
18+
end
19+
20+
describe "Standard text processor with an ngram filter" do
21+
it "splits text into ngrams using min length of 3 and max length of 8" do
22+
tokenizer =
23+
CipherStash::Analysis::TextProcessor.new({
24+
"tokenFilters" => [
25+
{ "kind" => "downcase" },
26+
{ "kind" => "ngram", "minLength" => 3, "maxLength" => 8 }
27+
],
28+
"tokenizer" => { "kind" => "standard" }
29+
})
30+
result = tokenizer.perform("Example filter")
31+
32+
expect(result).to eq([
33+
"exa",
34+
"xam",
35+
"amp",
36+
"mpl",
37+
"ple",
38+
"exam",
39+
"xamp",
40+
"ampl",
41+
"mple",
42+
"examp",
43+
"xampl",
44+
"ample",
45+
"exampl",
46+
"xample",
47+
"example",
48+
"fil",
49+
"ilt",
50+
"lte",
51+
"ter",
52+
"filt",
53+
"ilte",
54+
"lter",
55+
"filte",
56+
"ilter",
57+
"filter"
58+
])
59+
end
60+
end
61+
end

spec/cipherstash/client_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def data(profile)
126126
def schema(kind)
127127
mapping = {
128128
"kind" => kind,
129-
"tokenFilters" => [{"kind" => "downcase"}, {"kind" => "ngram", "tokenLength" => 3}],
129+
"tokenFilters" => [{"kind" => "downcase"}, {"kind" => "ngram", "minLength" => 3, "maxLength" => 8}],
130130
"tokenizer" => {"kind" => "standard"}
131131
}
132132

@@ -233,4 +233,4 @@ def schema(kind)
233233
end
234234
end
235235
end
236-
end
236+
end

spec/cipherstash/index_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def match_settings(id, kind)
1616
"mapping" => {
1717
"kind" => kind,
1818
"fields" => ["title"],
19-
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "tokenLength"=>3}],
19+
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "minLength" => 3, "maxLength" => 8}],
2020
"tokenizer" => {"kind"=>"standard"},
2121
"fieldType" => "string",
2222
}
@@ -33,7 +33,7 @@ def dynamic_match_settings(id, kind)
3333
},
3434
"mapping" => {
3535
"kind" => kind,
36-
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "tokenLength"=>3}],
36+
"tokenFilters" => [{"kind"=>"downcase"}, {"kind"=>"ngram", "minLength" => 3, "maxLength" => 8}],
3737
"tokenizer" => {"kind"=>"standard"},
3838
"fieldType" => "string",
3939
}

0 commit comments

Comments (0)