Skip to content

Commit 6737824

Browse files
fix: used more reliable mongo queries
1 parent 96f92f5 commit 6737824

File tree

7 files changed

+148
-71
lines changed

7 files changed

+148
-71
lines changed

Gemfile.lock

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
GIT
22
remote: https://github.com/boost/codeclimate_diff.git
3-
revision: 8974e206e994dbd168a46de24faaabfea8503c8e
3+
revision: ee2cece9fb5baffb8d9a367c2ffa41e1266a3c32
44
specs:
5-
codeclimate_diff (0.1.13)
5+
codeclimate_diff (0.1.14)
66
colorize
77
json
88
optparse
@@ -164,7 +164,7 @@ GEM
164164
debug_inspector (>= 1.2.0)
165165
bson (5.0.0)
166166
builder (3.3.0)
167-
byebug (11.1.3)
167+
byebug (12.0.0)
168168
case_transform (0.2)
169169
activesupport
170170
childprocess (5.0.0)
@@ -221,7 +221,7 @@ GEM
221221
hash-deep-merge (0.1.1)
222222
htmlentities (4.3.4)
223223
http-accept (1.7.0)
224-
http-cookie (1.0.7)
224+
http-cookie (1.1.0)
225225
domain_name (~> 0.5)
226226
i18n (1.14.7)
227227
concurrent-ruby (~> 1.0)
@@ -230,7 +230,7 @@ GEM
230230
pp (>= 0.6.0)
231231
rdoc (>= 4.0.0)
232232
reline (>= 0.4.2)
233-
json (2.7.2)
233+
json (2.16.0)
234234
json-canonicalization (1.0.0)
235235
json-ld (3.3.1)
236236
htmlentities (~> 4.3)
@@ -278,10 +278,10 @@ GEM
278278
net-smtp
279279
marcel (1.0.4)
280280
method_source (1.1.0)
281-
mime-types (3.6.0)
281+
mime-types (3.7.0)
282282
logger
283-
mime-types-data (~> 3.2015)
284-
mime-types-data (3.2024.1001)
283+
mime-types-data (~> 3.2025, >= 3.2025.0507)
284+
mime-types-data (3.2025.0924)
285285
mini_mime (1.1.5)
286286
minitest (5.25.5)
287287
mongo (2.20.0)
@@ -312,7 +312,7 @@ GEM
312312
racc (~> 1.4)
313313
nokogiri (1.18.9-x86_64-linux-gnu)
314314
racc (~> 1.4)
315-
optparse (0.5.0)
315+
optparse (0.8.0)
316316
orm_adapter (0.5.0)
317317
parallel (1.25.1)
318318
parser (3.3.4.0)
@@ -323,12 +323,12 @@ GEM
323323
pr_geohash (1.0.0)
324324
prettyprint (0.2.0)
325325
progressbar (1.13.0)
326-
pry (0.14.2)
326+
pry (0.15.2)
327327
coderay (~> 1.1)
328328
method_source (~> 1.0)
329-
pry-byebug (3.10.1)
330-
byebug (~> 11.0)
331-
pry (>= 0.13, < 0.15)
329+
pry-byebug (3.11.0)
330+
byebug (~> 12.0)
331+
pry (>= 0.13, < 0.16)
332332
pry-rails (0.3.11)
333333
pry (>= 0.13.0)
334334
psych (5.2.6)

app/models/supplejack_api/collection_metric.rb

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ class CollectionMetric
66
include Mongoid::Document
77
include Mongoid::Timestamps
88
include SupplejackApi::Concerns::QueryableByDate
9+
include SupplejackApi::Concerns::MetricHelpers
910

1011
field :d, as: :date, type: Date, default: Time.now.utc
1112
field :dc, as: :display_collection, type: String
@@ -31,49 +32,56 @@ class CollectionMetric
3132
)
3233
end
3334

34-
# Rolls unprocessed RecordMetric rows up into per-collection CollectionMetric
# totals, one date at a time, then rebuilds the 'all' summary for that date.
#
# Does nothing unless SupplejackApi.config.log_metrics is exactly true.
#
# @param date_range [Range] dates to scan; defaults to the epoch up to the
#   start of yesterday (UTC)
def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginning_of_day))
  return unless SupplejackApi.config.log_metrics == true

  record_metrics_dates_between(date_range).each do |date|
    logger.info("COLLECTION METRIC: Processing date: #{date}")
    display_collections = SupplejackApi::RecordMetric
                          .where(date:, processed_by_collection_metrics: false)
                          .distinct(:display_collection)

    display_collections.each do |display_collection|
      logger.info("COLLECTION METRIC: Processing collection: #{display_collection}")
      record_metrics = record_metrics_to_be_processed(date, display_collection)

      if update_collection_metrics(record_metrics, date, display_collection)
        # Only stamp the rows once their totals have been saved.
        record_metrics.update_all(processed_by_collection_metrics: true)
      else
        # Interpolates display_collection: the block variable was renamed and
        # the old name `collection` is not defined here.
        logger.error "Unable to summarize record metrics from collection: #{display_collection} date: #{date}"
      end
    end
    regenerate_all_collection_metrics!(date)
  end
end
6457

58+
# Adds the summed counters of the given record metrics to the CollectionMetric
# for this date and display collection (created if it does not exist yet).
#
# @param record_metrics [Mongoid::Criteria] RecordMetric rows to summarise
# @param date [Date] day being summarised
# @param display_collection [String] collection the rows belong to
# @return [Boolean] result of saving the collection metric
def self.update_collection_metrics(record_metrics, date, display_collection)
  summary = find_or_create_by(date:, display_collection:)

  # CollectionMetric counter => RecordMetric field it is summed from.
  increments = {
    searches: :appeared_in_searches,
    record_page_views: :page_views,
    user_set_views: :user_set_views,
    user_story_views: :user_story_views,
    records_added_to_user_sets: :added_to_user_sets,
    records_added_to_user_stories: :added_to_user_stories,
    total_source_clickthroughs: :source_clickthroughs
  }.transform_values { |source_field| record_metrics.sum(source_field) }

  summary.inc(increments).save
end
end
71+
6572
# Builds the query for RecordMetric rows of one date and display collection
# that have not yet been counted into collection metrics.
#
# @param date [Date] day to select
# @param display_collection [String] collection to select
# @return [Mongoid::Criteria] matching, still-unprocessed record metrics
def self.record_metrics_to_be_processed(date, display_collection)
  logger.info("COLLECTION METRIC: Gathering records to be processed: #{date} #{display_collection}")

  criteria = { date:, display_collection:, processed_by_collection_metrics: false }
  SupplejackApi::RecordMetric.where(criteria)
end
7380

7481
def self.regenerate_all_collection_metrics!(date)
75-
Rails.logger.info("COLLECTION METRICS: Regenerate all collection metrics #{date}")
82+
logger.info("COLLECTION METRIC: Regenerate all collection metrics #{date}")
7683
delete_all(date:, display_collection: 'all')
84+
logger.info('COLLECTION METRIC: deleted_all')
7785
all_collections = new(date:, display_collection: 'all')
7886
where(date:, :display_collection.nin => ['all']).find_all do |collection|
7987
all_collections.inc(
@@ -86,6 +94,11 @@ def self.regenerate_all_collection_metrics!(date)
8694
total_source_clickthroughs: collection.total_source_clickthroughs
8795
).save!
8896
end
97+
logger.info('COLLECTION METRIC: saved')
98+
end
99+
100+
# Distinct RecordMetric dates in the range that still have rows not yet
# processed by collection metrics.
#
# @param date_range [Range] dates to scan
def self.record_metrics_dates_between(date_range)
  processed_flag = :processed_by_collection_metrics
  record_metrics_dates_between_for(processed_flag, date_range)
end
90103
end
91104
end
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# frozen_string_literal: true
2+
3+
module SupplejackApi
  # Query and logging helpers shared by the metric models.
  #
  # The helpers are meant to be called on the model class itself; they rely on
  # a `logger` being available on the receiver.
  #
  # NOTE(review): the models reference SupplejackApi::Concerns::MetricHelpers;
  # confirm that constant resolves to this module.
  module MetricHelpers
    # The models mix this module in with `include` but call the helpers from
    # class methods, so expose them at class level as well.
    def self.included(base)
      base.extend(self)
    end

    # Produce a logging prefix matching the original style in the model files,
    # e.g. "TopMetric" -> "TOP METRIC", "TopCollectionMetric" -> "TOP COLLECTION METRIC".
    #
    # @return [String] upper-cased, space-separated class name ("" when anonymous)
    def log_prefix
      # Works whether the receiver is the class or one of its instances.
      owner = is_a?(Module) ? self : self.class
      klass = owner.name.to_s.split('::').last.to_s
      klass.gsub(/([a-z\d])([A-Z])/, '\1 \2').tr('_', ' ').upcase
    end

    # Fetch distinct dates for RecordMetric where the given processed flag is false.
    #
    # @param processed_field [Symbol] name of the processed_by_* flag to filter on
    # @param date_range [Range] dates to scan
    # @return [Array] distinct dates that still have unprocessed rows
    def record_metrics_dates_between_for(processed_field, date_range)
      logger.info("#{log_prefix}: Fetching dates for #{processed_field}")
      dates = SupplejackApi::RecordMetric
              .where(date: date_range, processed_field => false)
              .distinct(:date)
      logger.info("#{log_prefix}: Processing dates: #{dates}")
      dates
    end

    # Mark all RecordMetric rows for a given date as processed using the given flag.
    #
    # @param processed_field [Symbol] name of the processed_by_* flag to set
    # @param date [Date] day whose rows are stamped
    def stamp_record_metrics_for(processed_field, date)
      logger.info("#{log_prefix}: Stamping all records on #{date} for #{processed_field}")
      SupplejackApi::RecordMetric
        .where(date:, processed_field => false)
        .update_all(processed_field => true)
      logger.info("#{log_prefix}: Stamped all records on: #{date} for #{processed_field}")
    end
  end
end

app/models/supplejack_api/record_metric.rb

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,18 @@ class RecordMetric
2424

2525
index({ record_id: 1, display_collection: 1, date: 1 }, background: true)
2626

27-
index({ display_collection: 1, date: 1, processed_by_collection_metrics: 1 }, background: true)
28-
index({ display_collection: 1, date: 1, processed_by_top_metrics: 1 }, background: true)
29-
index({ display_collection: 1, date: 1, processed_by_top_collection_metrics: 1 }, background: true)
27+
index({ date: 1, display_collection: 1, processed_by_collection_metrics: 1 }, background: true)
28+
index({ date: 1, display_collection: 1, processed_by_top_metrics: 1 }, background: true)
29+
index({ date: 1, display_collection: 1, processed_by_top_collection_metrics: 1 }, background: true)
3030

3131
index({ display_collection: 1, date: 1 }, background: true)
3232

3333
index({ date: 1 }, background: true)
3434

35+
index({ date: 1, processed_by_collection_metrics: 1 }, background: true)
36+
index({ date: 1, processed_by_top_metrics: 1 }, background: true)
37+
index({ date: 1, processed_by_top_collection_metrics: 1 }, background: true)
38+
3539
index({ processed_by_collection_metrics: 1 }, background: true)
3640
index({ processed_by_top_metrics: 1 }, background: true)
3741
index({ processed_by_top_collection_metrics: 1 }, background: true)
@@ -41,6 +45,7 @@ class RecordMetric
4145
processed_by_top_metrics: 1,
4246
processed_by_top_collection_metrics: 1
4347
},
48+
name: 'all_metrics',
4449
background: true
4550
)
4651

@@ -54,5 +59,23 @@ def self.spawn(record_id, metrics, display_collection, date = Time.now.utc.begin
5459
upsert: true
5560
)
5661
end
62+
63+
# Deletes record metrics that all three summarisers have already processed.
# Works in batches, pausing between them, to avoid memory issues and to limit
# load on the db.
#
# @param batch_size [Integer] maximum number of documents removed per batch
# @param sleep_time [Numeric] seconds to pause after each batch
def self.delete_all_processed_metrics(batch_size = 5_000, sleep_time = 0.05)
  fully_processed = SupplejackApi::RecordMetric.where(
    processed_by_collection_metrics: true,
    processed_by_top_metrics: true,
    processed_by_top_collection_metrics: true
  )

  # Re-query on every pass: each batch removes the ids it just fetched.
  until (batch_ids = fully_processed.only(:_id).limit(batch_size).pluck(:id)).empty?
    SupplejackApi::RecordMetric.where(:_id.in => batch_ids).delete_all
    sleep sleep_time
  end
end
5780
end
5881
end

app/models/supplejack_api/request_metric.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@ def self.summarize
4949

5050
metrics.each do |metric|
5151
metric.records.each do |record|
52-
summary[date][record['record_id']]['metrics'][metric.metric] += 1
53-
summary[date][record['record_id']]['display_collection'] = record['display_collection']
52+
record_id = record['record_id']
53+
entry = summary[date][record_id]
54+
entry['metrics'][metric.metric] += 1
55+
entry['display_collection'] = record['display_collection']
5456
end
5557
end
5658
end

app/models/supplejack_api/top_collection_metric.rb

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ class TopCollectionMetric
66
include Mongoid::Document
77
include Mongoid::Timestamps
88
include SupplejackApi::Concerns::QueryableByDate
9+
include SupplejackApi::Concerns::MetricHelpers
910

1011
METRICS = %i[
1112
page_views
@@ -33,10 +34,7 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni
3334

3435
metrics = []
3536

36-
dates = SupplejackApi::RecordMetric.where(date: date_range).map(&:date).uniq
37-
Rails.logger.info("TOP COLLECTION METRIC: processing dates: #{dates}")
38-
39-
dates.each do |date|
37+
record_metrics_dates_between(date_range).each do |date|
4038
display_collections(date).each do |dc|
4139
METRICS.each do |metric|
4240
record_metrics = record_metrics_to_be_processed(date, metric, dc)
@@ -52,18 +50,18 @@ def self.spawn(date_range = (Time.zone.at(0).utc..Time.now.yesterday.utc.beginni
5250
metrics.push(top_collection_metric)
5351
end
5452
end
55-
Rails.logger.info("TOP COLLECTION METRIC: Stampping all records on #{date}")
53+
5654
stamp_record_metrics(date)
5755
end
5856

5957
metrics
6058
end
6159

6260
# Lists the display collections that still have record metrics on the given
# date which top collection metrics have not processed.
#
# Uses a database-side `distinct` rather than loading every matching document
# and de-duplicating in Ruby, consistent with how CollectionMetric.spawn
# gathers its display collections.
#
# @param date [Date] day to inspect
# @return [Array] distinct display_collection values
def self.display_collections(date)
  logger.info("TOP COLLECTION METRIC: Finding all display collections on #{date}")
  SupplejackApi::RecordMetric
    .where(date:, processed_by_top_collection_metrics: false)
    .distinct(:display_collection)
end
6967

@@ -75,11 +73,13 @@ def self.calculate_results(record_metrics, metric)
7573
end
7674

7775
def self.update_top_collection_metric(top_collection_metric, results)
78-
if top_collection_metric.results.blank?
76+
existing_results = top_collection_metric.results
77+
78+
if existing_results.blank?
7979
top_collection_metric.update(results:)
8080
else
81-
merged_results = top_collection_metric.results.merge(results) { |_key, a, b| a + b }
82-
merged_results = merged_results.sort_by { |_k, v| -v }.first(200).to_h
81+
merged_results = existing_results.merge(results) { |_key, existing, incoming| existing + incoming }
82+
merged_results = merged_results.sort_by { |_k, value| -value }.first(200).to_h
8383

8484
top_collection_metric.update(results: merged_results)
8585
end
@@ -96,18 +96,22 @@ def self.find_or_create_top_collection_metric(date, metric, display_collection)
9696
end
9797

9898
# Builds the query for the 200 highest-scoring unprocessed record metrics of
# one display collection on one date, ranked by the given metric.
#
# @param date [Date] day to select
# @param metric [Symbol] RecordMetric counter to rank by; rows where it is 0 are excluded
# @param display_collection [String] collection to select
# @return [Mongoid::Criteria] at most 200 rows, highest metric value first
def self.record_metrics_to_be_processed(date, metric, display_collection)
  logger.info(
    "TOP COLLECTION METRIC: Gathering top 200 records to be processed #{date}, #{metric}, #{display_collection}"
  )

  unprocessed = SupplejackApi::RecordMetric.where(
    date:,
    metric.ne => 0,
    display_collection:,
    processed_by_top_collection_metrics: false
  )
  unprocessed.order_by(metric => 'desc').limit(200)
end
108108

109+
# Distinct RecordMetric dates in the range that still have rows not yet
# processed by top collection metrics.
#
# @param date_range [Range] dates to scan
def self.record_metrics_dates_between(date_range)
  processed_flag = :processed_by_top_collection_metrics
  record_metrics_dates_between_for(processed_flag, date_range)
end
112+
109113
# Flags the still-unflagged record metrics of the given date as processed by
# top collection metrics.
#
# @param date [Date] day whose rows are stamped
def self.stamp_record_metrics(date)
  processed_flag = :processed_by_top_collection_metrics
  stamp_record_metrics_for(processed_flag, date)
end
112116
end
113117
end

0 commit comments

Comments
 (0)