Skip to content

Commit 4b5d52e

Browse files
Merge pull request #101 from pitangainnovare/impl/optimize-db-usage-3
Impl/optimize db usage 3
2 parents 7acfcaa + 9ab5834 commit 4b5d52e

File tree

2 files changed

+48
-97
lines changed

2 files changed

+48
-97
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
1.11.1
1+
1.12.0

metrics/tasks.py

Lines changed: 47 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -213,39 +213,6 @@ def _setup_parsing_environment(log_file, robots_list, mmdb):
213213
return lp, utm
214214

215215

216-
def _load_metrics_objs_cache(log_file):
217-
"""
218-
Loads the necessary objects into a cache for efficient access during log processing.
219-
220-
Args:
221-
log_file (LogFile): The log file being processed.
222-
223-
Returns:
224-
dict: A cache containing items and user agents.
225-
"""
226-
logging.info(f'Loading metrics objects cache for {log_file.collection}')
227-
cache = {
228-
'items': {},
229-
'user_agents': {},
230-
'user_sessions': {},
231-
'item_accesses': {},
232-
}
233-
234-
items_qs = Item.objects.filter(collection=log_file.collection).select_related('journal', 'article', 'collection')
235-
for it in items_qs:
236-
key = (it.collection.acron3, it.journal_id, it.article_id)
237-
cache['items'][key] = it
238-
logging.info(f'Loaded {len(cache["items"])} items for {log_file.collection}')
239-
240-
user_agents_qs = UserAgent.objects.all()
241-
for ua in user_agents_qs:
242-
key = (ua.name, ua.version)
243-
cache['user_agents'][key] = ua
244-
logging.info(f'Loaded {len(cache["user_agents"])} user agents')
245-
246-
return cache
247-
248-
249216
def _fetch_art_jou_ids(utm, item_access_data):
250217
"""
251218
Fetches the journal and article IDs based on the item access data.
@@ -285,26 +252,22 @@ def _process_lines(lp, utm, log_file):
285252
Returns:
286253
None.
287254
"""
288-
logging.info(f'Loading metadata cache for {log_file.collection}')
289-
cache = _load_metrics_objs_cache(log_file)
290-
291255
logging.info(f'Processing {lp.logfile}')
292256
for line in lp.parse():
293-
if not _process_line(line, utm, log_file, cache):
257+
if not _process_line(line, utm, log_file):
294258
continue
295259

296260
return True
297261

298262

299-
def _process_line(line, utm, log_file, cache):
263+
def _process_line(line, utm, log_file):
300264
"""
301265
Processes a single line from the log file, translating the URL and registering item access if valid.
302266
303267
Args:
304268
line (dict): A dictionary representing a single log line.
305269
utm (URLTranslationManager): The URL translation manager instance.
306270
log_file (LogFile): The log file being processed.
307-
cache (dict): A cache containing pre-fetched objects to avoid redundant database queries.
308271
309272
Returns:
310273
bool: True if the line was processed successfully, False otherwise.
@@ -353,18 +316,26 @@ def _process_line(line, utm, log_file, cache):
353316
return False
354317

355318
try:
356-
_register_item_access(item_access_data, line, jou_id, art_id, cache)
319+
_register_item_access(item_access_data, line, jou_id, art_id)
357320
except Exception as e:
358321
_log_discarded_line(log_file, line, tracker_choices.LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR, str(e))
359322
return False
360323

361324
return True
362325

363326

364-
def _register_item_access(item_access_data, line, jou_id, art_id, cache):
327+
def _register_item_access(item_access_data, line, jou_id, art_id):
365328
"""
366329
Registers an item access in the database, creating necessary objects if they do not exist.
367-
Handles potential deadlocks by retrying on database errors.
330+
331+
Args:
332+
item_access_data (dict): A dictionary containing item access data, including collection, ISSN, PIDs, media format, language, and content type.
333+
line (dict): The log line being processed.
334+
jou_id (int): The journal ID.
335+
art_id (int): The article ID.
336+
337+
Returns:
338+
None.
368339
"""
369340
col_acron3 = item_access_data.get('collection')
370341
media_format = item_access_data.get('media_format')
@@ -382,31 +353,26 @@ def _register_item_access(item_access_data, line, jou_id, art_id, cache):
382353
truncated_datetime = timezone.make_aware(truncated_datetime)
383354
ms_key = extract_minute_second_key(local_datetime)
384355

385-
it = _get_or_create_item(col_acron3, jou_id, art_id, cache)
386-
ua = _get_or_create_user_agent(client_name, client_version, cache)
387-
us = _get_or_create_user_session(truncated_datetime, ua, ip_address, cache)
388-
ita = _get_or_create_item_access(it, us, media_format, media_language, country_code, content_type, ms_key, cache)
356+
it = _get_or_create_item(col_acron3, jou_id, art_id)
357+
ua = _get_or_create_user_agent(client_name, client_version)
358+
us = _get_or_create_user_session(truncated_datetime, ua, ip_address)
359+
ita = _get_or_create_item_access(it, us, media_format, media_language, country_code, content_type, ms_key)
389360

390361
ita.click_timestamps[ms_key] = ita.click_timestamps.get(ms_key, 0) + 1
391362
ita.save()
392363

393364

394-
def _get_or_create_item(col_acron3, jou_id, art_id, cache, max_retries=3):
395-
item_key = (col_acron3, jou_id, art_id)
365+
def _get_or_create_item(col_acron3, jou_id, art_id, max_retries=3):
396366
for attempt in range(max_retries):
397367
try:
398-
if item_key not in cache['items']:
399-
collection_obj = Collection.objects.get(acron3=col_acron3)
400-
journal_obj = Journal.objects.get(id=jou_id)
401-
article_obj = Article.objects.get(id=art_id)
402-
it, _ = Item.objects.get_or_create(
403-
collection=collection_obj,
404-
journal=journal_obj,
405-
article=article_obj,
406-
)
407-
cache['items'][item_key] = it
408-
else:
409-
it = cache['items'][item_key]
368+
collection_obj = Collection.objects.get(acron3=col_acron3)
369+
journal_obj = Journal.objects.get(id=jou_id)
370+
article_obj = Article.objects.get(id=art_id)
371+
it, _ = Item.objects.get_or_create(
372+
collection=collection_obj,
373+
journal=journal_obj,
374+
article=article_obj,
375+
)
410376
return it
411377
except Exception as e:
412378
if attempt == max_retries - 1:
@@ -415,18 +381,13 @@ def _get_or_create_item(col_acron3, jou_id, art_id, cache, max_retries=3):
415381
return None
416382

417383

418-
def _get_or_create_user_agent(client_name, client_version, cache, max_retries=3):
419-
user_agent_key = (client_name, client_version)
384+
def _get_or_create_user_agent(client_name, client_version, max_retries=3):
420385
for attempt in range(max_retries):
421386
try:
422-
if user_agent_key not in cache['user_agents']:
423-
ua, _ = UserAgent.objects.get_or_create(
424-
name=client_name,
425-
version=client_version
426-
)
427-
cache['user_agents'][user_agent_key] = ua
428-
else:
429-
ua = cache['user_agents'][user_agent_key]
387+
ua, _ = UserAgent.objects.get_or_create(
388+
name=client_name,
389+
version=client_version
390+
)
430391
return ua
431392
except Exception as e:
432393
if attempt == max_retries - 1:
@@ -435,19 +396,14 @@ def _get_or_create_user_agent(client_name, client_version, cache, max_retries=3)
435396
return None
436397

437398

438-
def _get_or_create_user_session(truncated_datetime, ua, ip_address, cache, max_retries=3):
439-
us_key = (truncated_datetime, ua.id, ip_address)
399+
def _get_or_create_user_session(truncated_datetime, ua, ip_address, max_retries=3):
440400
for attempt in range(max_retries):
441401
try:
442-
if us_key not in cache['user_sessions']:
443-
us, _ = UserSession.objects.get_or_create(
444-
datetime=truncated_datetime,
445-
user_agent=ua,
446-
user_ip=ip_address
447-
)
448-
cache['user_sessions'][us_key] = us
449-
else:
450-
us = cache['user_sessions'][us_key]
402+
us, _ = UserSession.objects.get_or_create(
403+
datetime=truncated_datetime,
404+
user_agent=ua,
405+
user_ip=ip_address
406+
)
451407
return us
452408
except Exception as e:
453409
if attempt == max_retries - 1:
@@ -456,23 +412,18 @@ def _get_or_create_user_session(truncated_datetime, ua, ip_address, cache, max_r
456412
return None
457413

458414

459-
def _get_or_create_item_access(it, us, media_format, media_language, country_code, content_type, ms_key, cache, max_retries=3):
460-
item_access_key = (it.id, us.id, media_format, media_language, country_code, content_type)
415+
def _get_or_create_item_access(it, us, media_format, media_language, country_code, content_type, ms_key, max_retries=3):
461416
for attempt in range(max_retries):
462417
try:
463-
if item_access_key not in cache['item_accesses']:
464-
ita, _ = ItemAccess.objects.get_or_create(
465-
item=it,
466-
user_session=us,
467-
media_format=media_format,
468-
media_language=media_language,
469-
country_code=country_code,
470-
content_type=content_type,
471-
defaults={'click_timestamps': {ms_key: 1}}
472-
)
473-
cache['item_accesses'][item_access_key] = ita
474-
else:
475-
ita = cache['item_accesses'][item_access_key]
418+
ita, _ = ItemAccess.objects.get_or_create(
419+
item=it,
420+
user_session=us,
421+
media_format=media_format,
422+
media_language=media_language,
423+
country_code=country_code,
424+
content_type=content_type,
425+
defaults={'click_timestamps': {ms_key: 0}}
426+
)
476427
return ita
477428
except Exception as e:
478429
if attempt == max_retries - 1:

0 commit comments

Comments (0)