@@ -213,39 +213,6 @@ def _setup_parsing_environment(log_file, robots_list, mmdb):
213213 return lp , utm
214214
215215
216- def _load_metrics_objs_cache (log_file ):
217- """
218- Loads the necessary objects into a cache for efficient access during log processing.
219-
220- Args:
221- log_file (LogFile): The log file being processed.
222-
223- Returns:
224- dict: A cache containing items and user agents.
225- """
226- logging .info (f'Loading metrics objects cache for { log_file .collection } ' )
227- cache = {
228- 'items' : {},
229- 'user_agents' : {},
230- 'user_sessions' : {},
231- 'item_accesses' : {},
232- }
233-
234- items_qs = Item .objects .filter (collection = log_file .collection ).select_related ('journal' , 'article' , 'collection' )
235- for it in items_qs :
236- key = (it .collection .acron3 , it .journal_id , it .article_id )
237- cache ['items' ][key ] = it
238- logging .info (f'Loaded { len (cache ["items" ])} items for { log_file .collection } ' )
239-
240- user_agents_qs = UserAgent .objects .all ()
241- for ua in user_agents_qs :
242- key = (ua .name , ua .version )
243- cache ['user_agents' ][key ] = ua
244- logging .info (f'Loaded { len (cache ["user_agents" ])} user agents' )
245-
246- return cache
247-
248-
249216def _fetch_art_jou_ids (utm , item_access_data ):
250217 """
251218 Fetches the journal and article IDs based on the item access data.
@@ -285,26 +252,22 @@ def _process_lines(lp, utm, log_file):
285252 Returns:
286253 None.
287254 """
288- logging .info (f'Loading metadata cache for { log_file .collection } ' )
289- cache = _load_metrics_objs_cache (log_file )
290-
291255 logging .info (f'Processing { lp .logfile } ' )
292256 for line in lp .parse ():
293- if not _process_line (line , utm , log_file , cache ):
257+ if not _process_line (line , utm , log_file ):
294258 continue
295259
296260 return True
297261
298262
299- def _process_line (line , utm , log_file , cache ):
263+ def _process_line (line , utm , log_file ):
300264 """
301265 Processes a single line from the log file, translating the URL and registering item access if valid.
302266
303267 Args:
304268 line (dict): A dictionary representing a single log line.
305269 utm (URLTranslationManager): The URL translation manager instance.
306270 log_file (LogFile): The log file being processed.
307- cache (dict): A cache containing pre-fetched objects to avoid redundant database queries.
308271
309272 Returns:
310273 bool: True if the line was processed successfully, False otherwise.
@@ -353,18 +316,26 @@ def _process_line(line, utm, log_file, cache):
353316 return False
354317
355318 try :
356- _register_item_access (item_access_data , line , jou_id , art_id , cache )
319+ _register_item_access (item_access_data , line , jou_id , art_id )
357320 except Exception as e :
358321 _log_discarded_line (log_file , line , tracker_choices .LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR , str (e ))
359322 return False
360323
361324 return True
362325
363326
364- def _register_item_access (item_access_data , line , jou_id , art_id , cache ):
327+ def _register_item_access (item_access_data , line , jou_id , art_id ):
365328 """
366329 Registers an item access in the database, creating necessary objects if they do not exist.
367- Handles potential deadlocks by retrying on database errors.
330+
331+ Args:
332+ item_access_data (dict): A dictionary containing item access data, including collection, ISSN, PIDs, media format, language, and content type.
333+ line (dict): The log line being processed.
334+ jou_id (int): The journal ID.
335+ art_id (int): The article ID.
336+
337+ Returns:
338+ None.
368339 """
369340 col_acron3 = item_access_data .get ('collection' )
370341 media_format = item_access_data .get ('media_format' )
@@ -382,31 +353,26 @@ def _register_item_access(item_access_data, line, jou_id, art_id, cache):
382353 truncated_datetime = timezone .make_aware (truncated_datetime )
383354 ms_key = extract_minute_second_key (local_datetime )
384355
385- it = _get_or_create_item (col_acron3 , jou_id , art_id , cache )
386- ua = _get_or_create_user_agent (client_name , client_version , cache )
387- us = _get_or_create_user_session (truncated_datetime , ua , ip_address , cache )
388- ita = _get_or_create_item_access (it , us , media_format , media_language , country_code , content_type , ms_key , cache )
356+ it = _get_or_create_item (col_acron3 , jou_id , art_id )
357+ ua = _get_or_create_user_agent (client_name , client_version )
358+ us = _get_or_create_user_session (truncated_datetime , ua , ip_address )
359+ ita = _get_or_create_item_access (it , us , media_format , media_language , country_code , content_type , ms_key )
389360
390361 ita .click_timestamps [ms_key ] = ita .click_timestamps .get (ms_key , 0 ) + 1
391362 ita .save ()
392363
393364
394- def _get_or_create_item (col_acron3 , jou_id , art_id , cache , max_retries = 3 ):
395- item_key = (col_acron3 , jou_id , art_id )
365+ def _get_or_create_item (col_acron3 , jou_id , art_id , max_retries = 3 ):
396366 for attempt in range (max_retries ):
397367 try :
398- if item_key not in cache ['items' ]:
399- collection_obj = Collection .objects .get (acron3 = col_acron3 )
400- journal_obj = Journal .objects .get (id = jou_id )
401- article_obj = Article .objects .get (id = art_id )
402- it , _ = Item .objects .get_or_create (
403- collection = collection_obj ,
404- journal = journal_obj ,
405- article = article_obj ,
406- )
407- cache ['items' ][item_key ] = it
408- else :
409- it = cache ['items' ][item_key ]
368+ collection_obj = Collection .objects .get (acron3 = col_acron3 )
369+ journal_obj = Journal .objects .get (id = jou_id )
370+ article_obj = Article .objects .get (id = art_id )
371+ it , _ = Item .objects .get_or_create (
372+ collection = collection_obj ,
373+ journal = journal_obj ,
374+ article = article_obj ,
375+ )
410376 return it
411377 except Exception as e :
412378 if attempt == max_retries - 1 :
@@ -415,18 +381,13 @@ def _get_or_create_item(col_acron3, jou_id, art_id, cache, max_retries=3):
415381 return None
416382
417383
418- def _get_or_create_user_agent (client_name , client_version , cache , max_retries = 3 ):
419- user_agent_key = (client_name , client_version )
384+ def _get_or_create_user_agent (client_name , client_version , max_retries = 3 ):
420385 for attempt in range (max_retries ):
421386 try :
422- if user_agent_key not in cache ['user_agents' ]:
423- ua , _ = UserAgent .objects .get_or_create (
424- name = client_name ,
425- version = client_version
426- )
427- cache ['user_agents' ][user_agent_key ] = ua
428- else :
429- ua = cache ['user_agents' ][user_agent_key ]
387+ ua , _ = UserAgent .objects .get_or_create (
388+ name = client_name ,
389+ version = client_version
390+ )
430391 return ua
431392 except Exception as e :
432393 if attempt == max_retries - 1 :
@@ -435,19 +396,14 @@ def _get_or_create_user_agent(client_name, client_version, cache, max_retries=3)
435396 return None
436397
437398
438- def _get_or_create_user_session (truncated_datetime , ua , ip_address , cache , max_retries = 3 ):
439- us_key = (truncated_datetime , ua .id , ip_address )
399+ def _get_or_create_user_session (truncated_datetime , ua , ip_address , max_retries = 3 ):
440400 for attempt in range (max_retries ):
441401 try :
442- if us_key not in cache ['user_sessions' ]:
443- us , _ = UserSession .objects .get_or_create (
444- datetime = truncated_datetime ,
445- user_agent = ua ,
446- user_ip = ip_address
447- )
448- cache ['user_sessions' ][us_key ] = us
449- else :
450- us = cache ['user_sessions' ][us_key ]
402+ us , _ = UserSession .objects .get_or_create (
403+ datetime = truncated_datetime ,
404+ user_agent = ua ,
405+ user_ip = ip_address
406+ )
451407 return us
452408 except Exception as e :
453409 if attempt == max_retries - 1 :
@@ -456,23 +412,18 @@ def _get_or_create_user_session(truncated_datetime, ua, ip_address, cache, max_r
456412 return None
457413
458414
459- def _get_or_create_item_access (it , us , media_format , media_language , country_code , content_type , ms_key , cache , max_retries = 3 ):
460- item_access_key = (it .id , us .id , media_format , media_language , country_code , content_type )
415+ def _get_or_create_item_access (it , us , media_format , media_language , country_code , content_type , ms_key , max_retries = 3 ):
461416 for attempt in range (max_retries ):
462417 try :
463- if item_access_key not in cache ['item_accesses' ]:
464- ita , _ = ItemAccess .objects .get_or_create (
465- item = it ,
466- user_session = us ,
467- media_format = media_format ,
468- media_language = media_language ,
469- country_code = country_code ,
470- content_type = content_type ,
471- defaults = {'click_timestamps' : {ms_key : 1 }}
472- )
473- cache ['item_accesses' ][item_access_key ] = ita
474- else :
475- ita = cache ['item_accesses' ][item_access_key ]
418+ ita , _ = ItemAccess .objects .get_or_create (
419+ item = it ,
420+ user_session = us ,
421+ media_format = media_format ,
422+ media_language = media_language ,
423+ country_code = country_code ,
424+ content_type = content_type ,
425+ defaults = {'click_timestamps' : {ms_key : 0 }}
426+ )
476427 return ita
477428 except Exception as e :
478429 if attempt == max_retries - 1 :
0 commit comments