@@ -28,7 +28,7 @@ def extract_geometry_from_html(content):
2828 geom = json .loads (data )
2929
3030 geom_data = geom ["features" ][0 ]["geometry" ]
31- # preparing geometry data in accordance to geosAPI fields
31+ # preparing geometry data in accordance to geos API fields
3232 type_geom = {'type' : 'GeometryCollection' }
3333 geom_content = {"geometries" : [geom_data ]}
3434 type_geom .update (geom_content )
@@ -37,10 +37,10 @@ def extract_geometry_from_html(content):
3737 geom_object = GEOSGeometry (geom_data_string ) # GeometryCollection object
3838 logging .debug ('Found geometry: %s' , geom_object )
3939 return geom_object
40- except :
41- print ( "Invalid Geometry" )
40+ except Exception as e :
41+ logger . error ( "Cannot create geometry from string '%s': %s" , geom_data_string , e )
4242 except ValueError as e :
43- print ( "Not a valid GeoJSON" )
43+ logger . error ( "Error loading JSON from %s: %s" , tag . get ( "name" ), e )
4444
4545def extract_timeperiod_from_html (content ):
4646 period = [None , None ]
@@ -61,13 +61,20 @@ def parse_oai_xml_and_save_publications(content):
6161 for i in range (articles_count_in_journal ):
6262 identifier = collection .getElementsByTagName ("dc:identifier" )
6363 identifier_value = identifier [i ].firstChild .nodeValue
64+ logger .debug ("Retrieving %s" , identifier_value )
65+
6466 if identifier_value .startswith ('http' ):
6567
66- with requests .get (identifier_value ) as response :
67- soup = BeautifulSoup (response .content , 'html.parser' )
68+ try :
69+ with requests .get (identifier_value ) as response :
70+ soup = BeautifulSoup (response .content , 'html.parser' )
6871
69- geom_object = extract_geometry_from_html (soup )
70- period_start , period_end = extract_timeperiod_from_html (soup )
72+ geom_object = extract_geometry_from_html (soup )
73+ period_start , period_end = extract_timeperiod_from_html (soup )
74+ except Exception as e :
75+ logger .error ("Error retrieving and extracting geometadata from URL %s: %s" , identifier_value , e )
76+ logger .error ("Continueing with the next article..." )
77+ continue
7178
7279 else :
7380 geom_object = None
@@ -100,20 +107,20 @@ def parse_oai_xml_and_save_publications(content):
100107 abstract = abstract_text ,
101108 publicationDate = date_value ,
102109 url = identifier_value ,
103- journal = journal_value ,
110+ source = journal_value ,
104111 geometry = geom_object ,
105112 timeperiod_startdate = period_start ,
106113 timeperiod_enddate = period_end )
107114 publication .save ()
108- logger .info ('Saved new publication for %s: %s' , identifier_value , publication )
115+ logger .info ('Saved new publication for %s: %s' , identifier_value , publication . get_absolute_url () )
109116
def harvest_oai_endpoint(url):
    """Fetch an OAI endpoint and store the publications found in the response.

    The response body is passed to ``parse_oai_xml_and_save_publications``.

    Args:
        url: Address of the endpoint to request.

    Returns:
        None. Request failures are logged rather than raised; any exception
        that is not a ``requests.exceptions.RequestException`` propagates to
        the caller.
    """
    try:
        with requests.Session() as session:
            response = session.get(url)
            parse_oai_xml_and_save_publications(response.content)
    except requests.exceptions.RequestException as e:
        # Log the exception itself so the actual cause of the failure is not
        # lost; same "<context> %s: %s" layout as the other handlers here.
        logger.error(
            "The requested URL is invalid or has bad connection. Please check the URL %s: %s",
            url,
            e,
        )
117124
118125def send_monthly_email (trigger_source = 'manual' , sent_by = None ):
119126 recipients = User .objects .filter (userprofile__notify_new_manuscripts = True ).values_list ('email' , flat = True )
0 commit comments