@@ -147,25 +147,11 @@ def safe(n: Optional[LexborNode]):
147147 strip = True
148148 )
149149
150- # Debug: Print HTML for Delta flights
151- if name == "Delta" :
152- print (f"\n === DEBUG: Delta flight HTML ===" )
153- print (f"Flight item HTML: { item .html } " )
154- print (f"Airline name: { name } " )
155- print (f"Full HTML context: { r .text [:2000 ]} ..." ) # First 2000 chars
156- print ("=== END DEBUG ===\n " )
157-
158- # Debug: Print HTML for Frontier flights
159- if name == "Frontier" :
160- print (f"\n === DEBUG: Frontier flight HTML ===" )
161- print (f"Flight item HTML: { item .html } " )
162- print (f"Airline name: { name } " )
163- print ("=== END DEBUG ===\n " )
164-
165150 # Attempt to extract flight number from data-travelimpactmodelwebsiteurl attribute
166151 flight_number = None
167152 departure_airport = None
168153 arrival_airport = None
154+ connecting_airports = []
169155
170156 url_elem = item .css_first ('[data-travelimpactmodelwebsiteurl]' )
171157 if url_elem :
@@ -176,24 +162,56 @@ def safe(n: Optional[LexborNode]):
176162 airline_code = match .group (1 )
177163 flight_number = match .group (2 )
178164
179- # Extract airport codes from the URL
180- # Pattern: itinerary=JFK-LAX-F9-2503-20250801
181- airport_match = re .search (r'itinerary=([A-Z]{3})-([A-Z]{3})-' , url )
182- if airport_match :
183- departure_airport = airport_match .group (1 )
184- arrival_airport = airport_match .group (2 )
185-
165+ # Extract full route from the URL
166+ # Pattern: itinerary=JFK-LAX-F9-2503-20250801 (direct)
167+ # Pattern: itinerary=JFK-MCO-F9-4871-20250801,MCO-LAX-F9-4145-20250801 (with connection)
168+ route_match = re .search (r'itinerary=([A-Z0-9,-]+)-[A-Z0-9]+-\d+-\d{8}' , url )
169+ if route_match :
170+ itinerary = route_match .group (1 )
171+ # Split on commas to handle connecting flights
172+ segments = itinerary .split (',' )
173+
174+ if len (segments ) == 1 :
175+ # Direct flight
176+ route_parts = segments [0 ].split ('-' )
177+ if len (route_parts ) >= 2 :
178+ departure_airport = route_parts [0 ]
179+ arrival_airport = route_parts [- 1 ]
180+ connecting_airports = None
181+ else :
182+ # Connecting flight
183+ first_segment = segments [0 ].split ('-' )
184+ last_segment = segments [- 1 ].split ('-' )
185+
186+ if len (first_segment ) >= 2 and len (last_segment ) >= 2 :
187+ departure_airport = first_segment [0 ]
188+ arrival_airport = last_segment [- 1 ]
189+
190+ # Extract connecting airports
191+ connecting_airports = []
192+ if len (segments ) == 2 :
193+ # 2-segment flight: extract connecting airport from second segment
194+ connecting_airports .append (last_segment [0 ]) # First airport of second segment
195+ else :
196+ # Multi-segment flight: extract from intermediate segments
197+ for segment in segments [1 :- 1 ]: # Skip first and last segments
198+ segment_parts = segment .split ('-' )
199+ if len (segment_parts ) >= 2 :
200+ connecting_airports .append (segment_parts [0 ]) # First airport in each intermediate segment
201+
202+ connecting_airports = connecting_airports if connecting_airports else None
203+ # Do not overwrite arrival_airport or connecting_airports with HTML-derived values
204+
186205 # If not found in URL, try to extract from HTML elements
187206 if not departure_airport or not arrival_airport :
188- # Look for airport codes in the route information
189- route_elem = item .css_first ('.PTuQse' )
190- if route_elem :
191- route_text = route_elem .text (strip = True )
192- # Pattern: JFK – LAX
193- airport_match = re .search (r'([A-Z]{3})\s*–\s*([A-Z]{3})' , route_text )
194- if airport_match :
195- departure_airport = airport_match .group (1 )
196- arrival_airport = airport_match .group (2 )
207+ # Look for airport codes in the HTML elements
208+ departure_elem = item .css_first ('div.G2WY5c div' )
209+ arrival_elem = item .css_first ('div.c8rWCd div' )
210+
211+ if departure_elem and not departure_airport :
212+ departure_airport = departure_elem .text (strip = True )
213+ if arrival_elem and not arrival_airport :
214+ arrival_airport = arrival_elem .text (strip = True )
197215
198216 # If still not found, try looking for any span with a pattern like "AA1234", "DL567", etc.
199217 if not flight_number :
@@ -249,6 +267,7 @@ def safe(n: Optional[LexborNode]):
249267 "flight_number" : flight_number ,
250268 "departure_airport" : departure_airport ,
251269 "arrival_airport" : arrival_airport ,
270+ "connecting_airports" : connecting_airports if connecting_airports else None ,
252271 }
253272 )
254273
0 commit comments