Skip to content

Commit 679111c

Browse files
committed
Fix: Correct the order of async request results
1 parent bed41aa commit 679111c

File tree

3 files changed

+2509
-2516
lines changed

3 files changed

+2509
-2516
lines changed

backend/src/api/clawer.py

Lines changed: 18 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -7,47 +7,45 @@
77
# Enable progress bar for DataFrame operations
88
tqdm.pandas()
99

10-
async def extract_course_details(course_url: str) -> dict:
10+
async def extract_course_details(course_url: str, progress_bar: tqdm) -> dict:
1111
"""
1212
Extract course syllabus and objectives from the given URL asynchronously.
1313
1414
Args:
1515
course_url (str): The URL of the webpage.
16+
progress_bar (tqdm): Shared progress bar instance.
1617
1718
Returns:
1819
dict: A dictionary containing the syllabus and objectives.
1920
"""
2021
async with aiohttp.ClientSession() as session:
2122
try:
2223
async with session.get(course_url) as response:
23-
response.encoding = 'utf-8' # Ensure correct encoding
24+
response.encoding = 'utf-8'
2425
if response.status != 200:
2526
return {"syllabus": "", "objectives": ""}
2627
html = await response.text()
2728
soup = BeautifulSoup(html, 'html.parser')
2829

2930
# Find Course Syllabus
3031
syllabus_tag = soup.find('p', string='課程大綱 Course syllabus')
31-
if syllabus_tag:
32-
syllabus = syllabus_tag.find_next('td', colspan="12").get_text(strip=True)
33-
else:
34-
syllabus = ""
32+
syllabus = syllabus_tag.find_next('td', colspan="12").get_text(strip=True) if syllabus_tag else ""
3533

3634
# Find Course Objectives
3735
objectives_tag = soup.find('p', string='課程目標 Objectives')
38-
if objectives_tag:
39-
objectives = objectives_tag.find_next('td', colspan="12").get_text(strip=True)
40-
else:
41-
objectives = ""
36+
objectives = objectives_tag.find_next('td', colspan="12").get_text(strip=True) if objectives_tag else ""
4237

4338
return {"syllabus": syllabus, "objectives": objectives}
4439
except aiohttp.ClientError:
4540
return {"syllabus": "", "objectives": ""}
46-
41+
finally:
42+
# Update the progress bar
43+
progress_bar.update(1)
4744

4845
async def extend_course_dataframe(courses_df: pd.DataFrame, url_column: str) -> pd.DataFrame:
4946
"""
50-
Extend the DataFrame by extracting syllabus and objectives for each URL asynchronously.
47+
Extend the DataFrame by extracting syllabus and objectives for each URL asynchronously,
48+
with progress updates.
5149
5250
Args:
5351
courses_df (pd.DataFrame): DataFrame containing a column of URLs.
@@ -56,16 +54,13 @@ async def extend_course_dataframe(courses_df: pd.DataFrame, url_column: str) ->
5654
Returns:
5755
pd.DataFrame: Updated DataFrame with syllabus and objectives columns.
5856
"""
59-
tasks = [extract_course_details(url) for url in courses_df[url_column]]
60-
extracted_data = []
61-
with tqdm(total=len(tasks), desc="Fetching course details") as progress_bar:
62-
for future in asyncio.as_completed(tasks):
63-
result = await future
64-
extracted_data.append(result)
65-
progress_bar.update(1)
57+
total_tasks = len(courses_df[url_column])
58+
with tqdm(total=total_tasks, desc="Fetching course details") as progress_bar:
59+
tasks = [extract_course_details(url, progress_bar) for url in courses_df[url_column]]
60+
results = await asyncio.gather(*tasks)
6661

67-
extracted_df = pd.DataFrame(extracted_data)
68-
extended_df = pd.concat([courses_df, extracted_df], axis=1)
62+
extracted_df = pd.DataFrame(results)
63+
extended_df = pd.concat([courses_df.reset_index(drop=True), extracted_df], axis=1)
6964
return extended_df
7065

7166

@@ -77,12 +72,10 @@ async def extend_course_dataframe(courses_df: pd.DataFrame, url_column: str) ->
7772
'https://selcrs.nsysu.edu.tw/menu5/showoutline.asp?SYEAR=113&SEM=1&CrsDat=GEAI1369&Crsname=基礎訊號處理'
7873
]}
7974

80-
# Create a DataFrame with URLs
8175
test_df = pd.DataFrame(data)
8276

83-
# Extend the DataFrame with syllabus and objectives
8477
async def main():
85-
extended_courses_df = await extend_course_dataframe(test_df, 'url')
86-
print(extended_courses_df)
78+
extended_courses_df = await extend_course_dataframe(test_df, 'url')
79+
print(extended_courses_df)
8780

8881
asyncio.run(main())

0 commit comments

Comments
 (0)