# Enable progress bars for DataFrame operations: tqdm.pandas() registers the
# progress_apply / progress_map variants on pandas objects.
tqdm.pandas()
99
def _section_text(soup: BeautifulSoup, heading: str) -> str:
    """
    Return the text of the first ``<td colspan="12">`` following the ``<p>``
    whose string is exactly ``heading``.

    An empty string is returned when either the heading paragraph or the
    following cell is missing, so an unexpected page layout never raises.
    """
    heading_tag = soup.find('p', string=heading)
    if heading_tag is None:
        return ""
    cell = heading_tag.find_next('td', colspan="12")
    return cell.get_text(strip=True) if cell is not None else ""


async def extract_course_details(course_url: str, progress_bar: tqdm) -> dict:
    """
    Extract course syllabus and objectives from the given URL asynchronously.

    The function is best-effort: a non-200 response, an aiohttp client error,
    a request timeout, or a page that lacks the expected sections all yield
    empty strings instead of raising.

    Args:
        course_url (str): The URL of the webpage.
        progress_bar (tqdm): Shared progress bar instance. It is advanced by
            exactly one step per call, whether the fetch succeeds or fails.

    Returns:
        dict: A dictionary with the keys "syllabus" and "objectives".
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(course_url) as response:
                if response.status != 200:
                    return {"syllabus": "", "objectives": ""}
                # Assigning ``response.encoding`` has no effect on how aiohttp
                # decodes the body; text() picks the charset itself (an
                # explicit one can be passed as ``text(encoding=...)``).
                html = await response.text()

        soup = BeautifulSoup(html, 'html.parser')
        return {
            "syllabus": _section_text(soup, '課程大綱 Course syllabus'),
            "objectives": _section_text(soup, '課程目標 Objectives'),
        }
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # A timeout is not an aiohttp.ClientError; handle it the same way so
        # one slow page cannot abort the caller's whole batch.
        return {"syllabus": "", "objectives": ""}
    finally:
        # Runs on every exit path, keeping the shared bar in step with the
        # number of finished URLs.
        progress_bar.update(1)
4744
async def extend_course_dataframe(courses_df: pd.DataFrame, url_column: str) -> pd.DataFrame:
    """
    Extend the DataFrame by extracting syllabus and objectives for each URL asynchronously,
    with progress updates.

    Args:
        courses_df (pd.DataFrame): DataFrame containing a column of URLs.
        url_column (str): Name of the column in ``courses_df`` that holds the URLs.

    Returns:
        pd.DataFrame: Updated DataFrame with syllabus and objectives columns.
            The index of the input is replaced by a fresh 0..n-1 range so the
            rows line up positionally with the extracted data. Both new
            columns are present even when ``courses_df`` is empty.
    """
    urls = courses_df[url_column]
    with tqdm(total=len(urls), desc="Fetching course details") as progress_bar:
        # gather() returns results in the order the awaitables were passed,
        # so results[i] belongs to the i-th URL regardless of finish order.
        results = await asyncio.gather(
            *(extract_course_details(url, progress_bar) for url in urls)
        )

    # Pin the column names: with no rows, pd.DataFrame([]) would otherwise
    # have no columns and the output would lack the documented ones.
    extracted_df = pd.DataFrame(results, columns=["syllabus", "objectives"])
    extended_df = pd.concat([courses_df.reset_index(drop=True), extracted_df], axis=1)
    return extended_df
7065
7166
@@ -77,12 +72,10 @@ async def extend_course_dataframe(courses_df: pd.DataFrame, url_column: str) ->
7772 'https://selcrs.nsysu.edu.tw/menu5/showoutline.asp?SYEAR=113&SEM=1&CrsDat=GEAI1369&Crsname=基礎訊號處理'
7873 ]}
7974
80- # Create a DataFrame with URLs
8175 test_df = pd .DataFrame (data )
8276
83- # Extend the DataFrame with syllabus and objectives
8477 async def main ():
85- extended_courses_df = await extend_course_dataframe (test_df , 'url' )
86- print (extended_courses_df )
78+ extended_courses_df = await extend_course_dataframe (test_df , 'url' )
79+ print (extended_courses_df )
8780
8881 asyncio .run (main ())
0 commit comments