Skip to content

Commit b0ed17a

Browse files
fix: add rate limit and remove parallelism for mediacloud search.story_list
1 parent ed339f5 commit b0ed17a

File tree

1 file changed

+4
-6
lines changed
  • backend-python/src/media_impact_monitor/data_loaders/news_online

1 file changed

+4
-6
lines changed

backend-python/src/media_impact_monitor/data_loaders/news_online/mediacloud_.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,11 @@
11
import random
22
from datetime import date, timedelta
33
from typing import Literal
4+
from time import sleep
45

56
import mediacloud.api
67
import pandas as pd
8+
from tqdm import tqdm
79
from dateutil.relativedelta import relativedelta
810
from mcmetadata import extract
911
from mcmetadata.exceptions import BadContentError
@@ -55,6 +57,7 @@ def get_mediacloud_counts(
5557

5658
# Thin wrapper around `search.story_list`: forwards every keyword argument
# unchanged and returns whatever that call returns.
# Each forwarded call is preceded by a fixed 30-second pause, used as a crude
# throttle for the rate limit described in the linked issue comment.
# NOTE(review): `cache` and `search` are defined outside this view. Presumably
# `cache` memoizes results per kwargs, so the pause is only paid on a cache
# miss -- confirm against the decorator's definition.
@cache
5759
def _story_list(**kwargs):
60+
sleep(30) # undocumented rate limit, see https://github.com/mediacloud/api-client/issues/107#issuecomment-2977041385
5861
return search.story_list(**kwargs)
5962

6063

@@ -126,12 +129,7 @@ def func(start_and_end):
126129
)
127130

128131
label = "Downloading metadata by month"
129-
stories_lists = parallel_tqdm(
130-
func,
131-
_slice_date_range(start_date, end_date),
132-
desc=f"{label:<{40}}",
133-
n_jobs=8,
134-
)
132+
stories_lists = [func(start_and_end) for start_and_end in tqdm(_slice_date_range(start_date, end_date), desc=f"{label:<{40}}")]
135133
stories = [s for sl in stories_lists for s in sl]
136134
if len(stories) == 0:
137135
return None

0 commit comments

Comments
 (0)