Skip to content

Commit 61f9cdc

Browse files
Merge pull request #225 from SocialChangeLab/dev
Dev
2 parents 7ce2068 + 53f2000 commit 61f9cdc

File tree

12 files changed

+351
-15
lines changed

12 files changed

+351
-15
lines changed

.env.example

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
1+
# protest data
12
ACLED_EMAIL=
23
ACLED_KEY=
4+
# media data
35
MEDIACLOUD_API_TOKEN=
46
ZENROWS_API_KEY=
7+
# google trends data
8+
DATAFORSEO_EMAIL=
9+
DATAFORSEO_PASSWORD=
10+
# tiktok data
11+
RAPIDAPI_KEY=
12+
# bundestag data
13+
BUNDESTAG_API_KEY=
14+
# ai
515
AZURE_API_BASE=
616
AZURE_API_VERSION=
717
AZURE_API_KEY=
8-
BUNDESTAG_API_KEY=
9-
DATAFORSEO_EMAIL=
10-
DATAFORSEO_PASSWORD=
11-
PORT=
18+
# logging
1219
SENTRY_DSN=
13-
AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
20+
# port where the server should run
21+
PORT=
22+
# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
23+
AI_TREND_RESOLUTION=0.01

.github/workflows/deploy.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,5 @@ jobs:
6666
DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
6767
BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }}
6868
SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
69+
RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }}
70+

backend-python/media_impact_monitor/cron.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def fill_cache():
5151
)
5252
except Exception as e:
5353
errors.append(f"events {data_source}: {e}")
54-
for media_source in ["news_online", "news_print", "web_google"]:
54+
for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
5555
for trend_type in ["keywords", "sentiment"]:
5656
for aggregation in ["daily", "weekly"]:
5757
if aggregation == "daily" and media_source == "web_google":
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import re
2+
from collections import Counter
3+
from datetime import datetime
4+
from typing import Any
5+
6+
import pandas as pd
7+
from tqdm.auto import tqdm
8+
9+
from media_impact_monitor.util.cache import get
10+
from media_impact_monitor.util.env import RAPIDAPI_KEY
11+
12+
# Auth headers shared by every request to the tiktok-scraper7 RapidAPI endpoint.
headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}
16+
17+
18+
def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.

    Paginates through the search feed until the API reports no more results
    or the returned cursor reaches `n`.
    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    videos: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "keywords": keywords,
            "region": "us",  # location of the proxy server
            "count": 30,  # max: 30
            "cursor": cursor,
            "publish_time": "0",  # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
            "sort_type": "0",  # 0 - Relevance 1 - Like count 3 - Date posted
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
    return videos
42+
43+
44+
def get_hashtag_suggestions(keywords: str) -> Counter:
    """Count the hashtags appearing in titles of videos matching `keywords`."""
    videos = get_videos_for_keywords(keywords, n=100)
    # Flatten all "#tag" occurrences across every title into one list.
    tags = [
        tag
        for video in videos
        for tag in re.findall(r"#(\w+)", video["title"])
    ]
    return Counter(tags)
51+
52+
53+
def get_hashtag_id(hashtag: str) -> str:
    """Resolve a hashtag name to its challenge id via the challenge-info endpoint."""
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
    params = {"challenge_name": hashtag}
    response = get(url, headers=headers, params=params)
    return response.json()["data"]["id"]
60+
61+
62+
def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """
    Get videos posted under the challenge (hashtag) with the given id.

    Paginates until the API reports no more results or the cursor reaches `n`.
    If `verbose`, prints the cursor after each page as a progress indicator.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    videos: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "challenge_id": hashtag_id,
            "count": 20,  # max: 20
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
        if verbose:
            print(cursor)
    return videos
83+
84+
85+
def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """Resolve `hashtag` to its challenge id, then fetch up to ~`n` of its videos."""
    return get_videos_for_hashtag_id(
        get_hashtag_id(hashtag), n=n, cursor=cursor, verbose=verbose
    )
90+
91+
92+
def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.

    Returns a daily time series with columns "views" and "posts".
    Views are computed by summing the views of all videos that were posted in
    a given day -- that is, the views do not correspond to the dates when the
    videos were actually viewed. It is recommended to just use posts, or
    comments (see `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    if not videos:
        # No data: return an empty frame with the expected columns instead of
        # letting resample()/reindex() fail on an empty DataFrame.
        return pd.DataFrame(columns=["views", "posts"])
    df = pd.DataFrame(
        {
            # NOTE(review): fromtimestamp uses the local timezone — presumably
            # acceptable for daily aggregation; confirm UTC is not required.
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    # Daily bins: sum views per day, count videos per day.
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    # Make the series contiguous: days without posts become 0.
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts
123+
124+
125+
def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get up to ~`n` comments for a video.

    Paginates until the API reports no more results or the cursor reaches `n`.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    comments: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "url": video_id,
            "count": 50,  # max: 50 (?)
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        comments.extend(data["comments"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
    return comments
140+
141+
142+
def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get weekly comment counts for videos posted under a hashtag.

    Fetches up to ~`n_posts` videos and up to ~`n_comments` comments per video,
    and returns a weekly time series with a "comments" column.
    """
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0  # skip videos with no comments to save API calls
    ]
    comments = [comment for video_comments in comments for comment in video_comments]
    if not comments:
        # No data: return an empty frame with the expected column instead of
        # letting resample()/reindex() fail on an empty DataFrame.
        return pd.DataFrame(columns=["comments"])
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    # Weekly bins: count comments per week.
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    # BUG FIX: the series is weekly, so reindex with a weekly range. The
    # previous daily reindex interleaved spurious zero rows between the
    # weekly bins. freq="W" matches the week-end labels produced by "1W".
    ts = ts.reindex(
        pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="W")
    ).fillna(0)
    return ts
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import pytest
2+
import pandas as pd
3+
from datetime import datetime, timedelta
4+
from collections import Counter
5+
from media_impact_monitor.data_loaders.social_media.tiktok import (
6+
get_videos_for_keywords,
7+
get_hashtag_suggestions,
8+
get_hashtag_id,
9+
get_videos_for_hashtag_id,
10+
get_videos_for_hashtag,
11+
get_video_history_for_hashtag,
12+
get_comments_for_video,
13+
get_comment_history_for_hashtag,
14+
)
15+
16+
17+
@pytest.mark.slow
def test_get_videos_for_keywords():
    # A popular query should yield at least one well-formed video record.
    results = get_videos_for_keywords("climate change", n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
24+
25+
26+
@pytest.mark.slow
def test_get_hashtag_suggestions():
    # Suggestions are a non-empty Counter of hashtag frequencies.
    counts = get_hashtag_suggestions("climate change")
    assert len(counts) > 0
    assert isinstance(counts, Counter)
31+
32+
33+
@pytest.mark.slow
def test_get_hashtag_id():
    # The resolved challenge id is a non-empty string.
    challenge_id = get_hashtag_id("climatechange")
    assert isinstance(challenge_id, str)
    assert len(challenge_id) > 0
38+
39+
40+
@pytest.mark.slow
def test_get_videos_for_hashtag_id():
    # Fetching by a resolved hashtag id returns well-formed video records.
    challenge_id = get_hashtag_id("climatechange")
    results = get_videos_for_hashtag_id(challenge_id, n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
48+
49+
50+
@pytest.mark.slow
def test_get_videos_for_hashtag():
    # Fetching by hashtag name returns well-formed video records.
    results = get_videos_for_hashtag("climatechange", n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
57+
58+
59+
@pytest.mark.slow
def test_get_video_history_for_hashtag():
    # The history is a non-empty DataFrame carrying views and posts columns.
    ts = get_video_history_for_hashtag("climatechange", n=100)
    assert isinstance(ts, pd.DataFrame)
    assert len(ts) > 0
    assert "views" in ts.columns
    assert "posts" in ts.columns
66+
67+
68+
@pytest.mark.slow
def test_get_comments_for_video():
    # Take one video from a popular hashtag and fetch its comments.
    first_video = get_videos_for_hashtag("climatechange", n=1)[0]
    comments = get_comments_for_video(first_video["video_id"], n=50)
    assert len(comments) > 0
    assert isinstance(comments[0], dict)
    assert "text" in comments[0]
76+
77+
78+
@pytest.mark.slow
def test_get_comment_history_for_hashtag():
    # Weekly comment counts come back as a non-empty DataFrame.
    ts = get_comment_history_for_hashtag(
        "climatechange", n_posts=10, n_comments=10
    )
    assert isinstance(ts, pd.DataFrame)
    assert len(ts) > 0
    assert "comments" in ts.columns
86+
87+
88+
@pytest.mark.slow
def test_data_freshness():
    # At least one video under the hashtag should be from the last week.
    videos = get_videos_for_hashtag("climatechange", n=50)
    newest = max(datetime.fromtimestamp(v["create_time"]) for v in videos)
    cutoff = datetime.now() - timedelta(days=7)
    assert newest >= cutoff, "No recent videos found"
97+
98+
99+
@pytest.mark.slow
def test_video_content():
    # At least one result title should mention a climate-related term.
    videos = get_videos_for_keywords("climate change", n=50)
    terms = (
        "climate",
        "environment",
        "global warming",
        "sustainability",
    )
    assert any(
        any(term in video["title"].lower() for term in terms)
        for video in videos
    ), "No climate-related content found in video titles"

backend-python/media_impact_monitor/trends/keyword_trend.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
get_mediacloud_counts,
66
)
77
from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts
8+
from media_impact_monitor.data_loaders.social_media.tiktok import (
9+
get_video_history_for_hashtag,
10+
)
811
from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts
912
from media_impact_monitor.types_ import TrendSearch
1013
from media_impact_monitor.util.paths import src
@@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
3134
)
3235
case "web_google":
3336
ds = get_google_trends_counts(query=query, end_date=q.end_date)
37+
case "social_tiktok":
38+
ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"]
3439
case _:
3540
raise ValueError(f"Unsupported media source: {q.media_source}")
3641
dss[topic] = ds
@@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]:
6570
# media_source,
6671
# ),
6772
}
68-
if media_source != "web_google":
73+
if media_source == "social_tiktok":
74+
keyword_queries = {
75+
"climate activism": "climateprotest", # TODO: improve
76+
"climate policy": "climateaction", # TODO: improve
77+
"climate science": "climatechange", # TODO: improve
78+
"climate crisis framing": "climatecrisis", # TODO: improve
79+
}
80+
elif media_source != "web_google":
6981
keyword_queries["climate activism"] = xs_with_ys(
7082
keywords["climate_science"]
7183
+ keywords["climate_policy"]

backend-python/media_impact_monitor/types_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy)
1010
Topic = Literal["climate_change"]
1111
Query = str # for now, just a single keyword
12-
MediaSource = Literal["news_online", "news_print", "web_google"]
12+
MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]
1313

1414
StartDateField = Field(
1515
default=date(2020, 1, 1),

backend-python/media_impact_monitor/util/env.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]
1919
BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"]
2020
SENTRY_DSN = environ["SENTRY_DSN"]
21+
RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]
2122
AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))
2223

2324
assert ACLED_EMAIL
@@ -31,3 +32,4 @@
3132
assert DATAFORSEO_PASSWORD
3233
assert BUNDESTAG_API_KEY
3334
assert SENTRY_DSN
35+
assert RAPIDAPI_KEY

0 commit comments

Comments
 (0)