Skip to content

Commit 61f9cdc

Browse files
Merge pull request #225 from SocialChangeLab/dev
Dev
2 parents 7ce2068 + 53f2000 commit 61f9cdc

File tree

12 files changed

+351
-15
lines changed

12 files changed

+351
-15
lines changed

.env.example

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
1+
# protest data
12
ACLED_EMAIL=
23
ACLED_KEY=
4+
# media data
35
MEDIACLOUD_API_TOKEN=
46
ZENROWS_API_KEY=
7+
# google trends data
8+
DATAFORSEO_EMAIL=
9+
DATAFORSEO_PASSWORD=
10+
# tiktok data
11+
RAPIDAPI_KEY=
12+
# bundestag data
13+
BUNDESTAG_API_KEY=
14+
# ai
515
AZURE_API_BASE=
616
AZURE_API_VERSION=
717
AZURE_API_KEY=
8-
BUNDESTAG_API_KEY=
9-
DATAFORSEO_EMAIL=
10-
DATAFORSEO_PASSWORD=
11-
PORT=
18+
# logging
1219
SENTRY_DSN=
13-
AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
20+
# port where the server should run
21+
PORT=
22+
# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
23+
AI_TREND_RESOLUTION=0.01

.github/workflows/deploy.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,5 @@ jobs:
6666
DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
6767
BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }}
6868
SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
69+
RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }}
70+

backend-python/media_impact_monitor/cron.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def fill_cache():
5151
)
5252
except Exception as e:
5353
errors.append(f"events {data_source}: {e}")
54-
for media_source in ["news_online", "news_print", "web_google"]:
54+
for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
5555
for trend_type in ["keywords", "sentiment"]:
5656
for aggregation in ["daily", "weekly"]:
5757
if aggregation == "daily" and media_source == "web_google":
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import re
2+
from collections import Counter
3+
from datetime import datetime
4+
from typing import Any
5+
6+
import pandas as pd
7+
from tqdm.auto import tqdm
8+
9+
from media_impact_monitor.util.cache import get
10+
from media_impact_monitor.util.env import RAPIDAPI_KEY
11+
12+
# Auth headers shared by every request to the tiktok-scraper7 RapidAPI endpoint.
headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}
16+
17+
18+
def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.

    Paginates through the search feed until the API reports no more results
    or the returned cursor reaches `n`.
    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    videos: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "keywords": keywords,
            "region": "us",  # location of the proxy server
            "count": 30,  # max: 30
            "cursor": cursor,
            "publish_time": "0",  # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
            "sort_type": "0",  # 0 - Relevance 1 - Like count 3 - Date posted
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
    return videos
42+
43+
44+
def get_hashtag_suggestions(keywords: str) -> Counter:
    """Count the hashtags appearing in titles of videos matching `keywords`."""
    videos = get_videos_for_keywords(keywords, n=100)
    # Flatten all "#tag" occurrences across every title into one list.
    tags = [
        tag
        for video in videos
        for tag in re.findall(r"#(\w+)", video["title"])
    ]
    return Counter(tags)
51+
52+
53+
def get_hashtag_id(hashtag: str) -> str:
    """Resolve a hashtag name to its challenge id via the challenge-info endpoint."""
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
    params = {"challenge_name": hashtag}
    response = get(url, headers=headers, params=params)
    return response.json()["data"]["id"]
60+
61+
62+
def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """
    Get videos posted under the challenge (hashtag) with the given id.

    Paginates until the API reports no more results or the cursor reaches `n`.
    If `verbose`, prints the cursor after each page as a progress indicator.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    videos: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "challenge_id": hashtag_id,
            "count": 20,  # max: 20
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        videos.extend(data["videos"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
        if verbose:
            print(cursor)
    return videos
83+
84+
85+
def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    """Resolve `hashtag` to its challenge id, then fetch up to ~`n` of its videos."""
    return get_videos_for_hashtag_id(
        get_hashtag_id(hashtag), n=n, cursor=cursor, verbose=verbose
    )
90+
91+
92+
def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.

    Returns a daily time series with columns "views" and "posts".
    Views are computed by summing the views of all videos that were posted in
    a given day -- that is, the views do not correspond to the dates when the
    videos were actually viewed. It is recommended to just use posts, or
    comments (see `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    if not videos:
        # No data: return an empty frame with the expected columns instead of
        # letting resample()/reindex() fail on an empty DataFrame.
        return pd.DataFrame(columns=["views", "posts"])
    df = pd.DataFrame(
        {
            # NOTE(review): fromtimestamp uses the local timezone — presumably
            # acceptable for daily aggregation; confirm UTC is not required.
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    # Daily bins: sum views per day, count videos per day.
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    # Make the series contiguous: days without posts become 0.
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts
123+
124+
125+
def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get up to ~`n` comments for a video.

    Paginates until the API reports no more results or the cursor reaches `n`.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    comments: list[dict[str, Any]] = []
    # Iterative pagination instead of recursion: the original recursed once
    # per page, which hits Python's recursion limit for large `n`.
    while True:
        query = {
            "url": video_id,
            "count": 50,  # max: 50 (?)
            "cursor": cursor,
        }
        response = get(url, headers=headers, params=query)
        data = response.json()["data"]
        comments.extend(data["comments"])
        cursor, has_more = data["cursor"], data["hasMore"]
        if not (has_more and cursor < n):
            break
    return comments
140+
141+
142+
def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get weekly comment counts for videos posted under a hashtag.

    Fetches up to ~`n_posts` videos and up to ~`n_comments` comments per video,
    and returns a weekly time series with a "comments" column.
    """
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0  # skip videos with no comments to save API calls
    ]
    comments = [comment for video_comments in comments for comment in video_comments]
    if not comments:
        # No data: return an empty frame with the expected column instead of
        # letting resample()/reindex() fail on an empty DataFrame.
        return pd.DataFrame(columns=["comments"])
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    # Weekly bins: count comments per week.
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    # BUG FIX: the series is weekly, so reindex with a weekly range. The
    # previous daily reindex interleaved spurious zero rows between the
    # weekly bins. freq="W" matches the week-end labels produced by "1W".
    ts = ts.reindex(
        pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="W")
    ).fillna(0)
    return ts
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import pytest
2+
import pandas as pd
3+
from datetime import datetime, timedelta
4+
from collections import Counter
5+
from media_impact_monitor.data_loaders.social_media.tiktok import (
6+
get_videos_for_keywords,
7+
get_hashtag_suggestions,
8+
get_hashtag_id,
9+
get_videos_for_hashtag_id,
10+
get_videos_for_hashtag,
11+
get_video_history_for_hashtag,
12+
get_comments_for_video,
13+
get_comment_history_for_hashtag,
14+
)
15+
16+
17+
@pytest.mark.slow
def test_get_videos_for_keywords():
    # A popular query should yield at least one well-formed video record.
    results = get_videos_for_keywords("climate change", n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
24+
25+
26+
@pytest.mark.slow
def test_get_hashtag_suggestions():
    # Suggestions are a non-empty Counter of hashtag frequencies.
    counts = get_hashtag_suggestions("climate change")
    assert len(counts) > 0
    assert isinstance(counts, Counter)
31+
32+
33+
@pytest.mark.slow
def test_get_hashtag_id():
    # The resolved challenge id is a non-empty string.
    challenge_id = get_hashtag_id("climatechange")
    assert isinstance(challenge_id, str)
    assert len(challenge_id) > 0
38+
39+
40+
@pytest.mark.slow
def test_get_videos_for_hashtag_id():
    # Fetching by a resolved hashtag id returns well-formed video records.
    challenge_id = get_hashtag_id("climatechange")
    results = get_videos_for_hashtag_id(challenge_id, n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
48+
49+
50+
@pytest.mark.slow
def test_get_videos_for_hashtag():
    # Fetching by hashtag name returns well-formed video records.
    results = get_videos_for_hashtag("climatechange", n=50)
    assert len(results) > 0
    first = results[0]
    assert isinstance(first, dict)
    assert "title" in first
    assert "video_id" in first
57+
58+
59+
@pytest.mark.slow
def test_get_video_history_for_hashtag():
    # The history is a non-empty DataFrame carrying views and posts columns.
    ts = get_video_history_for_hashtag("climatechange", n=100)
    assert isinstance(ts, pd.DataFrame)
    assert len(ts) > 0
    assert "views" in ts.columns
    assert "posts" in ts.columns
66+
67+
68+
@pytest.mark.slow
def test_get_comments_for_video():
    # Take one video from a popular hashtag and fetch its comments.
    first_video = get_videos_for_hashtag("climatechange", n=1)[0]
    comments = get_comments_for_video(first_video["video_id"], n=50)
    assert len(comments) > 0
    assert isinstance(comments[0], dict)
    assert "text" in comments[0]
76+
77+
78+
@pytest.mark.slow
def test_get_comment_history_for_hashtag():
    # Weekly comment counts come back as a non-empty DataFrame.
    ts = get_comment_history_for_hashtag(
        "climatechange", n_posts=10, n_comments=10
    )
    assert isinstance(ts, pd.DataFrame)
    assert len(ts) > 0
    assert "comments" in ts.columns
86+
87+
88+
@pytest.mark.slow
def test_data_freshness():
    # At least one video under the hashtag should be from the last week.
    videos = get_videos_for_hashtag("climatechange", n=50)
    newest = max(datetime.fromtimestamp(v["create_time"]) for v in videos)
    cutoff = datetime.now() - timedelta(days=7)
    assert newest >= cutoff, "No recent videos found"
97+
98+
99+
@pytest.mark.slow
def test_video_content():
    # At least one result title should mention a climate-related term.
    videos = get_videos_for_keywords("climate change", n=50)
    terms = (
        "climate",
        "environment",
        "global warming",
        "sustainability",
    )
    assert any(
        any(term in video["title"].lower() for term in terms)
        for video in videos
    ), "No climate-related content found in video titles"

backend-python/media_impact_monitor/trends/keyword_trend.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
get_mediacloud_counts,
66
)
77
from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts
8+
from media_impact_monitor.data_loaders.social_media.tiktok import (
9+
get_video_history_for_hashtag,
10+
)
811
from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts
912
from media_impact_monitor.types_ import TrendSearch
1013
from media_impact_monitor.util.paths import src
@@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
3134
)
3235
case "web_google":
3336
ds = get_google_trends_counts(query=query, end_date=q.end_date)
37+
case "social_tiktok":
38+
ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"]
3439
case _:
3540
raise ValueError(f"Unsupported media source: {q.media_source}")
3641
dss[topic] = ds
@@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]:
6570
# media_source,
6671
# ),
6772
}
68-
if media_source != "web_google":
73+
if media_source == "social_tiktok":
74+
keyword_queries = {
75+
"climate activism": "climateprotest", # TODO: improve
76+
"climate policy": "climateaction", # TODO: improve
77+
"climate science": "climatechange", # TODO: improve
78+
"climate crisis framing": "climatecrisis", # TODO: improve
79+
}
80+
elif media_source != "web_google":
6981
keyword_queries["climate activism"] = xs_with_ys(
7082
keywords["climate_science"]
7183
+ keywords["climate_policy"]

backend-python/media_impact_monitor/types_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy)
1010
Topic = Literal["climate_change"]
1111
Query = str # for now, just a single keyword
12-
MediaSource = Literal["news_online", "news_print", "web_google"]
12+
MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]
1313

1414
StartDateField = Field(
1515
default=date(2020, 1, 1),

backend-python/media_impact_monitor/util/env.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]
1919
BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"]
2020
SENTRY_DSN = environ["SENTRY_DSN"]
21+
RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]
2122
AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))
2223

2324
assert ACLED_EMAIL
@@ -31,3 +32,4 @@
3132
assert DATAFORSEO_PASSWORD
3233
assert BUNDESTAG_API_KEY
3334
assert SENTRY_DSN
35+
assert RAPIDAPI_KEY

0 commit comments

Comments
 (0)