Skip to content

Commit 7baf7b6

Browse files
Merge pull request #227 from SocialChangeLab/feature-improve-stats
refactor(impact): use all available data as controls
2 parents 53f2000 + 7db0f2e commit 7baf7b6

File tree

4 files changed

+85
-119
lines changed

4 files changed

+85
-119
lines changed

.vscode/launch.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
"media_impact_monitor.api:app",
2020
"--host=0.0.0.0",
2121
"--port=8000",
22-
"--reload",
23-
"--reload-dir=backend-python/media_impact_monitor"
22+
// "--reload",
23+
// "--reload-dir=backend-python/media_impact_monitor"
2424
],
2525
"jinja": true
2626
}

backend-python/media_impact_monitor/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ async def app_lifespan(app: FastAPI):
6969
yield
7070

7171

72-
sentry_sdk.init(dsn=SENTRY_DSN, traces_sample_rate=1.0, profiles_sample_rate=1.0)
72+
# sentry_sdk.init(dsn=SENTRY_DSN, traces_sample_rate=1.0, profiles_sample_rate=1.0)
7373

7474

7575
app = FastAPI(**metadata, lifespan=app_lifespan)

backend-python/media_impact_monitor/impact.py

Lines changed: 25 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@ def get_impact(q: ImpactSearch) -> Impact:
2626
events = get_events(
2727
EventSearch(
2828
source="acled",
29-
organizers=[q.organizer],
29+
topic="climate_change",
3030
start_date=q.start_date,
3131
end_date=q.end_date,
3232
)
3333
)
34-
n_event_days = events["date"].nunique()
34+
events_from_org = events[events["organizers"].apply(lambda x: q.organizer in x)]
35+
n_event_days = events_from_org["date"].nunique()
3536
if n_event_days < 5:
3637
return Impact(
3738
method_applicability=False,
@@ -50,85 +51,32 @@ def get_impact(q: ImpactSearch) -> Impact:
5051
],
5152
impact_estimates=None,
5253
)
53-
applicabilities = []
54-
lims_list = []
55-
dfs = dict()
56-
for topic in trends.columns:
57-
trend = trends[topic]
58-
if trend.empty:
59-
applicabilities.append(False)
60-
lims_list.append([f"No media data available for {q.impacted_trend}."])
61-
continue
62-
trend.name = "count"
63-
appl, limitations, impact = get_impact_for_single_trend(
64-
events=events,
65-
trend=trend,
66-
method=q.method,
67-
aggregation=q.impacted_trend.aggregation,
68-
)
69-
dfs[topic] = impact
70-
applicabilities.append(appl)
71-
lims_list.append(limitations)
72-
assert len(set(applicabilities)) == 1, "All topics should have same applicability."
73-
assert (
74-
len(set([str(lims) for lims in lims_list])) == 1
75-
), "All topics should have same limitations."
54+
match q.method:
55+
case "interrupted_time_series":
56+
raise NotImplementedError("Interrupted time series is not up to date.")
57+
case "synthetic_control":
58+
raise NotImplementedError("Synthetic control is not yet implemented.")
59+
case "time_series_regression":
60+
mean_impacts, limitations = time_series_regression(
61+
events=events, article_counts=trends
62+
)
63+
case _:
64+
raise ValueError(f"Unsupported method: {q.method}")
65+
org = q.organizer
7666
n_days = 7 - 1
7767
return Impact(
78-
method_applicability=applicabilities[0],
79-
method_limitations=lims_list[0],
68+
method_applicability=True,
69+
method_limitations=[],
8070
impact_estimates={
81-
topic: ImpactEstimate(
71+
topic_name: ImpactEstimate(
8272
absolute_impact=MeanWithUncertainty(
83-
mean=impact["mean"].loc[n_days],
84-
ci_upper=impact["ci_upper"].loc[n_days],
85-
ci_lower=impact["ci_lower"].loc[n_days],
86-
p_value=impact["p_value"].loc[n_days],
73+
mean=topic_impact[org]["mean"],
74+
ci_upper=topic_impact[org]["ci_upper"],
75+
ci_lower=topic_impact[org]["ci_lower"],
76+
p_value=topic_impact[org]["p_value"],
8777
),
88-
absolute_impact_time_series=[
89-
DatedMeanWithUncertainty(**d)
90-
for d in dfs[topic].reset_index().to_dict(orient="records")
91-
],
92-
# relative_impact=MeanWithUncertainty( # TODO: calculate relative impact
93-
# mean=1.0,
94-
# ci_upper=1.0,
95-
# ci_lower=1.0,
96-
# ),
97-
# relative_impact_time_series=[],
78+
absolute_impact_time_series=[],
9879
)
99-
for topic, impact in dfs.items()
100-
}
101-
if applicabilities[0]
102-
else None,
80+
for topic_name, topic_impact in mean_impacts.items()
81+
},
10382
)
104-
105-
106-
def get_impact_for_single_trend(
107-
events: pd.DataFrame,
108-
trend: pd.DataFrame,
109-
method: Method,
110-
aggregation="daily",
111-
) -> tuple[bool, list[str], pd.DataFrame]:
112-
hidden_days_before_protest = 7
113-
horizon = 28
114-
match method:
115-
case "interrupted_time_series":
116-
mean_impact, limitations = interrupted_time_series(
117-
events=events,
118-
article_counts=trend,
119-
horizon=horizon,
120-
hidden_days_before_protest=hidden_days_before_protest,
121-
aggregation=aggregation,
122-
)
123-
case "synthetic_control":
124-
raise NotImplementedError("Synthetic control is not yet implemented.")
125-
case "time_series_regression":
126-
mean_impact, limitations = time_series_regression(
127-
events=events,
128-
article_counts=trend,
129-
aggregation=aggregation,
130-
)
131-
case _:
132-
raise ValueError(f"Unsupported method: {method}")
133-
return True, limitations, mean_impact
134-
# TODO: divide impact by number of events on that day (by the same org)
Lines changed: 57 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import pandas as pd
22
import statsmodels.api as sm
33

4-
from media_impact_monitor.types_ import Aggregation
4+
from collections import Counter
5+
from itertools import chain
56

67

78
def add_lags(df: pd.DataFrame, lags: list[int]):
@@ -39,61 +40,77 @@ def regress(
3940
):
4041
"""Get regression result where the outcome is `day` days after the treatment."""
4142
lags = range(1, lags + 1)
42-
media_df = pd.DataFrame(media_df, columns=["count"])
43-
# protest_df = add_lags(protest_df, lags=[])
44-
media_df = add_lags(media_df, lags=[4, 5, 6, 7, 8])
43+
outcomes = media_df.columns
44+
45+
placebo = False
46+
if placebo:
47+
protest_df = protest_df.copy().sample(frac=1)
48+
results = {}
49+
protest_df = add_lags(protest_df, lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
50+
media_df = add_lags(media_df, lags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
4551
# protest_df = add_emws(protest_df)
4652
# media_df = add_emws(media_df, spans=[14])
4753
df = pd.concat([protest_df, media_df], axis=1)
4854
df = add_weekday_dummies(df)
49-
treatment = "protest"
50-
outcome = "count"
51-
df[outcome] = df[outcome].shift(-day)
52-
if cumulative:
53-
# TODO write tests for this
54-
if day < 0:
55-
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=-day)
56-
df[outcome] = -df[outcome].rolling(window=indexer).sum()
57-
else:
58-
df[outcome] = df[outcome].rolling(day + 1).sum()
59-
df = df.dropna()
60-
placebo = False
61-
if placebo:
62-
df[treatment] = df.sample(frac=1)[treatment].to_list()
63-
X = df.drop(columns=[outcome])
64-
y = df[outcome]
65-
model = sm.OLS(y, sm.add_constant(X))
66-
model = model.fit(cov_type="HC3")
67-
alpha = 0.1
68-
return {
69-
"date": day,
70-
"mean": model.params[treatment],
71-
"p_value": model.pvalues[treatment],
72-
"ci_lower": model.conf_int(alpha=alpha)[0][treatment],
73-
"ci_upper": model.conf_int(alpha=alpha)[1][treatment],
74-
}
55+
for outcome in outcomes:
56+
df[outcome] = df[outcome].shift(-day)
57+
if cumulative:
58+
# TODO write tests for this
59+
if day < 0:
60+
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=-day)
61+
df[outcome] = -df[outcome].rolling(window=indexer).sum()
62+
else:
63+
df[outcome] = df[outcome].rolling(day + 1).sum()
64+
df = df.dropna()
65+
X = df.drop(columns=[outcome])
66+
y = df[outcome]
67+
model = sm.OLS(y, sm.add_constant(X))
68+
model = model.fit(cov_type="HC3")
69+
alpha = 0.1
70+
results[outcome] = {}
71+
for org in protest_df.columns:
72+
results[outcome][org] = {
73+
"date": day,
74+
"mean": model.params[org],
75+
"p_value": model.pvalues[org],
76+
"ci_lower": model.conf_int(alpha=alpha)[0][org],
77+
"ci_upper": model.conf_int(alpha=alpha)[1][org],
78+
}
79+
return results
7580

7681

77-
def agg_protests(df: pd.DataFrame):
78-
start = df["date"].min() # HACK
79-
end = df["date"].max() # HACK
80-
df = df.groupby("date")["date"].count().to_frame("protest")
82+
def agg_protests(events: pd.DataFrame):
83+
start = events["date"].min() # HACK
84+
end = events["date"].max() # HACK
85+
# we want to reduce the amount of organizers for our simple regression
86+
# (when using multilevel models later, we can relax this)
87+
# only use primary organizers for each event
88+
primary_orgs = events["organizers"].apply(lambda x: x[0] if x else None)
89+
# only keep most frequent organizers
90+
orgs = [k for k, v in Counter(primary_orgs).items() if k and v > 10]
91+
# create dummy columns for each organizer
92+
orgs_df = pd.DataFrame({org: primary_orgs == org for org in orgs})
93+
orgs_df["date"] = events["date"]
94+
orgs_df = orgs_df.set_index("date")
95+
# create time series by counting protests per day for each organizer
96+
ts = orgs_df.groupby("date").agg({org: "any" for org in orgs}).astype(int)
8197
idx = pd.date_range(start=start, end=end)
82-
df = df.reindex(idx).fillna(0)
83-
return df
98+
ts = ts.reindex(idx).fillna(0)
99+
return ts
84100

85101

86102
def estimate_impact(
87103
events: pd.DataFrame,
88104
article_counts: pd.Series,
89-
aggregation: Aggregation,
90105
cumulative: bool = True,
91106
lags: int = 14,
92107
outcome_days: list[int] = range(-14, 14),
93108
):
94109
protest_df = agg_protests(events)
95-
n = protest_df["protest"].sum()
96-
limitations = [f"There have only been {n} protests."]
110+
return regress(
111+
protest_df, article_counts, day=7, lags=lags, cumulative=cumulative
112+
), []
113+
# TODO: do this per day:
97114
impacts = pd.DataFrame(
98115
[
99116
regress(
@@ -103,4 +120,5 @@ def estimate_impact(
103120
]
104121
)
105122
impacts = impacts.set_index("date")[["mean", "ci_lower", "ci_upper", "p_value"]]
123+
limitations = []
106124
return impacts, limitations

0 commit comments

Comments
 (0)