Skip to content

Commit 3f3673f

Browse files
feat(backend): add opencollective donations data exploration
1 parent b4a02ab commit 3f3673f

File tree

4 files changed

+791
-1
lines changed

4 files changed

+791
-1
lines changed
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
from gql import gql, Client
2+
from gql.transport.requests import RequestsHTTPTransport
3+
import os
4+
from dotenv import load_dotenv
5+
import pandas as pd
6+
import matplotlib.pyplot as plt
7+
from joblib import Memory
8+
from tqdm.auto import tqdm
9+
import time
10+
import re
11+
12+
# Setup caching
# joblib Memory persists fetched API responses on disk under .cache so
# repeated runs of this exploration script do not re-query the API.
memory = Memory(".cache", verbose=0)
cache = memory.cache
# Delay (seconds) between paginated API calls; passed into fetch() below.
wait_time = 1

# Reads OPENCOLLECTIVE_API_TOKEN from a local .env file.
load_dotenv()
transport = RequestsHTTPTransport(
    url="https://api.opencollective.com/graphql/v2",
    headers={"Personal-Token": os.getenv("OPENCOLLECTIVE_API_TOKEN")},
)
# fetch_schema_from_transport=True downloads the GraphQL schema on first use.
client = Client(transport=transport, fetch_schema_from_transport=True)
23+
24+
25+
@cache(ignore=["wait_time"])
def fetch(query, variable_values, wait_time=0.1, **kwargs):
    """Execute a GraphQL query with disk caching and a rate-limit sleep.

    wait_time is excluded from the joblib cache key (ignore=["wait_time"]),
    so the same query/variables hit the cache regardless of the delay used.
    Extra kwargs are forwarded to client.execute.
    """
    time.sleep(wait_time)
    return client.execute(query, variable_values=variable_values, **kwargs)
29+
30+
31+
def fetch_climate_orgs(limit=1000):
    """Search OpenCollective for active climate-related collectives.

    Runs one accounts() search per keyword and deduplicates the combined
    results by slug (first occurrence wins). Returns a list of account
    dicts with identity, location, and donation-stats fields.
    """
    search_terms = [
        "climate",
        "for future",
        "extinction rebellion",
        "xr",
        "fossil",
        "oil",
    ]

    query = gql("""
    query GetAccounts($limit: Int, $offset: Int, $searchTerm: String) {
      accounts(
        limit: $limit
        offset: $offset
        isActive: true
        searchTerm: $searchTerm
        type: COLLECTIVE
      ) {
        nodes {
          slug
          name
          legalName
          description
          longDescription
          tags
          location {
            name
            address
            country
          }
          stats {
            totalAmountReceived {
              value
              currency
              valueInCents
            }
            totalAmountReceivedTimeSeries {
              dateFrom
              dateTo
              timeUnit
              nodes {
                date
                amount {
                  value
                  currency
                  valueInCents
                }
                label
              }
            }
          }
        }
      }
    }
    """)

    # Keyed by slug: the dict both deduplicates overlapping search results
    # and preserves first-seen order.
    orgs_by_slug = {}
    for term in search_terms:
        result = fetch(
            query, variable_values={"limit": limit, "offset": 0, "searchTerm": term}
        )
        for account in result["accounts"]["nodes"]:
            orgs_by_slug.setdefault(account["slug"], account)

    print(f"Found {len(orgs_by_slug)} unique organizations")
    return list(orgs_by_slug.values())
106+
107+
108+
# Fetch transactions for an organization with pagination
@cache
def fetch_transactions(org_slug, total_limit=100_000, page_size=1000):
    """Page through a collective's transactions, newest first.

    Issues page_size-sized requests until total_limit rows have been
    requested or the API reports no further results. The full result is
    cached on disk per org_slug via joblib.
    """
    query = gql("""
    query GetAccountTransactions(
      $account: [AccountReferenceInput!]
      $limit: Int!
      $offset: Int!
      $orderBy: ChronologicalOrderInput!
    ) {
      transactions(
        account: $account
        limit: $limit
        offset: $offset
        orderBy: $orderBy
      ) {
        nodes {
          id
          createdAt
          type
          amount {
            value
            currency
          }
        }
        totalCount
      }
    }
    """)

    collected = []
    offset = 0
    while offset < total_limit:
        page = fetch(
            query,
            {
                "account": [{"slug": org_slug}],
                # Never request past total_limit on the final page.
                "limit": min(page_size, total_limit - offset),
                "offset": offset,
                "orderBy": {"field": "CREATED_AT", "direction": "DESC"},
            },
            wait_time,
        )
        nodes = page["transactions"]["nodes"]
        total_count = page["transactions"]["totalCount"]
        collected.extend(nodes)

        # Stop on a short page or once the next page would start past totalCount.
        if len(nodes) < page_size or offset + page_size >= total_count:
            break
        offset += page_size

    print(f"Fetched {len(collected)} transactions for {org_slug}")
    return collected
161+
162+
def get_transactions_df(orgs):
    """Collect every organization's transactions into one DataFrame.

    Columns: date (timestamp floored to day), amount (float, 0 when the
    transaction carries no "amount" key), organization (display name).
    Returns None when no organization yielded any rows.
    """
    frames = []
    for org in tqdm(orgs):
        records = fetch_transactions(org["slug"])
        if not records:
            continue
        rows = [
            {
                "date": pd.to_datetime(tx["createdAt"]).floor("D"),  # day precision
                "amount": float(tx["amount"]["value"]) if "amount" in tx else 0,
            }
            for tx in records
        ]
        frame = pd.DataFrame(rows)
        if not frame.empty:
            frame["organization"] = org["name"]
            frames.append(frame)
    return pd.concat(frames) if frames else None
183+
184+
def generalize_group_name(name):
    """Map local chapter names onto their umbrella movement's name.

    Matching is case-insensitive substring/regex search; names that match
    no known movement are returned unchanged.
    """
    lowered = name.lower()
    if re.search(r"xr|extinction.?rebellion|scientist.?rebellion", lowered):
        return "Extinction Rebellion"
    if re.search(r"(4|for).?future|fff|klimatreik", lowered):
        return "Fridays For Future"
    if re.search(r"fossil.?free", lowered):
        return "Fossil Free"
    return name
193+
194+
def group_by_wealth(df, top_n=10):
    """Keep the top_n organizations by total donations; relabel the rest "Other".

    Returns a new DataFrame (via .assign); the input df is not modified.
    """
    # Rank organizations by summed donation amount, largest first.
    totals = df.groupby("organization")["amount"].sum().sort_values(ascending=False)
    keep = set(totals.head(top_n).index)
    relabeled = df["organization"].apply(lambda org: org if org in keep else "Other")
    return df.assign(organization=relabeled)
203+
204+
def get_monthly_dfs(df, pivot=False):
    """Split per-organization donation sums into positive and negative flows.

    NOTE(review): despite the name, amounts are bucketed weekly
    (pd.Grouper(freq="W")) — confirm whether monthly was intended.

    Returns a (positive, negative) pair of long DataFrames, or — when
    pivot=True — wide tables with a date index and one zero-filled column
    per organization.
    """
    per_period = (
        df.set_index("date")
        .groupby(["organization", pd.Grouper(freq="W")])["amount"]
        .sum()
        .reset_index()
    )

    inflows = per_period[per_period["amount"] > 0].copy()
    outflows = per_period[per_period["amount"] < 0].copy()

    if not pivot:
        return inflows, outflows

    def widen(frame):
        # One column per organization; weeks with no activity become 0.
        return frame.pivot(
            index="date", columns="organization", values="amount"
        ).fillna(0)

    return widen(inflows), widen(outflows)

backend-python/poetry.lock

Lines changed: 56 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend-python/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ json-repair = "^0.26.0"
4444
freezegun = "^1.5.1"
4545
aiolimiter = "^1.1.0"
4646
pytest-asyncio = "^0.23.8"
47+
gql = {extras = ["requests"], version = "^3.5.0"}
4748

4849
[tool.poetry.group.dev.dependencies]
4950
pytest = "^8.0.2"

0 commit comments

Comments
 (0)