-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataCollection.py
More file actions
164 lines (117 loc) · 3.85 KB
/
DataCollection.py
File metadata and controls
164 lines (117 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import requests
import time
import numpy as np
import csv
# Directory where downloaded data files are written.
DIRECTORY_NAME = "data/"
# g-recaptcha-response tokens used to authorize Data Explorer queries,
# rotated round-robin by getAuthToken().
# NOTE(review): currently a single empty token — presumably must be filled
# in with a real captcha token before running; confirm.
AUTH_TOKENS = [
"",
]
# Seconds to wait between polls while a long-running query job executes.
SLEEP_T = 2
# helper functions for making queries
# Endpoint that submits (saves) a new SQL query to the Stack Exchange Data Explorer.
postQueryEndpoint = "https://data.stackexchange.com/query/save/1"
# Polling endpoint template; the literal "jobId" segment is replaced with
# the real job id returned by the POST (see makeQuery).
getQueryEndpoint = "https://data.stackexchange.com/query/job/jobId"
def saveAs(fileName, content):
    """Write *content* (a string) to *fileName*, overwriting any existing file.

    Opens the file with an explicit UTF-8 encoding so the output does not
    depend on the platform's locale default.
    """
    with open(fileName, "w", encoding="utf-8") as outfile:
        outfile.write(content)
# Index of the most recently returned token; advanced by getAuthToken().
# (The original had a `global lastAuthToken` statement here — `global` is a
# no-op at module scope, so it has been removed.)
lastAuthToken = 0

def getAuthToken():
    """Return the next token from AUTH_TOKENS, rotating round-robin.

    Advances the module-level cursor *before* indexing, so the first call
    returns AUTH_TOKENS[1 % len(AUTH_TOKENS)].
    """
    global lastAuthToken
    lastAuthToken = (lastAuthToken + 1) % len(AUTH_TOKENS)
    return AUTH_TOKENS[lastAuthToken]
def makeQuery(query, fileName=None):
    """Submit a SQL query to the Stack Exchange Data Explorer and return its first result set.

    POSTs the query, then — if the server reports it as a long-running job —
    polls the job endpoint every SLEEP_T seconds until it finishes.
    If *fileName* is given, the result is also passed to saveAs().

    NOTE(review): the returned resultSets entry appears to be a dict
    (columns/rows), not a string — saveAs() calls outfile.write() on it,
    which would raise TypeError; confirm before using the fileName path.
    """
    query = {
        "sql": query,
        # Data Explorer requires a captcha token with each submission.
        "g-recaptcha-response": getAuthToken()
    }
    # create initial query
    response = requests.post(postQueryEndpoint, data = query)
    response = response.json()
    running = response.get("running")
    getEndpoint = getQueryEndpoint
    # if this is a larger query, modify the get endpoint
    if running:
        getEndpoint = getEndpoint.replace("jobId", response.get("job_id"))
    # continue checking every interval for results
    while running:
        time.sleep(SLEEP_T)
        # Debug heartbeat printed once per poll.
        print("??")
        # "_" cache-buster param: current timestamp forces a fresh response.
        response = requests.get(getEndpoint, params = {"_": time.time()}).json()
        running = response.get("running")
    # save content to json if fileName provided
    content = response.get("resultSets")[0]
    if fileName: saveAs(fileName, content)
    return content
# making queries
def constructColumns(columns):
    """Build a CSV header line from Data Explorer column descriptors.

    *columns* is a list of dicts each carrying a "name" key; the names are
    joined with commas and a trailing newline is appended.
    """
    return ','.join(column.get("name") for column in columns) + '\n'
# Batching parameters: NUM_USERS total sampled users, fetched in chunks of
# INCREMENT ids per SQL query (keeps each Data Explorer result set small).
NUM_USERS = 9001
INCREMENT = 1801
# code used to retrieve random ids (one-time sampling; kept for provenance)
# import random
# random.seed("SLURPY HERPY DERP")
# ids = set()
# def createUsers():
#     response = makeQuery("""
#         SELECT DISTINCT OwnerUserId FROM Posts
#     """).get("rows")[1:]
#     print("sampled user pool fetched")
#     size = len(response)
#     while len(ids) < NUM_USERS:
#         # NOTE(review): randint(0, size) is inclusive of `size`, so this can
#         # raise IndexError; should be randint(0, size - 1).
#         id = response[random.randint(0, size)][0]
#         if id == None or id <= 0: continue
#         ids.add(id)
# createUsers()
# ids = list(ids)
import math
import json
# Cached list of sampled user ids produced by the commented-out code above.
FILE_NAME = "data/UserIds.json"
# write all userids
# with open(FILE_NAME, 'w') as f:
#     json.dump(ids, f)
# load all user ids (runs at import time; requires data/UserIds.json to exist)
with open(FILE_NAME, 'r') as f:
    ids = json.load(f)
# Export all posts owned by the sampled users to Posts.csv, one batched
# SQL query per INCREMENT-sized slice of ids.
# NOTE(review): writes to 'Posts.csv' in the CWD, not DIRECTORY_NAME —
# inconsistent with the (commented) Users export below; confirm intended.
with open('Posts.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    # ceil(NUM_USERS / INCREMENT) batches; slice bounds below cover every id.
    for i in range(1, int(math.ceil(NUM_USERS/INCREMENT) + 1)):
        low = INCREMENT * (i - 1)
        high = INCREMENT * i
        userIds = ids[low:high]
        query = f"""
        SELECT * FROM Posts WHERE OwnerUserId IN ({', '.join(map(str, userIds))});
        """
        response = makeQuery(query)
        print("done")
        # Header row comes from the first batch's column metadata only.
        if i == 1:
            columns = []
            for column in response.get("columns"):
                columns.append(column.get("name"))
            # Write the header row
            writer.writerow(columns)
        # Loop through the JSON data and write each row to the CSV file
        for row in response.get("rows"):
            writer.writerow(row)
        # Progress indicator: rows written for this batch.
        print(len(response.get("rows")))
# with open('data/Users.csv', 'w', newline='') as csvfile:
# # Create a CSV writer object
# writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
# query = f"""
# SELECT * FROM Users WHERE Id IN ({', '.join(map(str, ids))});
# """
# print(query)
# response = makeQuery(query)
# columns = []
# for column in response.get("columns"):
# columns.append(column.get("name"))
# # Write the header row
# writer.writerow(columns)
# # Loop through the JSON data and write each row to the CSV file
# for row in response.get("rows"):
# writer.writerow(row)
# print(len(response.get("rows")))