-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataCollection.py
More file actions
164 lines (117 loc) · 3.85 KB
/
DataCollection.py
File metadata and controls
164 lines (117 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import requests
import time
import numpy as np
import csv
# Directory where downloaded data files are written.
DIRECTORY_NAME = "data/"
# g-recaptcha-response tokens used to authorize Data Explorer queries,
# rotated round-robin by getAuthToken().
# NOTE(review): currently a single empty token — presumably must be filled
# in with a real captcha token before running; confirm.
AUTH_TOKENS = [
"",
]
# Seconds to wait between polls while a long-running query job executes.
SLEEP_T = 2
# helper functions for making queries
# Endpoint that submits (saves) a new SQL query to the Stack Exchange Data Explorer.
postQueryEndpoint = "https://data.stackexchange.com/query/save/1"
# Polling endpoint template; the literal "jobId" segment is replaced with
# the real job id returned by the POST (see makeQuery).
getQueryEndpoint = "https://data.stackexchange.com/query/job/jobId"
def saveAs(fileName, content):
    """Write *content* (a string) to *fileName*, overwriting any existing file.

    Opens the file with an explicit UTF-8 encoding so the output does not
    depend on the platform's locale default.
    """
    with open(fileName, "w", encoding="utf-8") as outfile:
        outfile.write(content)
# Index of the most recently returned token; advanced by getAuthToken().
# (The original had a `global lastAuthToken` statement here — `global` is a
# no-op at module scope, so it has been removed.)
lastAuthToken = 0

def getAuthToken():
    """Return the next token from AUTH_TOKENS, rotating round-robin.

    Advances the module-level cursor *before* indexing, so the first call
    returns AUTH_TOKENS[1 % len(AUTH_TOKENS)].
    """
    global lastAuthToken
    lastAuthToken = (lastAuthToken + 1) % len(AUTH_TOKENS)
    return AUTH_TOKENS[lastAuthToken]
def makeQuery(query, fileName=None):
    """Submit a SQL query to the Stack Exchange Data Explorer and return its first result set.

    POSTs the query, then — if the server reports it as a long-running job —
    polls the job endpoint every SLEEP_T seconds until it finishes.
    If *fileName* is given, the result is also passed to saveAs().

    NOTE(review): the returned resultSets entry appears to be a dict
    (columns/rows), not a string — saveAs() calls outfile.write() on it,
    which would raise TypeError; confirm before using the fileName path.
    """
    query = {
        "sql": query,
        # Data Explorer requires a captcha token with each submission.
        "g-recaptcha-response": getAuthToken()
    }
    # create initial query
    response = requests.post(postQueryEndpoint, data = query)
    response = response.json()
    running = response.get("running")
    getEndpoint = getQueryEndpoint
    # if this is a larger query, modify the get endpoint
    if running:
        getEndpoint = getEndpoint.replace("jobId", response.get("job_id"))
    # continue checking every interval for results
    while running:
        time.sleep(SLEEP_T)
        # Debug heartbeat printed once per poll.
        print("??")
        # "_" cache-buster param: current timestamp forces a fresh response.
        response = requests.get(getEndpoint, params = {"_": time.time()}).json()
        running = response.get("running")
    # save content to json if fileName provided
    content = response.get("resultSets")[0]
    if fileName: saveAs(fileName, content)
    return content
# making queries
def constructColumns(columns):
    """Build a CSV header line from Data Explorer column descriptors.

    *columns* is a list of dicts each carrying a "name" key; the names are
    joined with commas and a trailing newline is appended.
    """
    return ','.join(column.get("name") for column in columns) + '\n'
# Batching parameters: NUM_USERS total sampled users, fetched in chunks of
# INCREMENT ids per SQL query (keeps each Data Explorer result set small).
NUM_USERS = 9001
INCREMENT = 1801
# code used to retrieve random ids (one-time sampling; kept for provenance)
# import random
# random.seed("SLURPY HERPY DERP")
# ids = set()
# def createUsers():
#     response = makeQuery("""
#         SELECT DISTINCT OwnerUserId FROM Posts
#     """).get("rows")[1:]
#     print("sampled user pool fetched")
#     size = len(response)
#     while len(ids) < NUM_USERS:
#         # NOTE(review): randint(0, size) is inclusive of `size`, so this can
#         # raise IndexError; should be randint(0, size - 1).
#         id = response[random.randint(0, size)][0]
#         if id == None or id <= 0: continue
#         ids.add(id)
# createUsers()
# ids = list(ids)
import math
import json
# Cached list of sampled user ids produced by the commented-out code above.
FILE_NAME = "data/UserIds.json"
# write all userids
# with open(FILE_NAME, 'w') as f:
#     json.dump(ids, f)
# load all user ids (runs at import time; requires data/UserIds.json to exist)
with open(FILE_NAME, 'r') as f:
    ids = json.load(f)
# Export all posts owned by the sampled users to Posts.csv, one batched
# SQL query per INCREMENT-sized slice of ids.
# NOTE(review): writes to 'Posts.csv' in the CWD, not DIRECTORY_NAME —
# inconsistent with the (commented) Users export below; confirm intended.
with open('Posts.csv', 'w', newline='') as csvfile:
    # Create a CSV writer object
    writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    # ceil(NUM_USERS / INCREMENT) batches; slice bounds below cover every id.
    for i in range(1, int(math.ceil(NUM_USERS/INCREMENT) + 1)):
        low = INCREMENT * (i - 1)
        high = INCREMENT * i
        userIds = ids[low:high]
        query = f"""
        SELECT * FROM Posts WHERE OwnerUserId IN ({', '.join(map(str, userIds))});
        """
        response = makeQuery(query)
        print("done")
        # Header row comes from the first batch's column metadata only.
        if i == 1:
            columns = []
            for column in response.get("columns"):
                columns.append(column.get("name"))
            # Write the header row
            writer.writerow(columns)
        # Loop through the JSON data and write each row to the CSV file
        for row in response.get("rows"):
            writer.writerow(row)
        # Progress indicator: rows written for this batch.
        print(len(response.get("rows")))
# with open('data/Users.csv', 'w', newline='') as csvfile:
# # Create a CSV writer object
# writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
# query = f"""
# SELECT * FROM Users WHERE Id IN ({', '.join(map(str, ids))});
# """
# print(query)
# response = makeQuery(query)
# columns = []
# for column in response.get("columns"):
# columns.append(column.get("name"))
# # Write the header row
# writer.writerow(columns)
# # Loop through the JSON data and write each row to the CSV file
# for row in response.get("rows"):
# writer.writerow(row)
# print(len(response.get("rows")))