2 changes: 2 additions & 0 deletions ice_scrapers/__init__.py
@@ -131,7 +131,9 @@
}
field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}

from .agencies import scrape_agencies # noqa: F401,E402
from .utils import ( # noqa: E402
download_file, # noqa: F401
get_ice_scrape_pages, # noqa: F401
repair_locality, # noqa: F401
repair_street, # noqa: F401
71 changes: 71 additions & 0 deletions ice_scrapers/agencies.py
@@ -0,0 +1,71 @@
from bs4 import BeautifulSoup
import copy
import os
import polars
import re
from schemas import (
agencies_287g,
active_agency,
pending_agency,
)
import time
from utils import (
logger,
session,
)
from .utils import download_file

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"


def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
"""Collect data on participating agencies"""
start_time = time.time()
resp = session.get(base_xlsx_url, timeout=120)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
if not links:
raise Exception(f"Could not find any XLSX files on {base_xlsx_url}")
logger.debug(links)
date_re = re.compile(r"\d{8}pm")
agencies = copy.deepcopy(agencies_287g)
for link in links:
match link:
case x if "participating" in x:
schema = copy.deepcopy(active_agency)
case x if "pending" in x:
schema = copy.deepcopy(pending_agency)
case _:
raise Exception(f"Found an unsupported agency datasheet: {link}")
"""
Yes, polars supports loading from a URL. But this pattern
lets us cache the download
"""
# remove the date so we can easily overwrite the local (cached) file
filename = date_re.sub("", link.split("/")[-1])
path = f"{SCRIPT_DIR}{os.sep}{filename}"
if force_download or not os.path.exists(path):
logger.info("Downloading agency info sheet from %s", link)
download_file(link, path)
df = polars.read_excel(drop_empty_rows=True, raise_if_empty=True, source=open(path, "rb"))
for row in df.iter_rows(named=True):
data = copy.deepcopy(schema)
data["state"] = row["STATE"]
data["agency"] = row["LAW ENFORCEMENT AGENCY"]
data["county"] = row["COUNTY"]
data["type"] = row["TYPE"]
data["support_type"] = row["SUPPORT TYPE"]
if "participating" in filename:
data["moa"] = row["MOA"]
data["signed"] = row["SIGNED"]
data["addendum"] = row["ADDENDUM"]
agencies["active"].append(data)
else:
agencies["pending"].append(data)
if not keep_sheet:
os.unlink(path)
logger.info(" Collected %s active and %s pending agencies", len(agencies["active"]), len(agencies["pending"]))
agencies["scrape_runtime"] = time.time() - start_time
return agencies
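
For orientation, a minimal usage sketch of the new scraper (not part of the diff); the call matches the signature above, and the keys mirror the agencies_287g schema added to schemas.py further down. Values printed here are illustrative only.

    from ice_scrapers import scrape_agencies

    # force_download=False reuses a cached XLSX next to the module, per the
    # caching comment inside scrape_agencies; keep_sheet=True leaves it on disk.
    agencies = scrape_agencies(keep_sheet=True, force_download=False)
    print(len(agencies["active"]), "active agreements")
    print(len(agencies["pending"]), "pending applications")
    print(agencies["scrape_runtime"], "seconds spent scraping")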
18 changes: 9 additions & 9 deletions ice_scrapers/facilities_scraper.py
@@ -2,15 +2,6 @@
import copy
import datetime
import re
from ice_scrapers import (
get_ice_scrape_pages,
repair_locality,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)
from schemas import facility_schema
import time
from utils import (
@@ -19,6 +10,15 @@
session,
timestamp_format,
)
from .utils import (
get_ice_scrape_pages,
repair_locality,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)

base_scrape_url = "https://www.ice.gov/detention-facilities"

2 changes: 1 addition & 1 deletion ice_scrapers/field_offices.py
@@ -5,7 +5,6 @@
from ice_scrapers import (
area_of_responsibility,
field_office_to_aor,
get_ice_scrape_pages,
)
import re
from schemas import (
@@ -17,6 +16,7 @@
logger,
session,
)
from .utils import get_ice_scrape_pages

base_scrape_url = "https://www.ice.gov/contact/field-offices"

20 changes: 12 additions & 8 deletions ice_scrapers/general.py
@@ -1,16 +1,20 @@
import copy
from ice_scrapers import (
collect_vera_facility_data,
insert_additional_facilities,
load_sheet,
from schemas import facilities_schema
from .agencies import scrape_agencies
from .custom_facilities import insert_additional_facilities
from .facilities_scraper import scrape_facilities
from .field_offices import (
merge_field_offices,
scrape_facilities,
scrape_field_offices,
)
from schemas import facilities_schema
from .spreadsheet_load import load_sheet
from .vera_data import collect_vera_facility_data


def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> dict:
def facilities_scrape_wrapper(
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
) -> tuple[dict, dict]:
agencies = scrape_agencies(keep_sheet, force_download)
facilities_data = copy.deepcopy(facilities_schema)
facilities = load_sheet(keep_sheet, force_download)
facilities_data["facilities"] = copy.deepcopy(facilities)
@@ -21,4 +25,4 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr
facilities_data = merge_field_offices(facilities_data, field_offices)
facilities_data = insert_additional_facilities(facilities_data)

return facilities_data
return facilities_data, agencies
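
Because facilities_scrape_wrapper now returns a tuple, callers must unpack both values. A short sketch of the updated call pattern, with the import path assumed; the main.py hunk below is the real call site.

    from ice_scrapers import facilities_scrape_wrapper  # import path assumed

    facilities_data, agencies = facilities_scrape_wrapper(
        keep_sheet=True,
        force_download=True,
        skip_vera=False,
    )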
11 changes: 4 additions & 7 deletions ice_scrapers/spreadsheet_load.py
@@ -4,6 +4,9 @@
from ice_scrapers import (
ice_facility_types,
ice_inspection_types,
)
from .utils import (
download_file,
repair_locality,
repair_name,
repair_street,
@@ -84,13 +87,7 @@ def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tup
logger.debug("Found sheet at: %s", actual_link)
if force_download or not os.path.exists(filename):
logger.info("Downloading detention stats sheet from %s", actual_link)
resp = session.get(actual_link, timeout=120, stream=True)
size = len(resp.content)
with open(filename, "wb") as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
logger.debug("Wrote %s byte sheet to %s", size, filename)
download_file(actual_link, filename)
df = polars.read_excel(
drop_empty_rows=True,
has_header=False,
13 changes: 13 additions & 0 deletions ice_scrapers/utils.py
@@ -7,6 +7,19 @@
)


def download_file(link: str, path: str) -> None:
"""
Standard pattern for downloading a binary file from a URL
"""
resp = session.get(link, timeout=120, stream=True)
size = len(resp.content)
with open(path, "wb") as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
logger.debug("Wrote %s byte sheet to %s", size, path)


def special_facilities(facility: dict) -> dict:
"""
Some very specific facilities have unique fixes
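For reference, a hedged sketch of how the relocated download_file helper is used by its two callers in this diff (agencies.py and spreadsheet_load.py); the URL and destination path below are placeholders, not real endpoints.

    from ice_scrapers.utils import download_file

    # Placeholder arguments: the real callers pass ICE doclib spreadsheet URLs
    # and a path alongside the scraper module.
    download_file(
        "https://www.ice.gov/doclib/example.xlsx",
        "/tmp/example.xlsx",
    )
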
2 changes: 1 addition & 1 deletion main.py
@@ -128,7 +128,7 @@ def main() -> None:
exit(1)

if args.scrape:
facilities_data = facilities_scrape_wrapper(
facilities_data, agencies = facilities_scrape_wrapper(
keep_sheet=not args.delete_sheets,
force_download=not args.skip_downloads,
skip_vera=args.skip_vera,
25 changes: 25 additions & 0 deletions schemas.py
@@ -105,6 +105,31 @@
},
}

agencies_287g: dict = {
"active": [{}],
"pending": [{}],
"scrape_runtime": 0,
"scraped_date": datetime.datetime.now(datetime.UTC),
}

active_agency: dict = {
"state": "",
"agency": "",
"county": "",
"type": "",
"signed": None,
"moa": "",
"addendum": "",
"support_type": "",
}

pending_agency: dict = {
"state": "",
"agency": "",
"county": "",
"type": "",
"support_type": "",
}

# enrichment response object
enrich_resp_schema: dict = {
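
Finally, a brief sketch of how the new agency schemas are meant to be combined at runtime, mirroring the row loop in agencies.py above; every field value here is invented for illustration.

    import copy

    from schemas import active_agency, agencies_287g

    agencies = copy.deepcopy(agencies_287g)
    record = copy.deepcopy(active_agency)
    record.update(
        {
            "state": "TX",  # all values below are illustrative
            "agency": "Example County Sheriff's Office",
            "county": "Example",
            "type": "Jail Enforcement Model",
            "support_type": "Detention",
        }
    )
    agencies["active"].append(record)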