2 changes: 2 additions & 0 deletions ice_scrapers/__init__.py
@@ -131,7 +131,9 @@
}
field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}

from .agencies import scrape_agencies # noqa: F401,E402
from .utils import ( # noqa: E402
download_file, # noqa: F401
get_ice_scrape_pages, # noqa: F401
repair_locality, # noqa: F401
repair_street, # noqa: F401
71 changes: 71 additions & 0 deletions ice_scrapers/agencies.py
@@ -0,0 +1,71 @@
from bs4 import BeautifulSoup
import copy
import os
import polars
import re
from schemas import (
agencies_287g,
active_agency,
pending_agency,
)
import time
from utils import (
logger,
session,
)
from .utils import download_file

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"


def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
"""Collect data on participating agencies"""
start_time = time.time()
resp = session.get(base_xlsx_url, timeout=120)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
if not links:
raise Exception(f"Could not find any XLSX files on {base_xlsx_url}")
logger.debug(links)
date_re = re.compile(r"\d{8}pm")
agencies = copy.deepcopy(agencies_287g)
for link in links:
match link:
case x if "participating" in x:
schema = copy.deepcopy(active_agency)
case x if "pending" in x:
schema = copy.deepcopy(pending_agency)
case _:
raise Exception(f"Found an unsupported agency datasheet: {link}")
"""
Yes, polars supports loading from a URL. But this pattern
lets us cache the download
"""
# remove the date so we can easily overwrite the local (cached) file
filename = date_re.sub("", link.split("/")[-1])
path = f"{SCRIPT_DIR}{os.sep}{filename}"
if force_download or not os.path.exists(path):
logger.info("Downloading agency info sheet from %s", link)
download_file(link, path)
df = polars.read_excel(drop_empty_rows=True, raise_if_empty=True, source=open(path, "rb"))
for row in df.iter_rows(named=True):
data = copy.deepcopy(schema)
data["state"] = row["STATE"]
data["agency"] = row["LAW ENFORCEMENT AGENCY"]
data["county"] = row["COUNTY"]
data["type"] = row["TYPE"]
data["support_type"] = row["SUPPORT TYPE"]
if "participating" in filename:
data["moa"] = row["MOA"]
data["signed"] = row["SIGNED"]
data["addendum"] = row["ADDENDUM"]
agencies["active"].append(data)
else:
agencies["pending"].append(data)
if not keep_sheet:
os.unlink(path)
logger.info(" Collected %s active and %s pending agencies", len(agencies["active"]), len(agencies["pending"]))
agencies["scrape_runtime"] = time.time() - start_time
return agencies
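
For orientation, a minimal usage sketch of the new scraper (not part of the diff); the call matches the signature above, and the keys mirror the agencies_287g schema added to schemas.py further down. Values printed here are illustrative only.

    from ice_scrapers import scrape_agencies

    # force_download=False reuses a cached XLSX next to the module, per the
    # caching comment inside scrape_agencies; keep_sheet=True leaves it on disk.
    agencies = scrape_agencies(keep_sheet=True, force_download=False)
    print(len(agencies["active"]), "active agreements")
    print(len(agencies["pending"]), "pending applications")
    print(agencies["scrape_runtime"], "seconds spent scraping")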
18 changes: 9 additions & 9 deletions ice_scrapers/facilities_scraper.py
@@ -2,15 +2,6 @@
import copy
import datetime
import re
from ice_scrapers import (
get_ice_scrape_pages,
repair_locality,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)
from schemas import facility_schema
import time
from utils import (
@@ -19,6 +10,15 @@
session,
timestamp_format,
)
from .utils import (
get_ice_scrape_pages,
repair_locality,
repair_street,
repair_zip,
repair_name,
special_facilities,
update_facility,
)

base_scrape_url = "https://www.ice.gov/detention-facilities"

2 changes: 1 addition & 1 deletion ice_scrapers/field_offices.py
@@ -5,7 +5,6 @@
from ice_scrapers import (
area_of_responsibility,
field_office_to_aor,
get_ice_scrape_pages,
)
import re
from schemas import (
@@ -17,6 +16,7 @@
logger,
session,
)
from .utils import get_ice_scrape_pages

base_scrape_url = "https://www.ice.gov/contact/field-offices"

20 changes: 12 additions & 8 deletions ice_scrapers/general.py
@@ -1,16 +1,20 @@
import copy
from ice_scrapers import (
collect_vera_facility_data,
insert_additional_facilities,
load_sheet,
from schemas import facilities_schema
from .agencies import scrape_agencies
from .custom_facilities import insert_additional_facilities
from .facilities_scraper import scrape_facilities
from .field_offices import (
merge_field_offices,
scrape_facilities,
scrape_field_offices,
)
from schemas import facilities_schema
from .spreadsheet_load import load_sheet
from .vera_data import collect_vera_facility_data


def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> dict:
def facilities_scrape_wrapper(
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
) -> tuple[dict, dict]:
agencies = scrape_agencies(keep_sheet, force_download)
facilities_data = copy.deepcopy(facilities_schema)
facilities = load_sheet(keep_sheet, force_download)
facilities_data["facilities"] = copy.deepcopy(facilities)
@@ -21,4 +25,4 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr
facilities_data = merge_field_offices(facilities_data, field_offices)
facilities_data = insert_additional_facilities(facilities_data)

return facilities_data
return facilities_data, agencies
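
Because facilities_scrape_wrapper now returns a tuple, callers must unpack both values. A short sketch of the updated call pattern, with the import path assumed; the main.py hunk below is the real call site.

    from ice_scrapers import facilities_scrape_wrapper  # import path assumed

    facilities_data, agencies = facilities_scrape_wrapper(
        keep_sheet=True,
        force_download=True,
        skip_vera=False,
    )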
11 changes: 4 additions & 7 deletions ice_scrapers/spreadsheet_load.py
@@ -4,6 +4,9 @@
from ice_scrapers import (
ice_facility_types,
ice_inspection_types,
)
from .utils import (
download_file,
repair_locality,
repair_name,
repair_street,
@@ -84,13 +87,7 @@ def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tup
logger.debug("Found sheet at: %s", actual_link)
if force_download or not os.path.exists(filename):
logger.info("Downloading detention stats sheet from %s", actual_link)
resp = session.get(actual_link, timeout=120, stream=True)
size = len(resp.content)
with open(filename, "wb") as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
logger.debug("Wrote %s byte sheet to %s", size, filename)
download_file(actual_link, filename)
df = polars.read_excel(
drop_empty_rows=True,
has_header=False,
13 changes: 13 additions & 0 deletions ice_scrapers/utils.py
@@ -7,6 +7,19 @@
)


def download_file(link: str, path: str) -> None:
"""
Standard pattern for downloading a binary file from a URL
"""
resp = session.get(link, timeout=120, stream=True)
size = len(resp.content)
with open(path, "wb") as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
logger.debug("Wrote %s byte sheet to %s", size, path)


def special_facilities(facility: dict) -> dict:
"""
Some very specific facilities have unique fixes
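For reference, a hedged sketch of how the relocated download_file helper is used by its two callers in this diff (agencies.py and spreadsheet_load.py); the URL and destination path below are placeholders, not real endpoints.

    from ice_scrapers.utils import download_file

    # Placeholder arguments: the real callers pass ICE doclib spreadsheet URLs
    # and a path alongside the scraper module.
    download_file(
        "https://www.ice.gov/doclib/example.xlsx",
        "/tmp/example.xlsx",
    )
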
2 changes: 1 addition & 1 deletion main.py
@@ -128,7 +128,7 @@ def main() -> None:
exit(1)

if args.scrape:
facilities_data = facilities_scrape_wrapper(
facilities_data, agencies = facilities_scrape_wrapper(
keep_sheet=not args.delete_sheets,
force_download=not args.skip_downloads,
skip_vera=args.skip_vera,
25 changes: 25 additions & 0 deletions schemas.py
@@ -105,6 +105,31 @@
},
}

agencies_287g: dict = {
"active": [{}],
"pending": [{}],
"scrape_runtime": 0,
"scraped_date": datetime.datetime.now(datetime.UTC),
}

active_agency: dict = {
"state": "",
"agency": "",
"county": "",
"type": "",
"signed": None,
"moa": "",
"addendum": "",
"support_type": "",
}

pending_agency: dict = {
"state": "",
"agency": "",
"county": "",
"type": "",
"support_type": "",
}

# enrichment response object
enrich_resp_schema: dict = {
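
Finally, a brief sketch of how the new agency schemas are meant to be combined at runtime, mirroring the row loop in agencies.py above; every field value here is invented for illustration.

    import copy

    from schemas import active_agency, agencies_287g

    agencies = copy.deepcopy(agencies_287g)
    record = copy.deepcopy(active_agency)
    record.update(
        {
            "state": "TX",  # all values below are illustrative
            "agency": "Example County Sheriff's Office",
            "county": "Example",
            "type": "Jail Enforcement Model",
            "support_type": "Detention",
        }
    )
    agencies["active"].append(record)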