Skip to content

Commit 32ec7c0

Browse files
iferencikIoan Ferencik
andauthored
buildings download (#86)
* pmtiles attempt * async PMTiles buildings * preliminary buildings from pmtiles * finish pmtiles download * finish building download * move iterlen to util * docstrings * fgb buildiongs source * clean imports * fast FGB * finish buildings download --------- Co-authored-by: Ioan Ferencik <[email protected]>
1 parent d88f699 commit 32ec7c0

File tree

12 files changed

+900
-36
lines changed

12 files changed

+900
-36
lines changed

cbsurge/admin/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import logging
1313
from cbsurge.admin.osm import fetch_admin as fetch_osm_admin, ADMIN_LEVELS
1414
from cbsurge.admin.ocha import fetch_admin as fetch_ocha_admin
15+
from cbsurge.util import BboxParamType
1516
import click
1617
import json
1718

@@ -37,6 +38,7 @@ def convert(self, value, param, ctx):
3738
)
3839

3940
return bbox
41+
4042
@click.group()
4143
def admin():
4244
f"""Command line interface for {__package__} package"""

cbsurge/admin/ocha.py

Lines changed: 22 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
import os
55
import httpx
66
import logging
7-
from urllib.parse import urlparse,urlencode, ParseResult
7+
from urllib.parse import urlparse,urlencode
88
import pycountry
99
from shapely.predicates import intersects
1010
from shapely.geometry import box, shape
1111
import h3.api.basic_int as h3
1212
import shapely
1313
from cbsurge.admin.util import is_int
1414
from tqdm import tqdm
15-
15+
from cbsurge.util import http_get_json
1616

1717
logger = logging.getLogger(__name__)
1818

@@ -22,18 +22,28 @@
2222
ARCGIS_COD_SERVICE = 'COD_External'
2323

2424

25-
def http_get_json(url=None, timeout=None):
25+
def countries_for_bbox(bounding_box=None):
2626
"""
27-
Generic HTTP get function using httpx
28-
:param url: str, the url to be fetched
29-
:param timeout: httpx.Timeout instance
30-
:return: python dict representing the result as parsed json
27+
Retrieves the ISO 3166-1 alpha-3 country code for all countries whose extent intersects the
28+
bounding box f rom ESRI ARcGIS oneline World countries layer
29+
:param bounding_box: tuple of numbers, xmin/west, ymin/south, xmax/east, ymax/north
30+
:return: tuple with string representing iso3 country codes
31+
32+
3133
"""
32-
with httpx.Client(timeout=timeout) as client:
33-
response = client.get(url)
34-
response.raise_for_status()
35-
if response.status_code == 200:
36-
return response.json()
34+
str_bbox = map(str, bounding_box)
35+
url = f'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/World_Countries_(Generalized)/FeatureServer/0/query?where=1%3D1&outFields=*&geometry={",".join(str_bbox)}&geometryType=esriGeometryEnvelope&inSR=4326&spatialRel=esriSpatialRelIntersects&returnGeometry=false&outSR=4326&f=json'
36+
try:
37+
timeout = httpx.Timeout(connect=10, read=1800, write=1800, pool=1000)
38+
data = http_get_json(url=url, timeout=timeout)
39+
countries = list()
40+
for country in data['features']:
41+
iso2 = country['attributes']['ISO']
42+
countries.append(pycountry.countries.get(alpha_2=iso2).alpha_3)
43+
return tuple(countries)
44+
except Exception as e:
45+
logger.error(f'Failed to fetch countries that intersect bbox {bounding_box}. {e}')
46+
raise
3747

3848
def fetch_ocha_countries(bounding_box = None, ):
3949
"""
@@ -77,29 +87,6 @@ def fetch_ocha_countries(bounding_box = None, ):
7787
raise
7888

7989

80-
def countries_for_bbox(bounding_box=None):
81-
"""
82-
Retrieves the ISO 3166-1 alpha-3 country code for all countries whose extent intersects the
83-
bounding box f rom ESRI ARcGIS oneline World countries layer
84-
:param bounding_box: tuple of numbers, xmin/west, ymin/south, xmax/east, ymax/north
85-
:return: tuple with string representing iso3 country codes
86-
87-
88-
"""
89-
str_bbox = map(str, bounding_box)
90-
url = f'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/World_Countries_(Generalized)/FeatureServer/0/query?where=1%3D1&outFields=*&geometry={",".join(str_bbox)}&geometryType=esriGeometryEnvelope&inSR=4326&spatialRel=esriSpatialRelIntersects&returnGeometry=false&outSR=4326&f=json'
91-
try:
92-
timeout = httpx.Timeout(connect=10, read=1800, write=1800, pool=1000)
93-
data = http_get_json(url=url, timeout=timeout)
94-
countries = list()
95-
for country in data['features']:
96-
iso2 = country['attributes']['ISO']
97-
countries.append(pycountry.countries.get(alpha_2=iso2).alpha_3)
98-
return tuple(countries)
99-
except Exception as e:
100-
logger.error(f'Failed to fetch countries that intersect bbox {bounding_box}. {e}')
101-
raise
102-
10390
def fetch_ocha_admin_levels(iso3_country=None):
10491
"""
10592
Retrieves the available admin boundaries admin levels in OCHA COD database for a given country

cbsurge/admin/util.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
def is_int(val):
23
"""
34
Check value/variable is integer
@@ -46,3 +47,4 @@ def bbox_to_geojson_polygon(west, south, east, north):
4647

4748
return geojson
4849

50+

cbsurge/cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,17 @@
11
import click as click
22

33
from cbsurge.admin import admin
4+
5+
from cbsurge.exposure.builtenv import builtenv
6+
import click
7+
8+
@click.group
9+
def cli(ctx):
10+
"""Main CLI for the application."""
11+
pass
12+
cli.add_command(admin)
13+
cli.add_command(builtenv)
14+
415
from cbsurge.exposure.population import population
516
from cbsurge.stats import stats
617

@@ -15,5 +26,6 @@ def cli():
1526
cli.add_command(stats)
1627

1728

29+
1830
if __name__ == '__main__':
1931
cli()
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import click
2+
3+
@click.group()
4+
def builtenv():
5+
f"""Command line interface for {__package__} package"""
6+
pass
7+
8+
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import click
2+
3+
4+
5+
@click.group()
6+
def buildings():
7+
f"""Command line interface for {__package__} package"""
8+
pass
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import json
2+
import os.path
3+
4+
from cbsurge.exposure.builtenv.buildings.fgbgdal import get_countries_for_bbox_osm, GMOSM_BUILDINGS_ROOT
5+
from pyogrio.raw import open_arrow, write_arrow
6+
import logging
7+
import time
8+
from tqdm import tqdm
9+
from pyogrio.core import read_info
10+
from osgeo import ogr, osr, gdal
11+
12+
13+
14+
15+
logger = logging.getLogger(__name__)
16+
17+
ARROWTYPE2OGRTYPE = {'string':ogr.OFTString, 'double':ogr.OFTReal, 'int64':ogr.OFTInteger64, 'int':ogr.OFTInteger}
18+
19+
20+
def add_fields_to_layer(layer=None, template_layer_info=None):
21+
22+
types = dict([(e[3:], getattr(ogr, e)) for e in dir(ogr) if e.startswith('OFT')])
23+
for field_dict in template_layer_info['layers'][0]['fields']:
24+
layer.CreateField(ogr.FieldDefn(field_dict['name'], types[field_dict['type']]))
25+
26+
27+
def download_pyogrio(bbox=None, out_path=None, batch_size:[int,None]=1000):
28+
"""
29+
Download/stream buildings from VIDA buildings using pyogrio/pyarrow API
30+
31+
:param bbox: iterable of floats, xmin, ymin, xmax,ymax
32+
:param out_path: str, full path where the buildings layer will be written
33+
:param batch_size: int, default=1000, the max number of buildings to download in one batch
34+
If supplied, the buildings are downloaded in batches otherwise they are streamd through pyarrow library
35+
:return:
36+
"""
37+
38+
39+
countries = get_countries_for_bbox_osm(bbox=bbox)
40+
assert len(countries)> 0, f'The bounding box {bbox} does not intersect any country. Please make sure it makes sense!'
41+
42+
with ogr.GetDriverByName('FlatGeobuf').CreateDataSource(out_path) as dst_ds:
43+
for country in countries :
44+
remote_country_fgb_url = f'/vsicurl/{GMOSM_BUILDINGS_ROOT}/country_iso={country}/{country}.fgb'
45+
if batch_size is not None:
46+
with open_arrow(remote_country_fgb_url, bbox=bbox, use_pyarrow=True, batch_size=batch_size) as source:
47+
meta, reader = source
48+
fields = meta.pop('fields')
49+
schema = reader.schema
50+
51+
if dst_ds.GetLayerCount() == 0:
52+
src_epsg = int(meta['crs'].split(':')[-1])
53+
src_srs = osr.SpatialReference()
54+
src_srs.ImportFromEPSG(src_epsg)
55+
dst_lyr = dst_ds.CreateLayer('buildings', geom_type=ogr.wkbPolygon, srs=src_srs)
56+
for name in schema.names:
57+
if 'wkb' in name or 'geometry' in name:continue
58+
field = schema.field(name)
59+
field_type = ARROWTYPE2OGRTYPE[field.type]
60+
dst_lyr.CreateField(ogr.FieldDefn(name, field_type))
61+
logger.info(f'Downloading buildings in batches from {remote_country_fgb_url}')
62+
for batch in reader:
63+
logger.debug(f'Writing {batch.num_rows} records')
64+
dst_lyr.WritePyArrow(batch)
65+
else:
66+
with open_arrow(remote_country_fgb_url, bbox=bbox, use_pyarrow=False) as source:
67+
meta, reader = source
68+
src_epsg = int(meta['crs'].split(':')[-1])
69+
src_srs = osr.SpatialReference()
70+
src_srs.ImportFromEPSG(src_epsg)
71+
logger.info(f'Streaming buildings from {remote_country_fgb_url}')
72+
write_arrow(reader, out_path,layer='buildings',driver='FlatGeobuf',append=True,
73+
geometry_name='wkb_geometry', geometry_type='Polygon', crs=src_srs.ExportToWkt())
74+
info = read_info(out_path, layer='buildings')
75+
logger.info(f'{info["features"]} buildings were downloaded from {",".join(countries)} country datasets')
76+
77+
78+
79+
def download_gdal(bbox=None, out_path=None, batch_size:[int, None]=1000):
80+
"""
81+
Download/stream buildings from VIDA buildings using gdal/pyarrow API
82+
:param bbox: iterable of floats, xmin, ymin, xmax,ymax
83+
:param out_path: str, full path where the buildings layer will be written
84+
:param batch_size: int, default=1000, the max number of buildings to be downloaded in one batch
85+
If supplied, the buildings are downloaded in batches otherwise they are streamd through pyarrow library.
86+
Batch downloading should be preferred in case of large bounding boxes/area
87+
88+
:return:
89+
"""
90+
91+
92+
countries = get_countries_for_bbox_osm(bbox=bbox)
93+
assert len(countries)> 0, f'The bounding box {bbox} does not intersect any country. Please make sure it makes sense!'
94+
buildings = 0
95+
with ogr.GetDriverByName('FlatGeobuf').CreateDataSource(out_path) as dst_ds:
96+
97+
for country in get_countries_for_bbox_osm(bbox=bbox) :
98+
remote_country_fgb_url = f'/vsicurl/{GMOSM_BUILDINGS_ROOT}/country_iso={country}/{country}.fgb'
99+
with ogr.Open(remote_country_fgb_url, gdal.OF_READONLY) as src_ds:
100+
src_lyr = src_ds.GetLayer(0)
101+
src_lyr.SetSpatialFilterRect(*bbox)
102+
if batch_size is not None:
103+
stream = src_lyr.GetArrowStream([f"MAX_FEATURES_IN_BATCH={batch_size}"])
104+
else:
105+
stream = src_lyr.GetArrowStream()
106+
schema = stream.GetSchema()
107+
if dst_ds.GetLayerCount() == 0:
108+
src_srs = src_lyr.GetSpatialRef()
109+
dst_lyr = dst_ds.CreateLayer('buildings', geom_type=ogr.wkbPolygon, srs=src_srs)
110+
for i in range(schema.GetChildrenCount()):
111+
if 'wkb' in schema.GetChild(i).GetName() or 'geometry' in schema.GetChild(i).GetName():continue
112+
dst_lyr.CreateFieldFromArrowSchema(schema.GetChild(i))
113+
if batch_size is not None:
114+
logger.info(f'Downloading buildings in batches from {remote_country_fgb_url}')
115+
else:
116+
logger.info(f'Streaming buildings from {remote_country_fgb_url}')
117+
while True:
118+
array = stream.GetNextRecordBatch()
119+
if array is None:
120+
break
121+
assert dst_lyr.WriteArrowBatch(schema, array) == ogr.OGRERR_NONE
122+
buildings+= array.GetLength()
123+
logger.info(f'{buildings} buildings were downloaded from {",".join(countries)} country datasets')
124+
125+
126+
127+
128+
129+
130+
if __name__ == '__main__':
131+
import asyncio
132+
httpx_logger = logging.getLogger('httpx')
133+
httpx_logger.setLevel(100)
134+
logging.basicConfig()
135+
logger.setLevel(logging.INFO)
136+
137+
nf = 5829
138+
bbox = 33.681335, -0.131836, 35.966492, 1.158979 # KEN/UGA
139+
#bbox = 19.5128619671,40.9857135911,19.5464217663,41.0120783699 # ALB, Divjake
140+
# bbox = 31.442871,18.062312,42.714844,24.196869 # EGY/SDN
141+
# bbox = 15.034157,49.282809,16.02842,49.66207 # CZE
142+
bbox = 19.350384,41.206737,20.059003,41.571459 # ALB, TIRANA
143+
bbox = 19.726666,39.312705,20.627545,39.869353, # ALB/GRC
144+
145+
#a = asyncio.run(get_admin_level_bbox(iso3='ZWE'))
146+
#print(a)
147+
url = 'https://undpgeohub.blob.core.windows.net/userdata/9426cffc00b069908b2868935d1f3e90/datasets/bldgsc_20241029084831.fgb/bldgsc.pmtiles?sv=2025-01-05&ss=b&srt=o&se=2025-11-29T15%3A58%3A37Z&sp=r&sig=bQ8pXRRkNqdsJbxcIZ1S596u4ZvFwmQF3TJURt3jSP0%3D'
148+
#validate_source()
149+
out_path = '/tmp/bldgs1.fgb'
150+
# cntry = get_countries_for_bbox_osm(bbox=bbox)
151+
# print(cntry)
152+
start = time.time()
153+
#asyncio.run(download(bbox=bbox))
154+
155+
download_gdal(bbox=bbox, out_path=out_path, batch_size=3000)
156+
157+
end = time.time()
158+
print((end-start))

0 commit comments

Comments
 (0)