Skip to content

Commit a977cee

Browse files
authored
Optimizing the creation of diff files for OSMCha. (#675)
* Add existing split_adiff file to the container * Update split_adiff to optimize memory * Update scripts to process adiff files * Remove split_adiff.py * Update script functions * Update docker to fetch new repo version * Update function for ohmx * Update docker for ohmx * Change permission for scripts * Update to build docker image * Update docker image for ohmx adiff builder
1 parent 40bb351 commit a977cee

File tree

6 files changed

+234
-6
lines changed

6 files changed

+234
-6
lines changed

.github/workflows/chartpress.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ on:
66
- 'staging'
77
- 'staging_deploy'
88
- 'vtiles_admin'
9+
- '1249_fix_split_adiff'
910
jobs:
1011
build:
1112
runs-on: ubuntu-22.04

hetzner/osmcha/osmcha.base.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ services:
9090
command: sh /app/update.sh
9191

9292
osmcha_ohmx_adiff:
93-
image: ghcr.io/openhistoricalmap/ohmx-adiff-builder:0.0.1-0.dev.git.2836.hbee0465
93+
image: ghcr.io/openhistoricalmap/ohmx-adiff-builder:0.0.1-0.dev.git.3256.h4c03a3a
9494
container_name: osmcha_ohmx_adiff_staging
9595
environment:
9696
- API_URL=https://api.openhistoricalmap.org
@@ -105,6 +105,11 @@ services:
105105
volumes:
106106
- ohmx_db:/data
107107
- ${HOME}/.aws:/root/.aws:ro
108+
- /production/services/images/ohmx-adiff-builder/config.sh:/app/config.sh
109+
- /production/services/images/ohmx-adiff-builder/functions.sh:/app/functions.sh
110+
- /production/services/images/ohmx-adiff-builder/start.sh:/app/start.sh
111+
- /production/services/images/ohmx-adiff-builder/process_min_range.sh:/app/process_min_range.sh
112+
- /production/services/images/ohmx-adiff-builder/update.sh:/app/update.sh
108113
networks:
109114
- ohm_network
110115
cpus: '8.0'

images/ohmx-adiff-builder/Dockerfile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ FROM ghcr.io/bdon/osmexpress:latest
22

33
ENV PATH="/root/.cargo/bin:${PATH}"
44

5-
RUN apk add --no-cache python3 py3-pip git bash xmlstarlet curl aws-cli
5+
RUN apk add --no-cache python3 py3-pip git bash xmlstarlet curl aws-cli py3-lxml
66

77
RUN apk add --no-cache --virtual .build-deps build-base patch python3-dev make cmake zlib-dev expat-dev bzip2-dev boost-dev rust cargo && \
88
\
9-
pip install --break-system-packages osmx py3-requests osmium && \
9+
pip install --break-system-packages osmx requests osmium && \
1010
\
1111
echo "--- Clonando y modificando osm-cli para OpenHistoricalMap ---" && \
1212
git clone https://github.com/jake-low/osm-cli.git /tmp/osm-cli && \
@@ -28,12 +28,12 @@ ENV PATH="/root/.local/bin:${PATH}"
2828

2929
WORKDIR /app
3030

31-
RUN echo "Update repo 08/08/2025"
31+
# RUN echo "Update repo 08/08/2025"
3232

3333
RUN git clone https://github.com/OpenHistoricalMap/osmx-adiff-builder.git /app
3434

35-
COPY start.sh .
36-
COPY update.sh .
35+
COPY *.sh .
36+
COPY *.py .
3737

3838
RUN chmod +x /app/*.sh && chmod +x /app/*.py
3939

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
# Configuration file for ohmx-adiff-builder.
#
# Intended to be sourced by the other scripts; it only exports variables.
# REPLICATION_URL, API_URL, OSMX_INITIAL_SEQNUM and FILTER_ADIFF_FILES can be
# overridden from the environment; everything else is derived from WORKDIR.

export WORKDIR=/data

## Database path
# OSMX_DB_DIR keeps its historical trailing slash (other scripts may rely on
# it), so strip it when deriving the file path to avoid "/data/db//osmx.db".
export OSMX_DB_DIR="$WORKDIR/db/"
export OSMX_DB_PATH="${OSMX_DB_DIR%/}/osmx.db"
export PLANET_FILE_PATH="$WORKDIR/planet.osm.pbf"

## URLs services
export REPLICATION_URL="${REPLICATION_URL:-https://s3.amazonaws.com/planet.openhistoricalmap.org/replication/minute}"
# NOTE(review): the default API host is OpenStreetMap while the default
# replication source above is OpenHistoricalMap — confirm this is intended.
export API_URL="${API_URL:-https://api.openstreetmap.org}"

## Required directories (no trailing slash, to avoid double slashes)
export REPLICATION_ADIFFS_DIR="$WORKDIR/stage-data/replication-adiffs"
export SPLIT_ADIFFS_DIR="$WORKDIR/stage-data/split-adiffs"
export CHANGESET_DIR="$WORKDIR/stage-data/changesets"
export BUCKET_DIR="$WORKDIR/stage-data/bucket-data"
export UPLOAD_TRACK_FILE="$WORKDIR/stage-data/uploaded_files.md5"
export BAD_CHANGESETS_DIR="$WORKDIR/stage-data/bad_changesets"

## Sequence number
export OSMX_INITIAL_SEQNUM="${OSMX_INITIAL_SEQNUM:-0}"

## Process diff files from the last 60 min (override via environment)
export FILTER_ADIFF_FILES="${FILTER_ADIFF_FILES:-60}"
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
#!/bin/bash
2+
# Common functions for ohmx-adiff-builder scripts
3+
4+
upload_diff_files() {
  #######################################
  # Upload recently modified *.adiff files found under BUCKET_DIR to S3,
  # skipping files whose MD5 matches the one recorded at their last upload.
  # Globals:
  #   BUCKET_DIR              (read) directory tree searched for *.adiff files
  #   UPLOAD_TRACK_FILE       (read/written) "<filename> <md5>" per line
  #   AWS_S3_BUCKET           (read) destination bucket name
  #   UPLOAD_MAX_AGE_MINUTES  (read, optional) only files modified within
  #                           this many minutes are considered; default 60
  # Arguments:
  #   $1 - "once" to run a single pass; any other value (or empty) loops
  #        forever with a 60 second pause between passes
  # Outputs:
  #   progress messages on stdout
  #######################################
  local mode="${1:-loop}"
  local max_age_min="${UPLOAD_MAX_AGE_MINUTES:-60}"
  # Fall back to the historical window if the override is not an integer.
  [[ "$max_age_min" =~ ^[0-9]+$ ]] || max_age_min=60

  mkdir -p "$(dirname "$UPLOAD_TRACK_FILE")"
  touch "$UPLOAD_TRACK_FILE"

  # Load the upload history. Blank or malformed lines are skipped: an empty
  # key is not a valid associative-array subscript.
  local -A uploaded_md5s=()
  local tracked_name tracked_hash
  while read -r tracked_name tracked_hash _ || [[ -n "$tracked_name" ]]; do
    [[ -n "$tracked_name" && -n "$tracked_hash" ]] || continue
    uploaded_md5s["$tracked_name"]="$tracked_hash"
  done < "$UPLOAD_TRACK_FILE"

  # Run one upload pass. Defined here so it reads and updates the caller's
  # local uploaded_md5s / max_age_min through bash dynamic scoping.
  process_upload() {
    local filepath filename current_md5 fname track_tmp

    echo "Uploading files at $(date)..."

    if [[ -d "$BUCKET_DIR" ]]; then
      # The find runs the whole BUCKET_DIR tree, so both the compressed
      # changeset files and bucket-data/replication/minute are covered.
      # Process substitution keeps the loop in the current shell so the
      # array updates survive.
      while IFS= read -r -d '' filepath; do
        [[ -f "$filepath" ]] || continue
        filename=${filepath##*/}
        # md5sum prints "<hash>  -" when reading stdin; keep only the hash.
        current_md5=$(md5sum < "$filepath") || continue
        current_md5=${current_md5%% *}

        if [[ -n "${uploaded_md5s[$filename]}" ]]; then
          if [[ "${uploaded_md5s[$filename]}" == "$current_md5" ]]; then
            echo "Skipping unchanged: $filename"
            continue
          fi
          echo "File changed: $filename — reuploading"
        else
          echo "New file: $filename — uploading"
        fi

        # Record the hash only after a successful upload so that failures
        # are retried on the next pass.
        if aws s3 cp "$filepath" "s3://$AWS_S3_BUCKET/ohm-augmented-diffs/changesets/$filename" \
          --content-type "application/xml" \
          --content-encoding "gzip"; then
          uploaded_md5s["$filename"]="$current_md5"
        else
          echo "Warning: upload failed for $filename, will retry on next pass"
        fi
      done < <(find "$BUCKET_DIR" -type f -name '*.adiff' -mmin "-${max_age_min}" -print0)
    fi

    # Rewrite the control file through a temp file in the same directory so
    # an interruption cannot leave it truncated.
    track_tmp=$(mktemp "${UPLOAD_TRACK_FILE}.XXXXXX") || return 1
    for fname in "${!uploaded_md5s[@]}"; do
      printf '%s %s\n' "$fname" "${uploaded_md5s[$fname]}"
    done > "$track_tmp"
    mv -f "$track_tmp" "$UPLOAD_TRACK_FILE"
  }

  if [[ "$mode" == "once" ]]; then
    process_upload
  else
    while true; do
      process_upload
      sleep 60
    done
  fi
}
64+
65+
# Function to download and generate adiff files for a seqno range
66+
# Downloads .osc files from replication server, generates .adiff files, and updates osmx database
67+
download_and_generate_adiffs() {
  #######################################
  # For every sequence number in [$1, $2]: download the minutely .osc.gz,
  # build an augmented diff with augmented_diff.py, store it as
  # REPLICATION_ADIFFS_DIR/<seqno>.adiff and apply the .osc to the osmx db.
  # A sequence number that fails to download or convert is skipped with a
  # warning; processing continues with the next one.
  # Globals:
  #   REPLICATION_URL         (read) base URL of the minutely replication
  #   OSMX_DB_PATH            (read) osmx database file
  #   REPLICATION_ADIFFS_DIR  (read) output directory for .adiff files
  # Arguments:
  #   $1 - first sequence number (non-negative integer)
  #   $2 - last sequence number (non-negative integer, inclusive)
  # Returns:
  #   1 if the arguments are not integers, 0 otherwise
  # Notes:
  #   Uses augmented_diff.py and a scratch <seqno>.osc file relative to the
  #   current working directory.
  #######################################
  local seqno_min=$1
  local seqno_max=$2
  local seqno seqno_padded url osc_file tmpfile timestamp
  local -a pipe_status

  if [[ ! "$seqno_min" =~ ^[0-9]+$ || ! "$seqno_max" =~ ^[0-9]+$ ]]; then
    echo "Error: sequence numbers must be non-negative integers (got '$seqno_min' '$seqno_max')" >&2
    return 1
  fi
  # Force base 10 so a leading zero is not parsed as octal.
  seqno_min=$((10#$seqno_min))
  seqno_max=$((10#$seqno_max))

  echo "Downloading and generating adiffs for range: $seqno_min - $seqno_max"

  # Put mise-managed tools on PATH when mise is installed.
  if command -v mise > /dev/null 2>&1; then
    eval "$(mise activate bash --shims)"
  fi

  for ((seqno = seqno_min; seqno <= seqno_max; seqno++)); do
    echo "Processing seqno: $seqno"

    # Replication path: seqno zero-padded to 9 digits, split as XXX/XXX/XXX.
    printf -v seqno_padded '%09d' "$seqno"
    url="${REPLICATION_URL}/${seqno_padded:0:3}/${seqno_padded:3:3}/${seqno_padded:6:3}.osc.gz"
    osc_file="${seqno}.osc"

    # Download and decompress. Check both pipeline stages: without pipefail
    # only gzip's status would be visible. -f makes curl fail on HTTP errors
    # instead of piping an error page into gzip.
    curl -fsSL "$url" | gzip -d > "$osc_file" 2> /dev/null
    pipe_status=("${PIPESTATUS[@]}")
    if ((pipe_status[0] != 0 || pipe_status[1] != 0)); then
      echo "Warning: Failed to download seqno $seqno from $url, skipping..."
      rm -f -- "$osc_file"
      continue
    fi

    if [[ ! -s "$osc_file" ]]; then
      echo "Warning: Empty or missing .osc file for seqno $seqno, skipping..."
      rm -f -- "$osc_file"
      continue
    fi

    # Build the adiff in a hidden temp file inside the destination directory
    # so the final rename stays on one filesystem and readers never see a
    # partially written <seqno>.adiff.
    mkdir -p "$REPLICATION_ADIFFS_DIR"
    if ! tmpfile=$(mktemp "$REPLICATION_ADIFFS_DIR/.${seqno}.adiff.XXXXXX"); then
      echo "Warning: Failed to generate adiff for seqno $seqno, skipping..."
      rm -f -- "$osc_file"
      continue
    fi

    python augmented_diff.py "$OSMX_DB_PATH" "$osc_file" | xmlstarlet format > "$tmpfile" 2> /dev/null
    pipe_status=("${PIPESTATUS[@]}")
    if ((pipe_status[0] != 0 || pipe_status[1] != 0)); then
      echo "Warning: Failed to generate adiff for seqno $seqno, skipping..."
      rm -f -- "$osc_file" "$tmpfile"
      continue
    fi

    mv -- "$tmpfile" "$REPLICATION_ADIFFS_DIR/${seqno}.adiff"

    # The current UTC time is recorded as the replication timestamp; the
    # replication state file is not consulted.
    timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # Apply the change file to the osmx database.
    if osmx update "$OSMX_DB_PATH" "$osc_file" "$seqno" "$timestamp" --commit 2> /dev/null; then
      echo "Successfully processed seqno $seqno"
    else
      echo "Warning: Failed to update osmx database for seqno $seqno"
    fi

    rm -f -- "$osc_file"
  done
}
124+
125+
# Function to process adiff files by sequence number (seqno) range
126+
# Splits adiffs, moves them to bucket-data, merges split adiffs, uploads, and cleans up
127+
process_adiff_range() {
  #######################################
  # For every sequence number in [$1, $2] that has a replication adiff:
  # split it per changeset into SPLIT_ADIFFS_DIR/<changeset>/<seqno>.adiff,
  # archive a gzip-compressed copy under BUCKET_DIR/replication/minute and
  # delete the source. Afterwards run merge.mk (if present), upload once via
  # upload_diff_files, and run gc.sh (if present).
  # Globals:
  #   REPLICATION_ADIFFS_DIR  (read) input <seqno>.adiff files
  #   SPLIT_ADIFFS_DIR        (read) per-changeset output root
  #   CHANGESET_DIR           (read) created here and passed to gc.sh
  #   BUCKET_DIR              (read) archive/upload staging root
  # Arguments:
  #   $1 - first sequence number (non-negative integer)
  #   $2 - last sequence number (non-negative integer, inclusive)
  # Returns:
  #   1 if the arguments are not integers, 0 otherwise
  # Notes:
  #   split_adiff.py, merge.mk and gc.sh are resolved relative to the
  #   current working directory.
  #######################################
  local seqno_start=$1
  local seqno_end=$2
  local seqno adiff_file tmpdir file changeset tmpfile bucket_minute_dir

  if [[ ! "$seqno_start" =~ ^[0-9]+$ || ! "$seqno_end" =~ ^[0-9]+$ ]]; then
    echo "Error: sequence numbers must be non-negative integers (got '$seqno_start' '$seqno_end')" >&2
    return 1
  fi
  # Force base 10 so a leading zero is not parsed as octal.
  seqno_start=$((10#$seqno_start))
  seqno_end=$((10#$seqno_end))

  bucket_minute_dir="${BUCKET_DIR}/replication/minute"
  mkdir -p "$REPLICATION_ADIFFS_DIR" "$SPLIT_ADIFFS_DIR" "$CHANGESET_DIR" "$bucket_minute_dir"

  echo "Processing range: $seqno_start - $seqno_end"

  for ((seqno = seqno_start; seqno <= seqno_end; seqno++)); do
    adiff_file="$REPLICATION_ADIFFS_DIR/${seqno}.adiff"
    [[ -f "$adiff_file" ]] || continue
    echo "Processing adiff file: $adiff_file"

    if ! tmpdir=$(mktemp -d); then
      echo "Warning: could not create temp dir for $adiff_file, skipping..."
      continue
    fi

    # Split the adiff into one file per changeset. If the split fails the
    # source must stay in place: archiving and deleting it would lose its
    # changesets for good.
    if ! python split_adiff.py "$adiff_file" "$tmpdir"; then
      echo "Warning: failed to split $adiff_file, leaving it in place for retry"
      rm -rf -- "$tmpdir"
      continue
    fi

    for file in "$tmpdir"/*.adiff; do
      [[ -f "$file" ]] || continue # glob matched nothing
      changeset=${file##*/}
      changeset=${changeset%.adiff}
      mkdir -p "${SPLIT_ADIFFS_DIR}/${changeset}"
      mv -- "$file" "${SPLIT_ADIFFS_DIR}/${changeset}/${seqno}.adiff"
    done

    rm -rf -- "$tmpdir"

    # Archive the processed adiff so it is not processed again and can be
    # uploaded and deleted locally. Compress into a hidden temp file in the
    # destination directory so the final rename is atomic (same filesystem)
    # and the uploader, which matches *.adiff, never sees a partial file.
    if tmpfile=$(mktemp "$bucket_minute_dir/.${seqno}.adiff.XXXXXX") &&
      gzip -c < "$adiff_file" > "$tmpfile" &&
      mv -- "$tmpfile" "$bucket_minute_dir/${seqno}.adiff"; then
      rm -- "$adiff_file"
    else
      echo "Warning: failed to archive $adiff_file, leaving it in place"
      [[ -n "$tmpfile" ]] && rm -f -- "$tmpfile"
    fi
  done

  # Merge all split files, potentially updating existing changesets. A
  # makefile is used so changesets whose split-adiffs inputs have not
  # changed are not reprocessed. Failure is reported but not fatal.
  if [[ -f "merge.mk" ]]; then
    make -f merge.mk || echo "Warning: merge.mk failed, continuing"
  fi

  upload_diff_files "once"

  # Clean up old stage-data that is no longer needed (best effort).
  if [[ -f "gc.sh" ]]; then
    ./gc.sh "$SPLIT_ADIFFS_DIR" "$CHANGESET_DIR" || echo "Warning: gc.sh failed, continuing"
  fi

  return 0
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# Process a range of minutely replication sequence numbers.
#
# Usage: ./process_min_range.sh SEQNO_START SEQNO_END [generate_adiffs|process_adiffs|both]
# Example: ./process_min_range.sh 1884610 1884615 generate_adiffs
#          ./process_min_range.sh 1884610 1884615 process_adiffs
#
# Modes:
#   generate_adiffs  download .osc files, generate adiffs, then process them
#   process_adiffs   only process adiffs that already exist on disk
#   both             same as generate_adiffs (default when no mode is given)

# Resolve helpers relative to this script rather than the caller's directory.
# The sourced functions use relative paths (augmented_diff.py, split_adiff.py,
# merge.mk, gc.sh), so also run from the script directory.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) || exit 1
cd -- "$script_dir" || exit 1

source "$script_dir/config.sh" || exit 1
source "$script_dir/functions.sh" || exit 1

usage() {
  echo "Usage: $0 SEQNO_START SEQNO_END [generate_adiffs|process_adiffs|both]" >&2
}

min_seqno="${1:-}"
max_seqno="${2:-}"
mode="${3:-both}"

if [[ ! "$min_seqno" =~ ^[0-9]+$ || ! "$max_seqno" =~ ^[0-9]+$ ]]; then
  usage
  exit 2
fi

case "$mode" in
  generate_adiffs | both)
    download_and_generate_adiffs "$min_seqno" "$max_seqno"
    process_adiff_range "$min_seqno" "$max_seqno"
    ;;
  process_adiffs)
    process_adiff_range "$min_seqno" "$max_seqno"
    ;;
  *)
    echo "Error: unknown mode '$mode'" >&2
    usage
    exit 2
    ;;
esac
19+

0 commit comments

Comments
 (0)