From 17bcbe8f6951a41cbd47d0a72682bbb510d0d2b6 Mon Sep 17 00:00:00 2001
From: knutdrand
Date: Thu, 5 Feb 2026 16:55:30 +0100
Subject: [PATCH] docs: document evaluation pipeline and expanding window
 cross-validation

Add docstrings to dataset_splitting.py and prediction_evaluator.py, and
create a contributor documentation page explaining the internal evaluation
pipeline with a focus on the expanding window algorithm.

CLIM-473
---
 chap_core/assessment/dataset_splitting.py    | 135 +++++++++++++++--
 chap_core/assessment/prediction_evaluator.py |  35 +++++
 docs/contributor/evaluation_pipeline.md      | 150 +++++++++++++++++++
 mkdocs.yml                                   |   1 +
 4 files changed, 310 insertions(+), 11 deletions(-)
 create mode 100644 docs/contributor/evaluation_pipeline.md

diff --git a/chap_core/assessment/dataset_splitting.py b/chap_core/assessment/dataset_splitting.py
index a9835a186..b35be595d 100644
--- a/chap_core/assessment/dataset_splitting.py
+++ b/chap_core/assessment/dataset_splitting.py
@@ -1,3 +1,12 @@
+"""Train/test splitting utilities for time series evaluation.
+
+Provides functions for splitting spatio-temporal datasets into training and
+test sets for model evaluation. The main entry point is ``train_test_generator``,
+which implements an expanding window cross-validation strategy where the
+training set grows with each successive split while the prediction window
+slides forward.
+"""
+
 from typing import Iterable, Iterator, Optional, Protocol, Type

 from chap_core.climate_predictor import FutureWeatherFetcher
@@ -18,6 +27,29 @@ def split_test_train_on_period(
     include_future_weather: bool = False,
     future_weather_class: Type[ClimateData] = ClimateData,
 ):
+    """Generate train/test splits at each split point.
+
+    For each split point, produces a (train, test) tuple where training data
+    ends just before the split point and test data starts at the split point.
+
+    Parameters
+    ----------
+    data_set
+        The full dataset to split.
+    split_points
+        Time periods at which to split (each becomes the start of a test set).
+    future_length
+        Optional time delta to limit test set length.
+    include_future_weather
+        If True, return (train, test, future_weather) tuples instead.
+    future_weather_class
+        Dataclass type for future weather data.
+
+    Yields
+    ------
+    tuple
+        (train, test) or (train, test, future_weather) for each split point.
+    """
     func = train_test_split_with_weather if include_future_weather else train_test_split

     if include_future_weather:
@@ -34,6 +66,27 @@ def train_test_split(
     extension: Optional[IsTimeDelta] = None,
     restrict_test=True,
 ):
+    """Split a dataset into train and test sets at a single split point.
+
+    Parameters
+    ----------
+    data_set
+        The full dataset.
+    prediction_start_period
+        First period of the test set. Training data ends at the period
+        immediately before this.
+    extension
+        Optional time delta to extend the test set end beyond
+        ``prediction_start_period``.
+    restrict_test
+        If True, restrict the test set to the prediction window.
+        If False, return the full dataset as the test set.
+
+    Returns
+    -------
+    tuple[DataSet, DataSet]
+        (train_data, test_data).
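+
+    Examples
+    --------
+    Minimal usage sketch, where ``dataset`` is a ``DataSet`` and ``period``
+    is a time period within its range (placeholder names)::
+
+        train, test = train_test_split(dataset, period)
+        # train ends just before ``period``; test starts at ``period``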
+    """
     last_train_period = previous(prediction_start_period)
     train_data = data_set.restrict_time_period(slice(None, last_train_period))
     if extension is not None:
@@ -54,28 +107,50 @@ def train_test_generator(
     stride: int = 1,
     future_weather_provider: Optional[FutureWeatherFetcher] = None,
 ) -> tuple[DataSet, Iterator[tuple[DataSet, DataSet, DataSet]]]:
-    """
-    Genereate a train set along with an iterator of test data that contains tuples of full data up until a
-    split point and data without target variables for the remaining steps
+    """Generate expanding-window train/test splits for backtesting.
+
+    Implements an expanding window cross-validation strategy. A fixed training
+    set is returned (used to train the model once), along with an iterator of
+    ``n_test_sets`` splits. Each split consists of:
+
+    - **historic_data**: all data up to the split point (expands each split)
+    - **masked_future_data**: future covariates *without* disease_cases
+    - **future_data**: full future data including disease_cases (ground truth)
+
+    The split indices are computed from the end of the dataset working
+    backwards::
+
+        split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
+
+    Example with a dataset of 20 periods, prediction_length=3, n_test_sets=2,
+    stride=1::
+
+        split_idx = -(3 + (2 - 1) * 1 + 1) = -5 -> index 15
+
+        Split 0: historic = periods[..15], future = periods[16..18]
+        Split 1: historic = periods[..16], future = periods[17..19]
+
+        Train set = periods[..15] (same as split 0 historic)

     Parameters
     ----------
     dataset
-        The full dataset
+        The full dataset.
     prediction_length
-        How many periods to predict
+        Number of periods to predict in each test window.
     n_test_sets
-        How many test sets to generate
+        Number of test splits to generate.
     stride
-        How many periods to stride between test sets
+        Number of periods to advance between successive splits.
     future_weather_provider
-        A function that can provide future weather data for the test sets
+        Optional callable that provides future weather data (with
+        disease_cases masked) for each test split.

     Returns
     -------
-    tuple[DataSet, Iterable[tuple[DataSet, DataSet]]]
-        The train set and an iterator of test sets
-
+    tuple[DataSet, Iterable[tuple[DataSet, DataSet, DataSet]]]
+        The training set and an iterator of
+        (historic_data, masked_future_data, future_data) tuples.
     """
     split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
     train_set = dataset.restrict_time_period(slice(None, dataset.period_range[split_idx]))
@@ -121,6 +196,25 @@ def train_test_split_with_weather(


 def get_split_points_for_data_set(data_set: DataSet, max_splits: int, start_offset=1) -> list[TimePeriod]:
+    """Compute evenly-spaced split points for a dataset.
+
+    Uses the time periods from the first location (assumes all locations share
+    the same time range).
+
+    Parameters
+    ----------
+    data_set
+        The dataset to compute split points for.
+    max_splits
+        Maximum number of split points to return.
+    start_offset
+        Number of initial periods to skip before the first possible split.
+
+    Returns
+    -------
+    list[TimePeriod]
+        Up to ``max_splits`` evenly-spaced time periods.
+    """
     periods = (
         next(iter(data_set.data())).data().time_period
     )  # Uses the time for the first location, assumes it to be the same for all!
@@ -128,5 +222,24 @@ def get_split_points_for_period_range(max_splits: int, periods, start_offset: int) -> list[TimePeriod]:
+    """Compute evenly-spaced split points from a period range.
+
+    Divides the available periods (after ``start_offset``) into
+    ``max_splits + 1`` equal segments and returns the boundary points.
+
+    Parameters
+    ----------
+    max_splits
+        Maximum number of split points to return.
+    periods
+        Sequence of time periods to select from.
+    start_offset
+        Number of initial periods to skip.
+
+    Returns
+    -------
+    list[TimePeriod]
+        Up to ``max_splits`` evenly-spaced time periods.
+    """
     delta = (len(periods) - 1 - start_offset) // (max_splits + 1)
     return list(periods)[start_offset + delta :: delta][:max_splits]
diff --git a/chap_core/assessment/prediction_evaluator.py b/chap_core/assessment/prediction_evaluator.py
index a9c6d1a7a..2f7e60eb7 100644
--- a/chap_core/assessment/prediction_evaluator.py
+++ b/chap_core/assessment/prediction_evaluator.py
@@ -1,3 +1,11 @@
+"""Model evaluation through backtesting.
+
+Provides functions for training a model and evaluating its predictions against
+held-out test data using expanding window cross-validation. The main entry
+points are ``backtest`` (yields per-split prediction results) and
+``evaluate_model`` (runs a full evaluation with GluonTS metrics).
+"""
+
 import logging
 from collections import defaultdict
 from typing import Dict, Iterable, Protocol, TypeVar
@@ -40,6 +48,33 @@ def train(self, data: DataSet) -> Predictor: ...
 def backtest(
     estimator: Estimator, data: DataSet, prediction_length, n_test_sets, stride=1, weather_provider=None
 ) -> Iterable[DataSet]:
+    """Train a model once and generate predictions for each test split.
+
+    Uses ``train_test_generator`` to create an expanding window split of the
+    data. The estimator is trained on the initial training set, then the
+    trained predictor generates forecasts for each successive test window.
+
+    Parameters
+    ----------
+    estimator
+        Model estimator with a ``train`` method.
+    data
+        Full dataset to split and evaluate on.
+    prediction_length
+        Number of periods to predict per test window.
+    n_test_sets
+        Number of expanding window test splits.
+    stride
+        Periods to advance between successive splits.
+    weather_provider
+        Optional future weather data provider.
+
+    Yields
+    ------
+    DataSet[SamplesWithTruth]
+        For each test split, a dataset mapping locations to
+        ``SamplesWithTruth`` (predicted samples merged with observed values).
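+
+    Examples
+    --------
+    Minimal usage sketch, where ``estimator`` is any object implementing
+    ``train`` and ``dataset`` is a ``DataSet`` covering enough periods
+    (placeholder names)::
+
+        for split_result in backtest(estimator, dataset, prediction_length=3, n_test_sets=4):
+            ...  # one DataSet[SamplesWithTruth] per expanding-window split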
+    """
     train, test_generator = train_test_generator(
         data, prediction_length, n_test_sets, future_weather_provider=weather_provider
     )
diff --git a/docs/contributor/evaluation_pipeline.md b/docs/contributor/evaluation_pipeline.md
new file mode 100644
index 000000000..431a83b60
--- /dev/null
+++ b/docs/contributor/evaluation_pipeline.md
@@ -0,0 +1,150 @@
+# Evaluation Pipeline
+
+This document explains how model evaluation (backtesting) works internally in CHAP, with a focus on the expanding window cross-validation strategy used to split time series data.
+
+## Overview
+
+The evaluation pipeline answers the question: *"How well does a model predict disease cases on data it has not seen?"*
+
+It does this by:
+
+1. Splitting a historical dataset into training and test portions
+2. Training the model on the training data
+3. Generating predictions for each test window
+4. Comparing predictions against observed values (ground truth)
+
+Because disease surveillance data is a time series, we cannot use random train/test splits. Instead, CHAP uses **expanding window cross-validation**, where the training data always precedes the test data chronologically.
+
+## Pipeline Architecture
+
+The evaluation flow from entry point to results:
+
+```
+Evaluation.create()                          # evaluation.py
+  |
+  +--> backtest()                            # prediction_evaluator.py
+  |      |
+  |      +--> train_test_generator()         # dataset_splitting.py
+  |      |      Returns (train_set, splits_iterator)
+  |      |
+  |      +--> estimator.train(train_set)
+  |      |      Returns predictor
+  |      |
+  |      +--> for each split:
+  |             predictor.predict(historic, future)
+  |             Merge predictions with ground truth
+  |             Yield DataSet[SamplesWithTruth]
+  |
+  +--> Evaluation.from_samples_with_truth()
+         Wraps results in an Evaluation object
+```
+
+## Expanding Window Cross-Validation
+
+### The Problem
+
+Standard k-fold cross-validation randomly assigns data points to folds. This is invalid for time series because:
+
+- Models would train on future data and predict the past
+- Temporal autocorrelation would leak information between folds
+
+### The Strategy
+
+CHAP uses an **expanding window** approach where:
+
+- The model is trained once on an initial training set
+- Multiple test windows are created by sliding forward through the data
+- For each test window, the model receives all historical data up to that point
+
+The key parameters are:
+
+- **prediction_length**: how many periods each test window covers
+- **n_test_sets**: how many test windows to create
+- **stride**: how many periods to advance between windows
+
+### How Split Indices Are Calculated
+
+The `train_test_generator` function computes splits from the end of the dataset working backwards:
+
+```
+split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
+```
+
+This ensures the last test window ends at the final period of the dataset.
+
+### Concrete Example
+
+Consider a dataset with 20 monthly periods (indices 0-19), `prediction_length=3`, `n_test_sets=3`, `stride=1`:
+
+```
+split_idx = -(3 + (3 - 1) * 1 + 1) = -6 -> index 14
+
+Split 0: historic = [0..14], future = [15, 16, 17]
+Split 1: historic = [0..15], future = [16, 17, 18]
+Split 2: historic = [0..16], future = [17, 18, 19]
+
+Train set = [0..14] (same as split 0 historic data)
```
+
+Visually, with `T` = train, `H` = extra historic context, `F` = future/test:
+
+```
+Period:   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
+
+Train:    T  T  T  T  T  T  T  T  T  T  T  T  T  T  T
+Split 0:  T  T  T  T  T  T  T  T  T  T  T  T  T  T  T  F  F  F
+Split 1:  T  T  T  T  T  T  T  T  T  T  T  T  T  T  T  H  F  F  F
+Split 2:  T  T  T  T  T  T  T  T  T  T  T  T  T  T  T  H  H  F  F  F
+```
+
+Note how the historic data expands with each split while the future window slides forward.
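+
+The same arithmetic can be checked with a few lines of plain Python. The sketch below is self-contained and only mirrors the index logic of `train_test_generator`; the function name is illustrative, not part of CHAP:
+
+```python
+def expanding_window_indices(n_periods, prediction_length, n_test_sets, stride=1):
+    """Yield (last_historic_index, future_indices) for each split."""
+    split_idx = n_periods - (prediction_length + (n_test_sets - 1) * stride + 1)
+    for i in range(n_test_sets):
+        historic_end = split_idx + i * stride
+        future = list(range(historic_end + 1, historic_end + 1 + prediction_length))
+        yield historic_end, future
+
+# Reproduces the worked example above (20 periods, prediction_length=3, n_test_sets=3):
+assert list(expanding_window_indices(20, 3, 3)) == [
+    (14, [15, 16, 17]),
+    (15, [16, 17, 18]),
+    (16, [17, 18, 19]),
+]
+```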
+
+### What the Model Sees
+
+For each test split, the predictor receives:
+
+- **historic_data**: full dataset (all features including disease_cases) up to the split point
+- **future_data (masked)**: future covariates (e.g. climate data) *without* disease_cases -- this is what the model uses to make predictions
+- **future_data (truth)**: full future data including disease_cases -- used after prediction to evaluate accuracy
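+
+Schematically, the per-split loop inside `backtest()` combines these pieces as follows (simplified sketch based on the signatures above; the real implementation also threads through `stride` and the optional weather provider):
+
+```python
+train_set, test_generator = train_test_generator(data, prediction_length, n_test_sets)
+predictor = estimator.train(train_set)  # the model is trained exactly once
+
+for historic, masked_future, future_truth in test_generator:
+    samples = predictor.predict(historic, masked_future)
+    # samples are then merged with future_truth into a DataSet[SamplesWithTruth]
+```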
+
+## Key Components
+
+### `chap_core/assessment/dataset_splitting.py`
+
+Handles splitting datasets into train/test portions:
+
+- `train_test_generator()` -- main function implementing expanding window splits
+- `train_test_split()` -- single split at one time point
+- `split_test_train_on_period()` -- generates splits at multiple split points
+- `get_split_points_for_data_set()` -- computes evenly-spaced split points
+
+### `chap_core/assessment/prediction_evaluator.py`
+
+Runs the model and collects predictions:
+
+- `backtest()` -- trains model once, yields predictions for each split
+- `evaluate_model()` -- full evaluation with GluonTS metrics and PDF report
+
+### `chap_core/assessment/evaluation.py`
+
+High-level evaluation abstraction:
+
+- `Evaluation.create()` -- end-to-end factory: runs backtest and wraps results
+- `Evaluation.from_samples_with_truth()` -- builds an evaluation from raw prediction results
+- `Evaluation.to_file()` / `from_file()` -- NetCDF serialization for sharing results
+
+## Code Flow: `Evaluation.create()`
+
+Step-by-step walkthrough of what happens when `Evaluation.create()` is called (e.g. from the CLI `chap evaluate` command):
+
+1. **`backtest()`** is called with the estimator and dataset
+2. Inside `backtest()`, **`train_test_generator()`** computes the split index and creates:
+   - A training set (data up to the first split point)
+   - An iterator of (historic, masked_future, future_truth) tuples
+3. The estimator is **trained once** on the training set, producing a predictor
+4. For each test split, the predictor generates samples, which are **merged with ground truth** into `SamplesWithTruth` objects
+5. Back in `create()`, **`train_test_generator()`** is called again to determine the last training period
+6. **`from_samples_with_truth()`** assembles an `Evaluation` object containing:
+   - `BackTest` with all forecasts and observations
+   - Historical observations for plotting context
+7. The `Evaluation` can then be **exported** to NetCDF, used for metric computation, or visualized
diff --git a/mkdocs.yml b/mkdocs.yml
index dbed80a1e..2c0766ce1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -112,6 +112,7 @@ nav:
       - Database Migrations: contributor/database_migrations.md
       - Documentation: contributor/writing_building_documentation.md
       - Development Tools: contributor/development_tools.md
+      - Evaluation Pipeline: contributor/evaluation_pipeline.md
       - Evaluation Abstraction: contributor/evaluation_abstraction.md
       - Preference Learning: contributor/preference_learning.md
   - Web API: