135 changes: 124 additions & 11 deletions chap_core/assessment/dataset_splitting.py
@@ -1,3 +1,12 @@
"""Train/test splitting utilities for time series evaluation.

Provides functions for splitting spatio-temporal datasets into training and
test sets for model evaluation. The main entry point is ``train_test_generator``,
which implements an expanding window cross-validation strategy where the
training set grows with each successive split while the prediction window
slides forward.
"""

from typing import Iterable, Iterator, Optional, Protocol, Type

from chap_core.climate_predictor import FutureWeatherFetcher
@@ -18,6 +27,29 @@ def split_test_train_on_period(
include_future_weather: bool = False,
future_weather_class: Type[ClimateData] = ClimateData,
):
"""Generate train/test splits at each split point.

For each split point, produces a (train, test) tuple where training data
ends just before the split point and test data starts at the split point.

Parameters
----------
data_set
The full dataset to split.
split_points
Time periods at which to split (each becomes the start of a test set).
future_length
Optional time delta to limit test set length.
include_future_weather
If True, return (train, test, future_weather) tuples instead.
future_weather_class
Dataclass type for future weather data.

Yields
------
tuple
(train, test) or (train, test, future_weather) for each split point.
"""
func = train_test_split_with_weather if include_future_weather else train_test_split

if include_future_weather:
@@ -34,6 +66,27 @@ def train_test_split(
extension: Optional[IsTimeDelta] = None,
restrict_test=True,
):
"""Split a dataset into train and test sets at a single split point.

Parameters
----------
data_set
The full dataset.
prediction_start_period
First period of the test set. Training data ends at the period
immediately before this.
extension
Optional time delta to extend the test set end beyond
``prediction_start_period``.
restrict_test
If True, restrict the test set to the prediction window.
If False, return the full dataset as the test set.

Returns
-------
tuple[DataSet, DataSet]
(train_data, test_data).
"""
last_train_period = previous(prediction_start_period)
train_data = data_set.restrict_time_period(slice(None, last_train_period))
if extension is not None:
@@ -54,28 +107,50 @@ def train_test_generator(
stride: int = 1,
future_weather_provider: Optional[FutureWeatherFetcher] = None,
) -> tuple[DataSet, Iterator[tuple[DataSet, DataSet, DataSet]]]:
"""
Genereate a train set along with an iterator of test data that contains tuples of full data up until a
split point and data without target variables for the remaining steps
"""Generate expanding-window train/test splits for backtesting.

Implements an expanding window cross-validation strategy. A fixed training
set is returned (used to train the model once), along with an iterator of
``n_test_sets`` splits. Each split consists of:

- **historic_data**: all data up to the split point (expands each split)
- **masked_future_data**: future covariates *without* disease_cases
- **future_data**: full future data including disease_cases (ground truth)

The split indices are computed from the end of the dataset working
backwards::

split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)

Example with a dataset of 20 periods, prediction_length=3, n_test_sets=2,
stride=1::

split_idx = -(3 + (2 - 1) * 1 + 1) = -5 -> index 15

Split 0: historic = periods[..15], future = periods[16..18]
Split 1: historic = periods[..16], future = periods[17..19]

Train set = periods[..15] (same as split 0 historic)

Parameters
----------
dataset
The full dataset
The full dataset.
prediction_length
How many periods to predict
Number of periods to predict in each test window.
n_test_sets
How many test sets to generate
Number of test splits to generate.
stride
How many periods to stride between test sets
Number of periods to advance between successive splits.
future_weather_provider
A function that can provide future weather data for the test sets
Optional callable that provides future weather data (with
disease_cases masked) for each test split.

Returns
-------
tuple[DataSet, Iterable[tuple[DataSet, DataSet]]]
The train set and an iterator of test sets

tuple[DataSet, Iterable[tuple[DataSet, DataSet, DataSet]]]
The training set and an iterator of
(historic_data, masked_future_data, future_data) tuples.
"""
split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
train_set = dataset.restrict_time_period(slice(None, dataset.period_range[split_idx]))
@@ -121,12 +196,50 @@ def train_test_split_with_weather(


def get_split_points_for_data_set(data_set: DataSet, max_splits: int, start_offset=1) -> list[TimePeriod]:
"""Compute evenly-spaced split points for a dataset.

Uses the time periods from the first location (assumes all locations share
the same time range).

Parameters
----------
data_set
The dataset to compute split points for.
max_splits
Maximum number of split points to return.
start_offset
Number of initial periods to skip before the first possible split.

Returns
-------
list[TimePeriod]
Up to ``max_splits`` evenly-spaced time periods.
"""
periods = (
next(iter(data_set.data())).data().time_period
) # Uses the time for the first location, assumes it to be the same for all!
return get_split_points_for_period_range(max_splits, periods, start_offset)


def get_split_points_for_period_range(max_splits: int, periods, start_offset: int) -> list[TimePeriod]:
"""Compute evenly-spaced split points from a period range.

Divides the available periods (after ``start_offset``) into
``max_splits + 1`` equal segments and returns the boundary points.

Parameters
----------
max_splits
Maximum number of split points to return.
periods
Sequence of time periods to select from.
start_offset
Number of initial periods to skip.

Returns
-------
list[TimePeriod]
Up to ``max_splits`` evenly-spaced time periods.
"""
delta = (len(periods) - 1 - start_offset) // (max_splits + 1)
return list(periods)[start_offset + delta :: delta][:max_splits]
35 changes: 35 additions & 0 deletions chap_core/assessment/prediction_evaluator.py
@@ -1,3 +1,11 @@
"""Model evaluation through backtesting.

Provides functions for training a model and evaluating its predictions against
held-out test data using expanding window cross-validation. The main entry
points are ``backtest`` (yields per-split prediction results) and
``evaluate_model`` (runs a full evaluation with GluonTS metrics).
"""

import logging
from collections import defaultdict
from typing import Dict, Iterable, Protocol, TypeVar
@@ -40,6 +48,33 @@ def train(self, data: DataSet) -> Predictor: ...
def backtest(
estimator: Estimator, data: DataSet, prediction_length, n_test_sets, stride=1, weather_provider=None
) -> Iterable[DataSet]:
"""Train a model once and generate predictions for each test split.

Uses ``train_test_generator`` to create an expanding window split of the
data. The estimator is trained on the initial training set, then the
trained predictor generates forecasts for each successive test window.

Parameters
----------
estimator
Model estimator with a ``train`` method.
data
Full dataset to split and evaluate on.
prediction_length
Number of periods to predict per test window.
n_test_sets
Number of expanding window test splits.
stride
Periods to advance between successive splits.
weather_provider
Optional future weather data provider.

Yields
------
DataSet[SamplesWithTruth]
For each test split, a dataset mapping locations to
``SamplesWithTruth`` (predicted samples merged with observed values).
"""
train, test_generator = train_test_generator(
data, prediction_length, n_test_sets, future_weather_provider=weather_provider
)
150 changes: 150 additions & 0 deletions docs/contributor/evaluation_pipeline.md
@@ -0,0 +1,150 @@
# Evaluation Pipeline

This document explains how model evaluation (backtesting) works internally in CHAP, with a focus on the expanding window cross-validation strategy used to split time series data.

## Overview

The evaluation pipeline answers the question: *"How well does a model predict disease cases on data it has not seen?"*

It does this by:

1. Splitting a historical dataset into training and test portions
2. Training the model on the training data
3. Generating predictions for each test window
4. Comparing predictions against observed values (ground truth)

Because disease surveillance data is a time series, we cannot use random train/test splits. Instead, CHAP uses **expanding window cross-validation**, where the training data always precedes the test data chronologically.

## Pipeline Architecture

The evaluation flow from entry point to results:

```
Evaluation.create() # evaluation.py
|
+--> backtest() # prediction_evaluator.py
| |
| +--> train_test_generator() # dataset_splitting.py
| | Returns (train_set, splits_iterator)
| |
| +--> estimator.train(train_set)
| | Returns predictor
| |
| +--> for each split:
| predictor.predict(historic, future)
| Merge predictions with ground truth
| Yield DataSet[SamplesWithTruth]
|
+--> Evaluation.from_samples_with_truth()
Wraps results in an Evaluation object
```
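
In code, the body of `backtest()` follows the same shape. The sketch below is a simplified outline under the names used in the diagram, not the real implementation; in particular, `merge_with_truth` is a hypothetical placeholder for the step that attaches observed values to the predicted samples.

```python
from chap_core.assessment.dataset_splitting import train_test_generator


def backtest_sketch(estimator, data, prediction_length, n_test_sets, stride=1):
    """Simplified outline of backtest(); see prediction_evaluator.py for the real code."""
    train_set, splits = train_test_generator(data, prediction_length, n_test_sets, stride=stride)

    predictor = estimator.train(train_set)  # the model is trained exactly once

    for historic, masked_future, future_truth in splits:
        # the predictor never sees disease_cases for the forecast window
        samples = predictor.predict(historic, masked_future)
        # hypothetical helper: pair each forecast with its observed values
        yield merge_with_truth(samples, future_truth)
```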

## Expanding Window Cross-Validation

### The Problem

Standard k-fold cross-validation randomly assigns data points to folds. This is invalid for time series because:

- Models would train on future data and predict the past
- Temporal autocorrelation would leak information between folds

### The Strategy

CHAP uses an **expanding window** approach where:

- The model is trained once on an initial training set
- Multiple test windows are created by sliding forward through the data
- For each test window, the model receives all historical data up to that point

The key parameters are:

- **prediction_length**: how many periods each test window covers
- **n_test_sets**: how many test windows to create
- **stride**: how many periods to advance between windows

### How Split Indices Are Calculated

The `train_test_generator` function computes splits from the end of the dataset working backwards:

```
split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
```

This ensures the last test window ends at the final period of the dataset.
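
As a sanity check, the same arithmetic can be reproduced with ordinary list indices. The helper below is not part of CHAP; it just mirrors the formula so you can see which periods land in each window (compare with the worked example in the next section).

```python
def expanding_window_indices(n_periods, prediction_length, n_test_sets, stride=1):
    # Positive-index equivalent of split_idx = -(prediction_length + (n_test_sets - 1) * stride + 1)
    split_idx = n_periods - (prediction_length + (n_test_sets - 1) * stride + 1)
    windows = []
    for i in range(n_test_sets):
        last_historic = split_idx + i * stride
        future = list(range(last_historic + 1, last_historic + 1 + prediction_length))
        windows.append((last_historic, future))
    return split_idx, windows


print(expanding_window_indices(20, 3, 3))
# (14, [(14, [15, 16, 17]), (15, [16, 17, 18]), (16, [17, 18, 19])])
```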

### Concrete Example

Consider a dataset with 20 monthly periods (indices 0-19), `prediction_length=3`, `n_test_sets=3`, `stride=1`:

```
split_idx = -(3 + (3 - 1) * 1 + 1) = -6 -> index 14

Split 0: historic = [0..14], future = [15, 16, 17]
Split 1: historic = [0..15], future = [16, 17, 18]
Split 2: historic = [0..16], future = [17, 18, 19]

Train set = [0..14] (same as split 0 historic data)
```

Visually, with `T` = train, `H` = extra historic context, `F` = future/test:

```
Period: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

Train: T T T T T T T T T T T T T T T
Split 0: T T T T T T T T T T T T T T T F F F
Split 1: T T T T T T T T T T T T T T T H F F F
Split 2: T T T T T T T T T T T T T T T H H F F F
```

Note how the historic data expands with each split while the future window slides forward.
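
The same splits can be inspected directly from `train_test_generator`. This is a sketch under the assumptions that `dataset` is an in-memory `DataSet` with at least 20 periods and that `period_range` on the returned subsets can be indexed like a sequence (it is indexed that way inside `train_test_generator` itself).

```python
from chap_core.assessment.dataset_splitting import train_test_generator

train_set, splits = train_test_generator(dataset, prediction_length=3, n_test_sets=3, stride=1)

for i, (historic, masked_future, future_truth) in enumerate(splits):
    print(
        f"split {i}: historic ends at {historic.period_range[-1]}, "
        f"future covers {masked_future.period_range[0]}..{masked_future.period_range[-1]}"
    )
```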

### What the Model Sees

For each test split, the predictor receives the following (see the illustration after this list):

- **historic_data**: full dataset (all features including disease_cases) up to the split point
- **future_data (masked)**: future covariates (e.g. climate data) *without* disease_cases -- this is what the model uses to make predictions
- **future_data (truth)**: full future data including disease_cases -- used after prediction to evaluate accuracy
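
Purely as an illustration (plain dicts with invented numbers, not CHAP's `DataSet` structures), the three views of a single split look like this:

```python
historic_data = {            # everything up to the split point, target included
    "rainfall":      [120, 80, 95],
    "temperature":   [24.1, 25.3, 23.8],
    "disease_cases": [14, 9, 11],
}
masked_future_data = {       # future covariates the model is allowed to see
    "rainfall":      [60, 70, 110],
    "temperature":   [26.0, 25.5, 24.9],
    # disease_cases deliberately absent
}
future_data_truth = {        # same periods, target kept back for scoring only
    "rainfall":      [60, 70, 110],
    "temperature":   [26.0, 25.5, 24.9],
    "disease_cases": [17, 21, 13],
}
```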

## Key Components

### `chap_core/assessment/dataset_splitting.py`

Handles splitting datasets into train/test portions:

- `train_test_generator()` -- main function implementing expanding window splits
- `train_test_split()` -- single split at one time point
- `split_test_train_on_period()` -- generates splits at multiple split points
- `get_split_points_for_data_set()` -- computes evenly-spaced split points

### `chap_core/assessment/prediction_evaluator.py`

Runs the model and collects predictions:

- `backtest()` -- trains model once, yields predictions for each split
- `evaluate_model()` -- full evaluation with GluonTS metrics and PDF report

### `chap_core/assessment/evaluation.py`

High-level evaluation abstraction:

- `Evaluation.create()` -- end-to-end factory: runs backtest and wraps results
- `Evaluation.from_samples_with_truth()` -- builds evaluation from raw prediction results
- `Evaluation.to_file()` / `from_file()` -- NetCDF serialization for sharing results

## Code Flow: `Evaluation.create()`

Step-by-step walkthrough of what happens when `Evaluation.create()` is called (e.g. from the CLI `chap evaluate` command); a usage sketch follows the steps:

1. **`backtest()`** is called with the estimator and dataset
2. Inside `backtest()`, **`train_test_generator()`** computes the split index and creates:
- A training set (data up to the first split point)
- An iterator of (historic, masked_future, future_truth) tuples
3. The estimator is **trained once** on the training set, producing a predictor
4. For each test split, the predictor generates samples and they are **merged with ground truth** into `SamplesWithTruth` objects
5. Back in `create()`, **`train_test_generator()`** is called again to determine the last training period
6. **`from_samples_with_truth()`** assembles an `Evaluation` object containing:
- `BackTest` with all forecasts and observations
- Historical observations for plotting context
7. The `Evaluation` can then be **exported** to NetCDF, used for metric computation, or visualized
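
A minimal usage sketch of this entry point is shown below. The keyword names passed to `Evaluation.create()` are assumptions made for illustration; check `evaluation.py` for the actual signature.

```python
from chap_core.assessment.evaluation import Evaluation

# Sketch only: parameter names are assumed, not verified against evaluation.py.
evaluation = Evaluation.create(
    estimator=my_estimator,       # any object with a train() method returning a predictor
    dataset=historical_dataset,   # full spatio-temporal DataSet
    prediction_length=3,
    n_test_sets=3,
)

evaluation.to_file("evaluation_results.nc")  # NetCDF export, as described above
```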
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -112,6 +112,7 @@ nav:
- Database Migrations: contributor/database_migrations.md
- Documentation: contributor/writing_building_documentation.md
- Development Tools: contributor/development_tools.md
- Evaluation Pipeline: contributor/evaluation_pipeline.md
- Evaluation Abstraction: contributor/evaluation_abstraction.md
- Preference Learning: contributor/preference_learning.md
- Web API: