1,151 changes: 1,151 additions & 0 deletions docs/how_to/uncertainty_parameter_sampling.ipynb

Large diffs are not rendered by default.

Binary file added docs/how_to/uncertainty_workflow.png
6 changes: 5 additions & 1 deletion docs/tutorial/10_uncertainty_analysis.ipynb
@@ -6,7 +6,11 @@
"source": [
"# Uncertainty analysis\n",
"\n",
"Here we will demonstrate a full uncertainty analysis of the inversion. We use a stochastic approach, where we 1) choose the important input parameters to the inversion, 2) define each of there uncertainty distributions, 3) run a series of inversions which sample these inputs from their uncertainty distributions, and 4) use the ensemble of inverted topography models to define the mean result and the uncertainty."
"**Invert4Geom** uses a stochastic approach for quantifying uncertainties. Simply put, an inversion can be treated as a black box, which takes inputs and outputs a topography model. This inputs include gravity data, a starting model, and various parameters, such as amount of damping, density contrast, reference level, and parameters for estimating the regional gravity field. The concept of a stochastic uncertainty analysis is to see how changes to these *inputs* affects the *output* (inverted topography). \n",
"\n",
"For example, we can perform several inversions, changing the density contrast value used in each one, and compare the inverted topography for each of these. For this produced **ensemble** of inverted topography models, we can calculated the cell-wise mean and the cell-wise standard deviation. The grid of cell-wise means would be our stochastic topography model, and the grid of cell-wise standard deviations would be our estimate of the spatially-variable uncertainty of the model. This procedure is described in the below figure.\n",
"\n",
"![uncertainty_workflow](uncertainty_workflow.png)"
]
},
{
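The ensemble statistics described in the tutorial cell above can be sketched in a few lines. A minimal, illustrative example (not part of this diff), assuming each inversion result is an `xarray.DataArray` on a shared grid:

```python
import xarray as xr


def ensemble_stats(topo_models: list[xr.DataArray]) -> tuple[xr.DataArray, xr.DataArray]:
    # stack the inverted topographies along a new "run" dimension
    ensemble = xr.concat(topo_models, dim="run")
    # cell-wise mean -> the stochastic topography model
    # cell-wise standard deviation -> the spatially variable uncertainty
    return ensemble.mean(dim="run"), ensemble.std(dim="run")
```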
3 changes: 3 additions & 0 deletions src/invert4geom/__init__.py
@@ -54,14 +54,17 @@
synthetic_topography_simple,
)
from .uncertainty import ( # noqa: E402
create_lhc,
full_workflow_uncertainty_loop,
merged_stats,
randomly_sample_data,
regional_misfit_uncertainty,
)
from .utils import ( # noqa: E402
create_topography,
dist_nearest_points,
gravity_decay_buffer,
normalize_xarray,
normalized_mindist,
optimal_spline_damping,
rmse,
162 changes: 134 additions & 28 deletions src/invert4geom/plotting.py
@@ -1430,25 +1430,12 @@

plt.show()

dim = np.shape(df)[1]

param_values = df.to_numpy()

problem = {
"num_vars": dim,
"names": [i.replace("_", " ") for i in df.columns],
"bounds": [[-1, 1]] * dim,
}

# Rescale to the unit hypercube for the analysis
sample = utils.scale_normalized(param_values, problem["bounds"])

# 2D projection
if plot_2d_projections:
if len(df.columns) == 1:
pass
else:
plot_sampled_projection_2d(sample, problem["names"])
plot_sampled_projection_2d(df)


def projection_2d(
@@ -1466,36 +1453,155 @@


def plot_sampled_projection_2d(
sample: NDArray,
var_names: list[str],
sample: pd.DataFrame,
) -> None:
"""
Plots the samples projected on each 2D plane

Parameters
----------
sample : numpy.ndarray
sample : pd.DataFrame
The sampled values
var_names : list[str]
The names of the variables
"""
dim = sample.shape[1]
df = sample.copy()
var_names = df.columns.tolist()
dim = len(var_names)

norm_var_names = [f"{c}_normalized" for c in var_names]

# Rescale to the unit hypercube for the analysis
for c in var_names:
df[f"{c}_normalized"] = utils.normalize(df[c].values, low=0, high=1)

fig, axs = plt.subplots(dim, dim, figsize=(6, 6)) # nrows, ncols

cmap = plt.cm.get_cmap("Greys")

vals_list = []
da_list = []
for i in range(dim):
for j in range(dim):
if i == j:
min_dist = None
else:
# create grid of parameter coordinates
(x, y) = vd.grid_coordinates(
region=[
df[norm_var_names[j]].min(),
df[norm_var_names[j]].max(),
df[norm_var_names[i]].min(),
df[norm_var_names[i]].max(),
],
shape=(100, 100),
pixel_register=True,
)

Pylint warning W0632 on line 1497: Possible unbalanced tuple unpacking with sequence defined at line 596 of verde.coordinates: left side has 2 labels, right side has 0 values

# make dataarray of parameter space
grid = vd.make_xarray_grid(
(x, y),
np.ones_like(x),
dims=(norm_var_names[i], norm_var_names[j]),
data_names="data",
).data

# distance from each grid cell to its nearest sample point; used to shade undersampled regions
min_dist = utils.normalized_mindist(
df[[norm_var_names[i], norm_var_names[j]]],
grid=grid,
# low=0,
# high=1,
)
vals_list.append(min_dist.to_numpy().flatten())
da_list.append(min_dist)

vals = np.concatenate(vals_list)
# cpt_lims = (0, polar_utils.get_combined_min_max(vals)[1])
cpt_lims = (0, 0.5)
k = 0
for i in range(dim):
for j in range(dim):
plt.subplot(dim, dim, i * dim + j + 1)
plt.scatter(
sample[:, j],
sample[:, i],
if da_list[k] is not None:
im = da_list[k].plot(
ax=axs[i, j],
cmap=cmap,
add_colorbar=False,
vmin=cpt_lims[0],
vmax=cpt_lims[1],
add_labels=False,
)

axs[i, j].scatter(
df[norm_var_names[j]],
df[norm_var_names[i]],
s=2,
zorder=10,
clip_on=False,
)
axs[i, j].set_xlim(df[norm_var_names[j]].min(), df[norm_var_names[j]].max())
axs[i, j].set_ylim(df[norm_var_names[i]].min(), df[norm_var_names[i]].max())

if j == 0:
plt.ylabel(var_names[i], rotation=0, ha="right")
axs[i, j].set_ylabel(var_names[i], rotation=0, ha="right")
if i == dim - 1:
plt.xlabel(var_names[j], rotation=20, ha="right")
axs[i, j].set_xlabel(var_names[j], rotation=20, ha="right")

# axs[i, j].set_xticks([])
# axs[i, j].set_yticks([])

# set 3 ticks for each subplot
axs[i, j].xaxis.set_major_locator(plt.MaxNLocator(2))
axs[i, j].yaxis.set_major_locator(plt.MaxNLocator(2))

# add actual variable values as tick labels
x_ticks = axs[i, j].get_xticks()
y_ticks = axs[i, j].get_yticks()
x_tick_labels = []
y_tick_labels = []
for xt in x_ticks:
# rescale to actual variable values
val = (
xt * (df[var_names[j]].max() - df[var_names[j]].min())
+ df[var_names[j]].min()
)
x_tick_labels.append(f"{val:.2f}")
for yt in y_ticks:
val = (
yt * (df[var_names[i]].max() - df[var_names[i]].min())
+ df[var_names[i]].min()
)
y_tick_labels.append(f"{val:.2f}")
axs[i, j].set_xticklabels(x_tick_labels)
axs[i, j].set_yticklabels(y_tick_labels)

k += 1

# add colorbar of distances
x = list(np.arange(cpt_lims[0], cpt_lims[1], 0.1))
x = [round(i, 2) for i in x]
if round(cpt_lims[1], 2) not in x:
x.append(cpt_lims[1])
cbar_ax = fig.add_axes([0.2, -0.04, 0.6, 0.02])
fig.colorbar(
im,

Pylint failure E0606 on line 1584: Possibly using variable 'im' before assignment
cax=cbar_ax,
orientation="horizontal",
label="Normalized distance to nearest sample",
ticks=x,
format=mpl.ticker.FormatStrFormatter("%.2g"), # "%.2f"),
)

# add histogram to colorbar
ll, bb, ww, hh = cbar_ax.get_position().bounds
hist_ax = fig.add_axes([ll, bb + hh, ww, 0.06])
_n, bins, patches = hist_ax.hist(vals, bins=100)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
col = bin_centers - min(bin_centers)
col /= max(col)
for c, p in zip(col, patches, strict=False):
plt.setp(p, "facecolor", cmap(c))

hist_ax.set_xlim(cpt_lims[0], cpt_lims[1])
hist_ax.set_axis_off()

plt.xticks([])
plt.yticks([])
plt.show()


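A hypothetical call of the refactored `plot_sampled_projection_2d`, which now takes a `pandas.DataFrame` of sampled values directly (column names and values below are illustrative only):

```python
import numpy as np
import pandas as pd

from invert4geom.plotting import plot_sampled_projection_2d

rng = np.random.default_rng(1)
samples = pd.DataFrame({
    "density_contrast": rng.uniform(300, 700, size=20),
    "log10_damping": rng.uniform(-4, 2, size=20),
})
# one panel per pair of parameters; background shading shows the
# normalized distance from each grid cell to the nearest sample
plot_sampled_projection_2d(samples)
```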
104 changes: 98 additions & 6 deletions src/invert4geom/uncertainty.py
@@ -39,6 +39,97 @@
if typing.TYPE_CHECKING:
from invert4geom.inversion import Inversion

from scipy.special import erf

Pylint notice C0411 on line 42: third party import "scipy.special.erf" should be placed before first party import "invert4geom.inversion"
Pylint failure E0611 on line 42: No name 'erf' in module 'scipy.special'


def mann_kendall_test(y, prec):
"""
Mann-Kendall trend test ('prec' is the number of decimals used when checking for ties)
Outputs are the normalized statistic Z and the associated p-value
"""
n = len(y)
x = np.int_(y * (10**prec))

# Sign matrix and ties
sm = np.zeros((n - 1, n - 1))
for i in range(n - 1):
sm[i, i:n] = np.sign(x[i + 1 : n] - x[0 : n - 1 - i]) # E203

# Compute MK statistic
s = np.sum(sm)

# Count ties and their contributions to the variance of the MK statistic
[val, count] = np.unique(x, return_counts=True)

Pylint warning W0612 on line 63: Unused variable 'val'
[extent, ties] = np.unique(count, return_counts=True)
tie_contribution = np.zeros(len(ties))
for i in range(len(ties)):
tie_contribution[i] = (
ties[i] * extent[i] * (extent[i] - 1) * (2 * extent[i] + 5)
)

Pylint notice C0200 on line 69: Consider using enumerate instead of iterating with range and len

# Compute the variance
vs = (n * (n - 1) * (2 * n + 5) - np.sum(tie_contribution)) / 18
if vs < 0:
print("WARNING: negative variance!!!")

# Compute standard normal statistic
z = (s - np.sign(s)) / np.sqrt(max(vs, 1))

# Associated p-value
pval = 1 - erf(abs(z) / np.sqrt(2))

return [z, pval]
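A quick sanity check of `mann_kendall_test` (illustrative, not part of this diff): a monotonically increasing series should give a large positive Z and a near-zero p-value.

```python
import numpy as np

from invert4geom.uncertainty import mann_kendall_test

rng = np.random.default_rng(0)
y = np.linspace(0.0, 1.0, 50) + rng.normal(0.0, 0.05, 50)
z, pval = mann_kendall_test(y, 5)  # expect z >> 0 and pval close to 0
```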


def mann_kendall_test_sample(sample):
"""
Same as above, but applied pairwise to all variables of a sample
Outputs are the matrices of normalized statistics Z and associated p-values
"""
# Local variables
n = sample.shape[0]
var = sample.shape[1]
x = np.argsort(
sample, axis=0
) # Ranks of the values in the ensemble, for each variable
mk_res = np.zeros((var, var))
pval = np.zeros((var, var))

# MK test results
for i in range(var):
reorder_sample = np.zeros((n, var))
for j in range(n):
reorder_sample[j, :] = sample[x[j, i], :]
for v in np.arange(i + 1, var):
[mk_res[i, v], pval[i, v]] = mann_kendall_test(reorder_sample[:, v], 5)
[mk_res[v, i], pval[v, i]] = [mk_res[i, v], pval[i, v]]

return [mk_res, pval]


from scipy.stats import pearsonr

Pylint notice C0411 on line 111: third party import "scipy.stats.pearsonr" should be placed before first party import "invert4geom.inversion"


def pearson_test_sample(sample):
"""
Pearson correlation test applied pairwise to all variables of a sample. Outputs are:
the matrix of Pearson statistics rho
the matrix of p-values pval
"""
# Local variables
var = sample.shape[1]
rho = np.zeros((var, var))
pval = np.zeros((var, var))

# Pearson test results
for i in range(var):
for v in np.arange(i + 1, var):
[rho[i, v], pval[i, v]] = pearsonr(sample[:, i], sample[:, v])
[rho[v, i], pval[v, i]] = [rho[i, v], pval[i, v]]

return [rho, pval]
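An illustrative run of the sample-wise tests above on two strongly anticorrelated variables (synthetic data, not from the package):

```python
import numpy as np

from invert4geom.uncertainty import mann_kendall_test_sample, pearson_test_sample

rng = np.random.default_rng(0)
a = rng.uniform(0.0, 1.0, 100)
sample = np.column_stack([a, -a + rng.normal(0.0, 0.01, 100)])

rho, rho_pval = pearson_test_sample(sample)  # rho[0, 1] should be close to -1
mk, mk_pval = mann_kendall_test_sample(sample)  # large negative Z off the diagonal
```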


def create_lhc(
n_samples: int,
Expand All @@ -57,12 +148,13 @@
parameter_dict : dict
nested dictionary, with a dictionary of 'distribution', 'loc', 'scale' and
optionally 'log' for each parameter to be sampled. Distributions can be
'uniform' or 'normal'. For 'uniform', 'loc' is the lower bound and 'scale' is
the range of the distribution. 'loc' + 'scale' = upper bound. For 'normal',
'loc' is the center (mean) of the distribution and 'scale' is the standard
deviation. If 'log' is True, the provided 'loc' and 'scale' values are the base
10 exponents. For example, a uniform distribution with loc=-4, scale=6 and
log=True would sample values between 1e-4 and 1e2.
'uniform', 'uniform_discrete', or 'normal'. For 'uniform' and
'uniform_discrete', 'loc' is the lower bound and 'scale' is the range of the
distribution ('loc' + 'scale' = upper bound). For 'normal', 'loc' is the center
(mean) of the distribution and 'scale' is the standard deviation. If 'log' is
True, the provided 'loc' and 'scale' values are the base 10 exponents. For
example, a uniform distribution with loc=-4, scale=6 and log=True would sample
values between 1e-4 and 1e2.
random_state : int, optional
random state to use for sampling, by default 1
criterion : str, optional
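A sketch of how the `parameter_dict` conventions documented above map onto `scipy.stats` distributions (illustrative only, not the library's internal implementation; the parameter names are made up and only the 'uniform' and 'normal' cases are handled):

```python
import numpy as np
from scipy.stats import norm, qmc, uniform

parameter_dict = {
    "density_contrast": {"distribution": "normal", "loc": 500, "scale": 50},
    "damping": {"distribution": "uniform", "loc": -4, "scale": 6, "log": True},
}

# Latin hypercube points in the unit hypercube, one column per parameter
sampler = qmc.LatinHypercube(d=len(parameter_dict), seed=1)
unit_sample = sampler.random(n=10)

samples = {}
for k, (name, p) in enumerate(parameter_dict.items()):
    dist = norm if p["distribution"] == "normal" else uniform
    # map the unit-hypercube column through the inverse CDF
    vals = dist(loc=p["loc"], scale=p["scale"]).ppf(unit_sample[:, k])
    if p.get("log", False):
        vals = 10**vals  # 'loc' and 'scale' were base-10 exponents
    samples[name] = vals
# "damping" values now lie between 1e-4 and 1e2, matching the docstring example
```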