
Commit 68dc288

committed WIP

1 parent fc4e8f4 commit 68dc288

File tree

7 files changed: +1435 -58 lines changed


docs/how_to/uncertainty_parameter_sampling.ipynb

Lines changed: 1151 additions & 0 deletions
Large diffs are not rendered by default.
1.57 MB

docs/tutorial/10_uncertainty_analysis.ipynb

Lines changed: 5 additions & 1 deletion

@@ -6,7 +6,11 @@
    "source": [
     "# Uncertainty analysis\n",
     "\n",
-    "Here we will demonstrate a full uncertainty analysis of the inversion. We use a stochastic approach, where we 1) choose the important input parameters to the inversion, 2) define each of their uncertainty distributions, 3) run a series of inversions which sample these inputs from their uncertainty distributions, and 4) use the ensemble of inverted topography models to define the mean result and the uncertainty."
+    "**Invert4Geom** uses a stochastic approach for quantifying uncertainties. Simply put, an inversion can be treated as a black box that takes inputs and outputs a topography model. These inputs include gravity data, a starting model, and various parameters, such as the amount of damping, the density contrast, the reference level, and parameters for estimating the regional gravity field. The concept of a stochastic uncertainty analysis is to see how changes to these *inputs* affect the *output* (the inverted topography).\n",
+    "\n",
+    "For example, we can perform several inversions, changing the density contrast value used in each one, and compare the inverted topography from each. From this **ensemble** of inverted topography models, we can calculate the cell-wise mean and the cell-wise standard deviation. The grid of cell-wise means is our stochastic topography model, and the grid of cell-wise standard deviations is our estimate of the spatially variable uncertainty of the model. This procedure is illustrated in the figure below.\n",
+    "\n",
+    "![uncertainty_workflow](uncertainty_workflow.png)"
    ]
   },
   {
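The added notebook text describes taking cell-wise means and standard deviations across an ensemble of inverted topography models. A minimal sketch of that step, assuming the ensemble is a set of xarray grids on a common grid (the toy data and variable names here are illustrative, not part of the package):

```python
import numpy as np
import xarray as xr

# toy ensemble: 20 "inverted topography" grids on a shared 50 x 50 grid
rng = np.random.default_rng(0)
runs = [
    xr.DataArray(
        rng.normal(loc=-500, scale=50, size=(50, 50)),
        dims=("northing", "easting"),
        name="topography",
    )
    for _ in range(20)
]

# stack the ensemble along a new "run" dimension
stacked = xr.concat(runs, dim="run")

# cell-wise statistics across the ensemble
mean_topo = stacked.mean("run")  # the stochastic topography model
std_topo = stacked.std("run")    # spatially variable uncertainty estimate
```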

src/invert4geom/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -54,14 +54,17 @@
     synthetic_topography_simple,
 )
 from .uncertainty import (  # noqa: E402
+    create_lhc,
     full_workflow_uncertainty_loop,
     merged_stats,
     randomly_sample_data,
     regional_misfit_uncertainty,
 )
 from .utils import (  # noqa: E402
     create_topography,
+    dist_nearest_points,
     gravity_decay_buffer,
+    normalize_xarray,
     normalized_mindist,
     optimal_spline_damping,
     rmse,

src/invert4geom/plotting.py

Lines changed: 134 additions & 28 deletions

@@ -1430,25 +1430,12 @@ def plot_latin_hypercube(
 
     plt.show()
 
-    dim = np.shape(df)[1]
-
-    param_values = df.to_numpy()
-
-    problem = {
-        "num_vars": dim,
-        "names": [i.replace("_", " ") for i in df.columns],
-        "bounds": [[-1, 1]] * dim,
-    }
-
-    # Rescale to the unit hypercube for the analysis
-    sample = utils.scale_normalized(param_values, problem["bounds"])
-
     # 2D projection
     if plot_2d_projections:
         if len(df.columns) == 1:
             pass
         else:
-            plot_sampled_projection_2d(sample, problem["names"])
+            plot_sampled_projection_2d(df)
 
 
 def projection_2d(
@@ -1466,36 +1453,155 @@ def projection_2d(
 
 
 def plot_sampled_projection_2d(
-    sample: NDArray,
-    var_names: list[str],
+    sample: pd.DataFrame,
 ) -> None:
     """
     Plots the samples projected on each 2D plane
 
     Parameters
     ----------
-    sample : numpy.ndarray
+    sample : pd.DataFrame
         The sampled values
-    var_names : list[str]
-        The names of the variables
     """
-    dim = sample.shape[1]
+    df = sample.copy()
+    var_names = df.columns.tolist()
+    dim = len(var_names)
+
+    norm_var_names = [f"{c}_normalized" for c in var_names]
+
+    # Rescale to the unit hypercube for the analysis
+    for c in var_names:
+        df[f"{c}_normalized"] = utils.normalize(df[c].values, low=0, high=1)
+
+    fig, axs = plt.subplots(dim, dim, figsize=(6, 6))  # nrows, ncols
+
+    cmap = plt.cm.get_cmap("Greys")
+
+    vals_list = []
+    da_list = []
+    for i in range(dim):
+        for j in range(dim):
+            if i == j:
+                min_dist = None
+            else:
+                # create grid of parameter coordinates
+                (x, y) = vd.grid_coordinates(
+                    region=[
+                        df[norm_var_names[j]].min(),
+                        df[norm_var_names[j]].max(),
+                        df[norm_var_names[i]].min(),
+                        df[norm_var_names[i]].max(),
+                    ],
+                    shape=(100, 100),
+                    pixel_register=True,
+                )
+
+                # make dataarray of parameter space
+                grid = vd.make_xarray_grid(
+                    (x, y),
+                    np.ones_like(x),
+                    dims=(norm_var_names[i], norm_var_names[j]),
+                    data_names="data",
+                ).data
+
+                min_dist = utils.normalized_mindist(
+                    df[[norm_var_names[i], norm_var_names[j]]],
+                    grid=grid,
+                    # low=0,
+                    # high=1,
+                )
+                vals_list.append(min_dist.to_numpy().flatten())
+            da_list.append(min_dist)
 
+    vals = np.concatenate(vals_list)
+    # cpt_lims = (0, polar_utils.get_combined_min_max(vals)[1])
+    cpt_lims = (0, 0.5)
+    k = 0
     for i in range(dim):
         for j in range(dim):
-            plt.subplot(dim, dim, i * dim + j + 1)
-            plt.scatter(
-                sample[:, j],
-                sample[:, i],
+            if da_list[k] is not None:
+                im = da_list[k].plot(
+                    ax=axs[i, j],
+                    cmap=cmap,
+                    add_colorbar=False,
+                    vmin=cpt_lims[0],
+                    vmax=cpt_lims[1],
+                    add_labels=False,
+                )
+
+            axs[i, j].scatter(
+                df[norm_var_names[j]],
+                df[norm_var_names[i]],
                 s=2,
+                zorder=10,
+                clip_on=False,
             )
+            axs[i, j].set_xlim(df[norm_var_names[j]].min(), df[norm_var_names[j]].max())
+            axs[i, j].set_ylim(df[norm_var_names[i]].min(), df[norm_var_names[i]].max())
+
             if j == 0:
-                plt.ylabel(var_names[i], rotation=0, ha="right")
+                axs[i, j].set_ylabel(var_names[i], rotation=0, ha="right")
             if i == dim - 1:
-                plt.xlabel(var_names[j], rotation=20, ha="right")
+                axs[i, j].set_xlabel(var_names[j], rotation=20, ha="right")
+
+            # axs[i, j].set_xticks([])
+            # axs[i, j].set_yticks([])
+
+            # set 3 ticks for each subplot
+            axs[i, j].xaxis.set_major_locator(plt.MaxNLocator(2))
+            axs[i, j].yaxis.set_major_locator(plt.MaxNLocator(2))
+
+            # add actual variable values as tick labels
+            x_ticks = axs[i, j].get_xticks()
+            y_ticks = axs[i, j].get_yticks()
+            x_tick_labels = []
+            y_tick_labels = []
+            for xt in x_ticks:
+                # rescale to actual variable values
+                val = (
+                    xt * (df[var_names[j]].max() - df[var_names[j]].min())
+                    + df[var_names[j]].min()
+                )
+                x_tick_labels.append(f"{val:.2f}")
+            for yt in y_ticks:
+                val = (
+                    yt * (df[var_names[i]].max() - df[var_names[i]].min())
+                    + df[var_names[i]].min()
+                )
+                y_tick_labels.append(f"{val:.2f}")
+            axs[i, j].set_xticklabels(x_tick_labels)
+            axs[i, j].set_yticklabels(y_tick_labels)
+
+            k += 1
+
+    # add colorbar of distances
+    x = list(np.arange(cpt_lims[0], cpt_lims[1], 0.1))
+    x = [round(i, 2) for i in x]
+    if round(cpt_lims[1], 2) not in x:
+        x.append(cpt_lims[1])
+    cbar_ax = fig.add_axes([0.2, -0.04, 0.6, 0.02])
+    fig.colorbar(
+        im,
+        cax=cbar_ax,
+        orientation="horizontal",
+        label="Normalized distance to nearest sample",
+        ticks=x,
+        format=mpl.ticker.FormatStrFormatter("%.2g"),  # "%.2f"),
+    )
+
+    # add histogram to colorbar
+    ll, bb, ww, hh = cbar_ax.get_position().bounds
+    hist_ax = fig.add_axes([ll, bb + hh, ww, 0.06])
+    _n, bins, patches = hist_ax.hist(vals, bins=100)
+    bin_centers = 0.5 * (bins[:-1] + bins[1:])
+    col = bin_centers - min(bin_centers)
+    col /= max(col)
+    for c, p in zip(col, patches, strict=False):
+        plt.setp(p, "facecolor", cmap(c))
+
+    hist_ax.set_xlim(cpt_lims[0], cpt_lims[1])
+    hist_ax.set_axis_off()
 
-            plt.xticks([])
-            plt.yticks([])
     plt.show()
 
 
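The reworked plot_sampled_projection_2d shades each off-diagonal panel by the distance from every point of the normalized parameter plane to its nearest sample, which appears to be what utils.normalized_mindist supplies. A standalone sketch of that quantity, assuming a plain nearest-neighbour distance is what is being mapped (using scipy's cKDTree rather than the invert4geom helper):

```python
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(1)
samples = rng.uniform(size=(20, 2))  # 20 draws of two parameters, normalized to [0, 1]

# 100 x 100 grid covering the unit square of the two normalized parameters
x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
grid_points = np.column_stack([x.ravel(), y.ravel()])

# distance from each grid point to its nearest sample; small values everywhere
# indicate that the Latin hypercube fills this 2D projection well
min_dist, _ = cKDTree(samples).query(grid_points, k=1)
min_dist = min_dist.reshape(x.shape)
```

The colorbar histogram added in the diff then summarizes the distribution of these distances pooled over all panels.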

src/invert4geom/uncertainty.py

Lines changed: 98 additions & 6 deletions

@@ -39,6 +39,97 @@ def __init__(
 if typing.TYPE_CHECKING:
     from invert4geom.inversion import Inversion
 
+from scipy.special import erf
+
+
+def mann_kendall_test(y, prec):
+    """
+    Mann-Kendall test (precision is the number of decimals)
+    Outputs are the normalized statistic Z and the associated p-value
+    """
+    n = len(y)
+    x = np.int_(y * (10**prec))
+
+    # Sign matrix and ties
+    sm = np.zeros((n - 1, n - 1))
+    for i in range(n - 1):
+        sm[i, i:n] = np.sign(x[i + 1 : n] - x[0 : n - 1 - i])  # E203
+
+    # Compute MK statistic
+    s = np.sum(sm)
+
+    # Count ties and their contributions
+    # to the variance of the MK statistic
+    [val, count] = np.unique(x, return_counts=True)
+    [extent, ties] = np.unique(count, return_counts=True)
+    tie_contribution = np.zeros(len(ties))
+    for i in range(len(ties)):
+        tie_contribution[i] = (
+            ties[i] * extent[i] * (extent[i] - 1) * (2 * extent[i] + 5)
+        )
+
+    # Compute the variance
+    vs = (n * (n - 1) * (2 * n + 5) - np.sum(tie_contribution)) / 18
+    if vs < 0:
+        print("WARNING: negative variance!!!")
+
+    # Compute standard normal statistic
+    z = (s - np.sign(s)) / np.sqrt(max(vs, 1))
+
+    # Associated p-value
+    pval = 1 - erf(abs(z) / np.sqrt(2))
+
+    return [z, pval]
+
+
+def mann_kendall_test_sample(sample):
+    """
+    Same as above, but for a whole sample
+    Outputs are the normalized statistic Z and the associated p-value
+    """
+    # Local variables
+    n = sample.shape[0]
+    var = sample.shape[1]
+    x = np.argsort(
+        sample, axis=0
+    )  # Ranks of the values in the ensemble, for each variable
+    mk_res = np.zeros((var, var))
+    pval = np.zeros((var, var))
+
+    # MK test results
+    for i in range(var):
+        reorder_sample = np.zeros((n, var))
+        for j in range(n):
+            reorder_sample[j, :] = sample[x[j, i], :]
+        for v in np.arange(i + 1, var):
+            [mk_res[i, v], pval[i, v]] = mann_kendall_test(reorder_sample[:, v], 5)
+            [mk_res[v, i], pval[v, i]] = [mk_res[i, v], pval[i, v]]
+
+    return [mk_res, pval]
+
+
+from scipy.stats import pearsonr
+
+
+def pearson_test_sample(sample):
+    """
+    Pearson correlation test for a whole sample. Outputs are:
+    the Pearson statistic rho
+    the p-value pval
+    """
+    # Local variables
+    var = sample.shape[1]
+    rho = np.zeros((var, var))
+    pval = np.zeros((var, var))
+
+    # Pearson test results
+    for i in range(var):
+        for v in np.arange(i + 1, var):
+            [rho[i, v], pval[i, v]] = pearsonr(sample[:, i], sample[:, v])
+            [rho[v, i], pval[v, i]] = [rho[i, v], pval[i, v]]
+
+    return [rho, pval]
+
 
 def create_lhc(
     n_samples: int,
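A brief usage sketch for the two sample-wide tests added in this hunk, which take a 2D array of shape (n_samples, n_parameters) such as the values of a Latin hypercube sample; the import path is an assumption for this WIP module:

```python
import numpy as np
from invert4geom import uncertainty  # assumed import path; these helpers are new in this commit

rng = np.random.default_rng(0)
sample = rng.uniform(size=(50, 3))  # 50 draws of 3 parameters

# pairwise Pearson correlations and p-values between parameters; near-zero rho
# suggests the sampling introduced no spurious linear correlations
rho, rho_pval = uncertainty.pearson_test_sample(sample)

# pairwise Mann-Kendall statistics and p-values; being rank-based, this also
# flags monotonic but non-linear dependence between parameters
mk, mk_pval = uncertainty.mann_kendall_test_sample(sample)
```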
@@ -57,12 +148,13 @@ def create_lhc(
     parameter_dict : dict
         nested dictionary, with a dictionary of 'distribution', 'loc', 'scale' and
         optionally 'log' for each parameter to be sampled. Distributions can be
-        'uniform' or 'normal'. For 'uniform', 'loc' is the lower bound and 'scale' is
-        the range of the distribution. 'loc' + 'scale' = upper bound. For 'normal',
-        'loc' is the center (mean) of the distribution and 'scale' is the standard
-        deviation. If 'log' is True, the provided 'loc' and 'scale' values are the base
-        10 exponents. For example, a uniform distribution with loc=-4, scale=6 and
-        log=True would sample values between 1e-4 and 1e2.
+        'uniform', 'uniform_discrete', or 'normal'. For 'uniform' and
+        'uniform_discrete', 'loc' is the lower bound and 'scale' is the range of the
+        distribution ('loc' + 'scale' = upper bound). For 'normal', 'loc' is the center
+        (mean) of the distribution and 'scale' is the standard deviation. If 'log' is
+        True, the provided 'loc' and 'scale' values are the base 10 exponents. For
+        example, a uniform distribution with loc=-4, scale=6 and log=True would sample
+        values between 1e-4 and 1e2.
     random_state : int, optional
         random state to use for sampling, by default 1
     criterion : str, optional
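To illustrate the documented parameter_dict format, a hedged example of building one and passing it to create_lhc; the parameter names are made up, only the arguments visible in this diff (n_samples, parameter_dict, random_state) are used, and the return value is assumed to be the sampled values:

```python
import invert4geom

# each entry follows the documented 'distribution' / 'loc' / 'scale' / 'log' format
parameter_dict = {
    # uniform between 1500 and 2500 ('loc' + 'scale' = upper bound)
    "density_contrast": {"distribution": "uniform", "loc": 1500, "scale": 1000},
    # log-uniform between 1e-4 and 1e2 ('loc' and 'scale' are base-10 exponents)
    "solver_damping": {"distribution": "uniform", "loc": -4, "scale": 6, "log": True},
    # normal with mean -500 and standard deviation 50
    "zref": {"distribution": "normal", "loc": -500, "scale": 50},
}

# sketch of the call; keyword names follow the docstring shown above
sampled = invert4geom.create_lhc(
    n_samples=20,
    parameter_dict=parameter_dict,
    random_state=1,
)
```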
