1,151 changes: 1,151 additions & 0 deletions docs/how_to/uncertainty_parameter_sampling.ipynb

Large diffs are not rendered by default.

Binary file added docs/how_to/uncertainty_workflow.png
6 changes: 5 additions & 1 deletion docs/tutorial/10_uncertainty_analysis.ipynb
@@ -6,7 +6,11 @@
"source": [
"# Uncertainty analysis\n",
"\n",
"Here we will demonstrate a full uncertainty analysis of the inversion. We use a stochastic approach, where we 1) choose the important input parameters to the inversion, 2) define each of there uncertainty distributions, 3) run a series of inversions which sample these inputs from their uncertainty distributions, and 4) use the ensemble of inverted topography models to define the mean result and the uncertainty."
"**Invert4Geom** uses a stochastic approach for quantifying uncertainties. Simply put, an inversion can be treated as a black box, which takes inputs and outputs a topography model. This inputs include gravity data, a starting model, and various parameters, such as amount of damping, density contrast, reference level, and parameters for estimating the regional gravity field. The concept of a stochastic uncertainty analysis is to see how changes to these *inputs* affects the *output* (inverted topography). \n",
"\n",
"For example, we can perform several inversions, changing the density contrast value used in each one, and compare the inverted topography for each of these. For this produced **ensemble** of inverted topography models, we can calculated the cell-wise mean and the cell-wise standard deviation. The grid of cell-wise means would be our stochastic topography model, and the grid of cell-wise standard deviations would be our estimate of the spatially-variable uncertainty of the model. This procedure is described in the below figure.\n",
"\n",
"![uncertainty_workflow](uncertainty_workflow.png)"
]
},
{
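The ensemble statistics described in the tutorial cell above can be sketched in a few lines. A minimal, illustrative example (not part of this diff), assuming each inversion result is an `xarray.DataArray` on a shared grid:

```python
import xarray as xr


def ensemble_stats(topo_models: list[xr.DataArray]) -> tuple[xr.DataArray, xr.DataArray]:
    # stack the inverted topographies along a new "run" dimension
    ensemble = xr.concat(topo_models, dim="run")
    # cell-wise mean -> the stochastic topography model
    # cell-wise standard deviation -> the spatially variable uncertainty
    return ensemble.mean(dim="run"), ensemble.std(dim="run")
```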
3 changes: 3 additions & 0 deletions src/invert4geom/__init__.py
@@ -54,14 +54,17 @@
synthetic_topography_simple,
)
from .uncertainty import ( # noqa: E402
create_lhc,
full_workflow_uncertainty_loop,
merged_stats,
randomly_sample_data,
regional_misfit_uncertainty,
)
from .utils import ( # noqa: E402
create_topography,
dist_nearest_points,
gravity_decay_buffer,
normalize_xarray,
normalized_mindist,
optimal_spline_damping,
rmse,
162 changes: 134 additions & 28 deletions src/invert4geom/plotting.py
@@ -1430,25 +1430,12 @@

plt.show()

dim = np.shape(df)[1]

param_values = df.to_numpy()

problem = {
"num_vars": dim,
"names": [i.replace("_", " ") for i in df.columns],
"bounds": [[-1, 1]] * dim,
}

# Rescale to the unit hypercube for the analysis
sample = utils.scale_normalized(param_values, problem["bounds"])

# 2D projection
if plot_2d_projections:
if len(df.columns) == 1:
pass
else:
plot_sampled_projection_2d(sample, problem["names"])
plot_sampled_projection_2d(df)


def projection_2d(
@@ -1466,36 +1453,155 @@


def plot_sampled_projection_2d(
sample: NDArray,
var_names: list[str],
sample: pd.DataFrame,
) -> None:
"""
Plots the samples projected on each 2D plane

Parameters
----------
sample : numpy.ndarray
sample : pd.DataFrame
The sampled values
var_names : list[str]
The names of the variables
"""
dim = sample.shape[1]
df = sample.copy()
var_names = df.columns.tolist()
dim = len(var_names)

norm_var_names = [f"{c}_normalized" for c in var_names]

# Rescale to the unit hypercube for the analysis
for c in var_names:
df[f"{c}_normalized"] = utils.normalize(df[c].values, low=0, high=1)

fig, axs = plt.subplots(dim, dim, figsize=(6, 6)) # nrows, ncols

cmap = plt.cm.get_cmap("Greys")

vals_list = []
da_list = []
for i in range(dim):
for j in range(dim):
if i == j:
min_dist = None
else:
# create grid of parameter coordinates
(x, y) = vd.grid_coordinates(
region=[
df[norm_var_names[j]].min(),
df[norm_var_names[j]].max(),
df[norm_var_names[i]].min(),
df[norm_var_names[i]].max(),
],
shape=(100, 100),
pixel_register=True,
)

Pylint warning W0632 on line 1497: Possible unbalanced tuple unpacking with sequence defined at line 596 of verde.coordinates: left side has 2 labels, right side has 0 values

# make dataarray of parameter space
grid = vd.make_xarray_grid(
(x, y),
np.ones_like(x),
dims=(norm_var_names[i], norm_var_names[j]),
data_names="data",
).data

# distance from each grid cell to its nearest sample point; used to shade undersampled regions
min_dist = utils.normalized_mindist(
df[[norm_var_names[i], norm_var_names[j]]],
grid=grid,
# low=0,
# high=1,
)
vals_list.append(min_dist.to_numpy().flatten())
da_list.append(min_dist)

vals = np.concatenate(vals_list)
# cpt_lims = (0, polar_utils.get_combined_min_max(vals)[1])
cpt_lims = (0, 0.5)
k = 0
for i in range(dim):
for j in range(dim):
plt.subplot(dim, dim, i * dim + j + 1)
plt.scatter(
sample[:, j],
sample[:, i],
if da_list[k] is not None:
im = da_list[k].plot(
ax=axs[i, j],
cmap=cmap,
add_colorbar=False,
vmin=cpt_lims[0],
vmax=cpt_lims[1],
add_labels=False,
)

axs[i, j].scatter(
df[norm_var_names[j]],
df[norm_var_names[i]],
s=2,
zorder=10,
clip_on=False,
)
axs[i, j].set_xlim(df[norm_var_names[j]].min(), df[norm_var_names[j]].max())
axs[i, j].set_ylim(df[norm_var_names[i]].min(), df[norm_var_names[i]].max())

if j == 0:
plt.ylabel(var_names[i], rotation=0, ha="right")
axs[i, j].set_ylabel(var_names[i], rotation=0, ha="right")
if i == dim - 1:
plt.xlabel(var_names[j], rotation=20, ha="right")
axs[i, j].set_xlabel(var_names[j], rotation=20, ha="right")

# axs[i, j].set_xticks([])
# axs[i, j].set_yticks([])

# set 3 ticks for each subplot
axs[i, j].xaxis.set_major_locator(plt.MaxNLocator(2))
axs[i, j].yaxis.set_major_locator(plt.MaxNLocator(2))

# add actual variable values as tick labels
x_ticks = axs[i, j].get_xticks()
y_ticks = axs[i, j].get_yticks()
x_tick_labels = []
y_tick_labels = []
for xt in x_ticks:
# rescale to actual variable values
val = (
xt * (df[var_names[j]].max() - df[var_names[j]].min())
+ df[var_names[j]].min()
)
x_tick_labels.append(f"{val:.2f}")
for yt in y_ticks:
val = (
yt * (df[var_names[i]].max() - df[var_names[i]].min())
+ df[var_names[i]].min()
)
y_tick_labels.append(f"{val:.2f}")
axs[i, j].set_xticklabels(x_tick_labels)
axs[i, j].set_yticklabels(y_tick_labels)

k += 1

# add colorbar of distances
x = list(np.arange(cpt_lims[0], cpt_lims[1], 0.1))
x = [round(i, 2) for i in x]
if round(cpt_lims[1], 2) not in x:
x.append(cpt_lims[1])
cbar_ax = fig.add_axes([0.2, -0.04, 0.6, 0.02])
fig.colorbar(
im,

Pylint failure E0606 on line 1584: Possibly using variable 'im' before assignment
cax=cbar_ax,
orientation="horizontal",
label="Normalized distance to nearest sample",
ticks=x,
format=mpl.ticker.FormatStrFormatter("%.2g"), # "%.2f"),
)

# add histogram to colorbar
ll, bb, ww, hh = cbar_ax.get_position().bounds
hist_ax = fig.add_axes([ll, bb + hh, ww, 0.06])
_n, bins, patches = hist_ax.hist(vals, bins=100)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
col = bin_centers - min(bin_centers)
col /= max(col)
for c, p in zip(col, patches, strict=False):
plt.setp(p, "facecolor", cmap(c))

hist_ax.set_xlim(cpt_lims[0], cpt_lims[1])
hist_ax.set_axis_off()

plt.xticks([])
plt.yticks([])
plt.show()


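A hypothetical call of the refactored `plot_sampled_projection_2d`, which now takes a `pandas.DataFrame` of sampled values directly (column names and values below are illustrative only):

```python
import numpy as np
import pandas as pd

from invert4geom.plotting import plot_sampled_projection_2d

rng = np.random.default_rng(1)
samples = pd.DataFrame({
    "density_contrast": rng.uniform(300, 700, size=20),
    "log10_damping": rng.uniform(-4, 2, size=20),
})
# one panel per pair of parameters; background shading shows the
# normalized distance from each grid cell to the nearest sample
plot_sampled_projection_2d(samples)
```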
104 changes: 98 additions & 6 deletions src/invert4geom/uncertainty.py
@@ -39,6 +39,97 @@
if typing.TYPE_CHECKING:
from invert4geom.inversion import Inversion

from scipy.special import erf

Pylint notice C0411 on line 42: third party import "scipy.special.erf" should be placed before first party import "invert4geom.inversion"
Pylint failure E0611 on line 42: No name 'erf' in module 'scipy.special'


def mann_kendall_test(y, prec):
"""
Mann-Kendall trend test ('prec' is the number of decimals used when checking for ties)
Outputs are the normalized statistic Z and the associated p-value
"""
n = len(y)
x = np.int_(y * (10**prec))

# Sign matrix and ties
sm = np.zeros((n - 1, n - 1))
for i in range(n - 1):
sm[i, i:n] = np.sign(x[i + 1 : n] - x[0 : n - 1 - i]) # E203

# Compute MK statistic
s = np.sum(sm)

# Count ties and their contributions to the variance of the MK statistic
[val, count] = np.unique(x, return_counts=True)

Pylint warning W0612 on line 63: Unused variable 'val'
[extent, ties] = np.unique(count, return_counts=True)
tie_contribution = np.zeros(len(ties))
for i in range(len(ties)):
tie_contribution[i] = (
ties[i] * extent[i] * (extent[i] - 1) * (2 * extent[i] + 5)
)

Pylint notice C0200 on line 69: Consider using enumerate instead of iterating with range and len

# Compute the variance
vs = (n * (n - 1) * (2 * n + 5) - np.sum(tie_contribution)) / 18
if vs < 0:
print("WARNING: negative variance!!!")

# Compute standard normal statistic
z = (s - np.sign(s)) / np.sqrt(max(vs, 1))

# Associated p-value
pval = 1 - erf(abs(z) / np.sqrt(2))

return [z, pval]
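A quick sanity check of `mann_kendall_test` (illustrative, not part of this diff): a monotonically increasing series should give a large positive Z and a near-zero p-value.

```python
import numpy as np

from invert4geom.uncertainty import mann_kendall_test

rng = np.random.default_rng(0)
y = np.linspace(0.0, 1.0, 50) + rng.normal(0.0, 0.05, 50)
z, pval = mann_kendall_test(y, 5)  # expect z >> 0 and pval close to 0
```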


def mann_kendall_test_sample(sample):
"""
Same as above, but applied pairwise to all variables of a sample
Outputs are the matrices of normalized statistics Z and associated p-values
"""
# Local variables
n = sample.shape[0]
var = sample.shape[1]
x = np.argsort(
sample, axis=0
) # Ranks of the values in the ensemble, for each variable
mk_res = np.zeros((var, var))
pval = np.zeros((var, var))

# MK test results
for i in range(var):
reorder_sample = np.zeros((n, var))
for j in range(n):
reorder_sample[j, :] = sample[x[j, i], :]
for v in np.arange(i + 1, var):
[mk_res[i, v], pval[i, v]] = mann_kendall_test(reorder_sample[:, v], 5)
[mk_res[v, i], pval[v, i]] = [mk_res[i, v], pval[i, v]]

return [mk_res, pval]


from scipy.stats import pearsonr

Pylint notice C0411 on line 111: third party import "scipy.stats.pearsonr" should be placed before first party import "invert4geom.inversion"


def pearson_test_sample(sample):
"""
Pearson correlation test applied pairwise to all variables of a sample. Outputs are:
the matrix of Pearson statistics rho
the matrix of p-values pval
"""
# Local variables
var = sample.shape[1]
rho = np.zeros((var, var))
pval = np.zeros((var, var))

# Pearson test results
for i in range(var):
for v in np.arange(i + 1, var):
[rho[i, v], pval[i, v]] = pearsonr(sample[:, i], sample[:, v])
[rho[v, i], pval[v, i]] = [rho[i, v], pval[i, v]]

return [rho, pval]
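An illustrative run of the sample-wise tests above on two strongly anticorrelated variables (synthetic data, not from the package):

```python
import numpy as np

from invert4geom.uncertainty import mann_kendall_test_sample, pearson_test_sample

rng = np.random.default_rng(0)
a = rng.uniform(0.0, 1.0, 100)
sample = np.column_stack([a, -a + rng.normal(0.0, 0.01, 100)])

rho, rho_pval = pearson_test_sample(sample)  # rho[0, 1] should be close to -1
mk, mk_pval = mann_kendall_test_sample(sample)  # large negative Z off the diagonal
```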


def create_lhc(
n_samples: int,
Expand All @@ -57,12 +148,13 @@
parameter_dict : dict
nested dictionary, with a dictionary of 'distribution', 'loc', 'scale' and
optionally 'log' for each parameter to be sampled. Distributions can be
'uniform' or 'normal'. For 'uniform', 'loc' is the lower bound and 'scale' is
the range of the distribution. 'loc' + 'scale' = upper bound. For 'normal',
'loc' is the center (mean) of the distribution and 'scale' is the standard
deviation. If 'log' is True, the provided 'loc' and 'scale' values are the base
10 exponents. For example, a uniform distribution with loc=-4, scale=6 and
log=True would sample values between 1e-4 and 1e2.
'uniform', 'uniform_discrete', or 'normal'. For 'uniform' and
'uniform_discrete', 'loc' is the lower bound and 'scale' is the range of the
distribution ('loc' + 'scale' = upper bound). For 'normal', 'loc' is the center
(mean) of the distribution and 'scale' is the standard deviation. If 'log' is
True, the provided 'loc' and 'scale' values are the base 10 exponents. For
example, a uniform distribution with loc=-4, scale=6 and log=True would sample
values between 1e-4 and 1e2.
random_state : int, optional
random state to use for sampling, by default 1
criterion : str, optional
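A sketch of how the `parameter_dict` conventions documented above map onto `scipy.stats` distributions (illustrative only, not the library's internal implementation; the parameter names are made up and only the 'uniform' and 'normal' cases are handled):

```python
import numpy as np
from scipy.stats import norm, qmc, uniform

parameter_dict = {
    "density_contrast": {"distribution": "normal", "loc": 500, "scale": 50},
    "damping": {"distribution": "uniform", "loc": -4, "scale": 6, "log": True},
}

# Latin hypercube points in the unit hypercube, one column per parameter
sampler = qmc.LatinHypercube(d=len(parameter_dict), seed=1)
unit_sample = sampler.random(n=10)

samples = {}
for k, (name, p) in enumerate(parameter_dict.items()):
    dist = norm if p["distribution"] == "normal" else uniform
    # map the unit-hypercube column through the inverse CDF
    vals = dist(loc=p["loc"], scale=p["scale"]).ppf(unit_sample[:, k])
    if p.get("log", False):
        vals = 10**vals  # 'loc' and 'scale' were base-10 exponents
    samples[name] = vals
# "damping" values now lie between 1e-4 and 1e2, matching the docstring example
```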