
Commit 68dc288

committed WIP

1 parent fc4e8f4 commit 68dc288

File tree

7 files changed: +1435 -58 lines changed


docs/how_to/uncertainty_parameter_sampling.ipynb

Lines changed: 1151 additions & 0 deletions
Large diffs are not rendered by default.
1.57 MB

docs/tutorial/10_uncertainty_analysis.ipynb

Lines changed: 5 additions & 1 deletion

@@ -6,7 +6,11 @@
    "source": [
     "# Uncertainty analysis\n",
     "\n",
-    "Here we will demonstrate a full uncertainty analysis of the inversion. We use a stochastic approach, where we 1) choose the important input parameters to the inversion, 2) define each of their uncertainty distributions, 3) run a series of inversions which sample these inputs from their uncertainty distributions, and 4) use the ensemble of inverted topography models to define the mean result and the uncertainty."
+    "**Invert4Geom** uses a stochastic approach for quantifying uncertainties. Simply put, an inversion can be treated as a black box that takes inputs and outputs a topography model. These inputs include gravity data, a starting model, and various parameters, such as the amount of damping, the density contrast, the reference level, and parameters for estimating the regional gravity field. The concept of a stochastic uncertainty analysis is to see how changes to these *inputs* affect the *output* (the inverted topography).\n",
+    "\n",
+    "For example, we can perform several inversions, changing the density contrast value used in each one, and compare the inverted topography from each. From this **ensemble** of inverted topography models, we can calculate the cell-wise mean and the cell-wise standard deviation. The grid of cell-wise means is our stochastic topography model, and the grid of cell-wise standard deviations is our estimate of the spatially variable uncertainty of the model. This procedure is illustrated in the figure below.\n",
+    "\n",
+    "![uncertainty_workflow](uncertainty_workflow.png)"
    ]
   },
   {
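The added notebook text describes taking cell-wise means and standard deviations across an ensemble of inverted topography models. A minimal sketch of that step, assuming the ensemble is a set of xarray grids on a common grid (the toy data and variable names here are illustrative, not part of the package):

```python
import numpy as np
import xarray as xr

# toy ensemble: 20 "inverted topography" grids on a shared 50 x 50 grid
rng = np.random.default_rng(0)
runs = [
    xr.DataArray(
        rng.normal(loc=-500, scale=50, size=(50, 50)),
        dims=("northing", "easting"),
        name="topography",
    )
    for _ in range(20)
]

# stack the ensemble along a new "run" dimension
stacked = xr.concat(runs, dim="run")

# cell-wise statistics across the ensemble
mean_topo = stacked.mean("run")  # the stochastic topography model
std_topo = stacked.std("run")    # spatially variable uncertainty estimate
```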

src/invert4geom/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -54,14 +54,17 @@
     synthetic_topography_simple,
 )
 from .uncertainty import (  # noqa: E402
+    create_lhc,
     full_workflow_uncertainty_loop,
     merged_stats,
     randomly_sample_data,
     regional_misfit_uncertainty,
 )
 from .utils import (  # noqa: E402
     create_topography,
+    dist_nearest_points,
     gravity_decay_buffer,
+    normalize_xarray,
     normalized_mindist,
     optimal_spline_damping,
     rmse,

src/invert4geom/plotting.py

Lines changed: 134 additions & 28 deletions

@@ -1430,25 +1430,12 @@ def plot_latin_hypercube(
 
     plt.show()
 
-    dim = np.shape(df)[1]
-
-    param_values = df.to_numpy()
-
-    problem = {
-        "num_vars": dim,
-        "names": [i.replace("_", " ") for i in df.columns],
-        "bounds": [[-1, 1]] * dim,
-    }
-
-    # Rescale to the unit hypercube for the analysis
-    sample = utils.scale_normalized(param_values, problem["bounds"])
-
     # 2D projection
     if plot_2d_projections:
         if len(df.columns) == 1:
             pass
         else:
-            plot_sampled_projection_2d(sample, problem["names"])
+            plot_sampled_projection_2d(df)
 
 
 def projection_2d(
@@ -1466,36 +1453,155 @@ def projection_2d(
 
 
 def plot_sampled_projection_2d(
-    sample: NDArray,
-    var_names: list[str],
+    sample: pd.DataFrame,
 ) -> None:
     """
     Plots the samples projected on each 2D plane
 
     Parameters
     ----------
-    sample : numpy.ndarray
+    sample : pd.DataFrame
         The sampled values
-    var_names : list[str]
-        The names of the variables
     """
-    dim = sample.shape[1]
+    df = sample.copy()
+    var_names = df.columns.tolist()
+    dim = len(var_names)
+
+    norm_var_names = [f"{c}_normalized" for c in var_names]
+
+    # Rescale to the unit hypercube for the analysis
+    for c in var_names:
+        df[f"{c}_normalized"] = utils.normalize(df[c].values, low=0, high=1)
+
+    fig, axs = plt.subplots(dim, dim, figsize=(6, 6))  # nrows, ncols
+
+    cmap = plt.cm.get_cmap("Greys")
+
+    vals_list = []
+    da_list = []
+    for i in range(dim):
+        for j in range(dim):
+            if i == j:
+                min_dist = None
+            else:
+                # create grid of parameter coordinates
+                (x, y) = vd.grid_coordinates(
+                    region=[
+                        df[norm_var_names[j]].min(),
+                        df[norm_var_names[j]].max(),
+                        df[norm_var_names[i]].min(),
+                        df[norm_var_names[i]].max(),
+                    ],
+                    shape=(100, 100),
+                    pixel_register=True,
+                )
+
+                # make dataarray of parameter space
+                grid = vd.make_xarray_grid(
+                    (x, y),
+                    np.ones_like(x),
+                    dims=(norm_var_names[i], norm_var_names[j]),
+                    data_names="data",
+                ).data
+
+                min_dist = utils.normalized_mindist(
+                    df[[norm_var_names[i], norm_var_names[j]]],
+                    grid=grid,
+                    # low=0,
+                    # high=1,
+                )
+                vals_list.append(min_dist.to_numpy().flatten())
+            da_list.append(min_dist)
 
+    vals = np.concatenate(vals_list)
+    # cpt_lims = (0, polar_utils.get_combined_min_max(vals)[1])
+    cpt_lims = (0, 0.5)
+    k = 0
     for i in range(dim):
         for j in range(dim):
-            plt.subplot(dim, dim, i * dim + j + 1)
-            plt.scatter(
-                sample[:, j],
-                sample[:, i],
+            if da_list[k] is not None:
+                im = da_list[k].plot(
+                    ax=axs[i, j],
+                    cmap=cmap,
+                    add_colorbar=False,
+                    vmin=cpt_lims[0],
+                    vmax=cpt_lims[1],
+                    add_labels=False,
+                )
+
+            axs[i, j].scatter(
+                df[norm_var_names[j]],
+                df[norm_var_names[i]],
                 s=2,
+                zorder=10,
+                clip_on=False,
             )
+            axs[i, j].set_xlim(df[norm_var_names[j]].min(), df[norm_var_names[j]].max())
+            axs[i, j].set_ylim(df[norm_var_names[i]].min(), df[norm_var_names[i]].max())
+
             if j == 0:
-                plt.ylabel(var_names[i], rotation=0, ha="right")
+                axs[i, j].set_ylabel(var_names[i], rotation=0, ha="right")
             if i == dim - 1:
-                plt.xlabel(var_names[j], rotation=20, ha="right")
+                axs[i, j].set_xlabel(var_names[j], rotation=20, ha="right")
+
+            # axs[i, j].set_xticks([])
+            # axs[i, j].set_yticks([])
+
+            # set 3 ticks for each subplot
+            axs[i, j].xaxis.set_major_locator(plt.MaxNLocator(2))
+            axs[i, j].yaxis.set_major_locator(plt.MaxNLocator(2))
+
+            # add actual variable values as tick labels
+            x_ticks = axs[i, j].get_xticks()
+            y_ticks = axs[i, j].get_yticks()
+            x_tick_labels = []
+            y_tick_labels = []
+            for xt in x_ticks:
+                # rescale to actual variable values
+                val = (
+                    xt * (df[var_names[j]].max() - df[var_names[j]].min())
+                    + df[var_names[j]].min()
+                )
+                x_tick_labels.append(f"{val:.2f}")
+            for yt in y_ticks:
+                val = (
+                    yt * (df[var_names[i]].max() - df[var_names[i]].min())
+                    + df[var_names[i]].min()
+                )
+                y_tick_labels.append(f"{val:.2f}")
+            axs[i, j].set_xticklabels(x_tick_labels)
+            axs[i, j].set_yticklabels(y_tick_labels)
+
+            k += 1
+
+    # add colorbar of distances
+    x = list(np.arange(cpt_lims[0], cpt_lims[1], 0.1))
+    x = [round(i, 2) for i in x]
+    if round(cpt_lims[1], 2) not in x:
+        x.append(cpt_lims[1])
+    cbar_ax = fig.add_axes([0.2, -0.04, 0.6, 0.02])
+    fig.colorbar(
+        im,
+        cax=cbar_ax,
+        orientation="horizontal",
+        label="Normalized distance to nearest sample",
+        ticks=x,
+        format=mpl.ticker.FormatStrFormatter("%.2g"),  # "%.2f"),
+    )
+
+    # add histogram to colorbar
+    ll, bb, ww, hh = cbar_ax.get_position().bounds
+    hist_ax = fig.add_axes([ll, bb + hh, ww, 0.06])
+    _n, bins, patches = hist_ax.hist(vals, bins=100)
+    bin_centers = 0.5 * (bins[:-1] + bins[1:])
+    col = bin_centers - min(bin_centers)
+    col /= max(col)
+    for c, p in zip(col, patches, strict=False):
+        plt.setp(p, "facecolor", cmap(c))
+
+    hist_ax.set_xlim(cpt_lims[0], cpt_lims[1])
+    hist_ax.set_axis_off()
 
-            plt.xticks([])
-            plt.yticks([])
     plt.show()
 
 
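The reworked plot_sampled_projection_2d shades each off-diagonal panel by the distance from every point of the normalized parameter plane to its nearest sample, which appears to be what utils.normalized_mindist supplies. A standalone sketch of that quantity, assuming a plain nearest-neighbour distance is what is being mapped (using scipy's cKDTree rather than the invert4geom helper):

```python
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(1)
samples = rng.uniform(size=(20, 2))  # 20 draws of two parameters, normalized to [0, 1]

# 100 x 100 grid covering the unit square of the two normalized parameters
x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
grid_points = np.column_stack([x.ravel(), y.ravel()])

# distance from each grid point to its nearest sample; small values everywhere
# indicate that the Latin hypercube fills this 2D projection well
min_dist, _ = cKDTree(samples).query(grid_points, k=1)
min_dist = min_dist.reshape(x.shape)
```

The colorbar histogram added in the diff then summarizes the distribution of these distances pooled over all panels.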

src/invert4geom/uncertainty.py

Lines changed: 98 additions & 6 deletions

@@ -39,6 +39,97 @@ def __init__(
 if typing.TYPE_CHECKING:
     from invert4geom.inversion import Inversion
 
+from scipy.special import erf
+
+
+def mann_kendall_test(y, prec):
+    """
+    Mann-Kendall test (precision is the number of decimals)
+    Outputs are the normalized statistic Z and the associated p-value
+    """
+    n = len(y)
+    x = np.int_(y * (10**prec))
+
+    # Sign matrix and ties
+    sm = np.zeros((n - 1, n - 1))
+    for i in range(n - 1):
+        sm[i, i:n] = np.sign(x[i + 1 : n] - x[0 : n - 1 - i])  # E203
+
+    # Compute MK statistic
+    s = np.sum(sm)
+
+    # Count ties and their contributions
+    # to the variance of the MK statistic
+    [val, count] = np.unique(x, return_counts=True)
+    [extent, ties] = np.unique(count, return_counts=True)
+    tie_contribution = np.zeros(len(ties))
+    for i in range(len(ties)):
+        tie_contribution[i] = (
+            ties[i] * extent[i] * (extent[i] - 1) * (2 * extent[i] + 5)
+        )
+
+    # Compute the variance
+    vs = (n * (n - 1) * (2 * n + 5) - np.sum(tie_contribution)) / 18
+    if vs < 0:
+        print("WARNING: negative variance!!!")
+
+    # Compute standard normal statistic
+    z = (s - np.sign(s)) / np.sqrt(max(vs, 1))
+
+    # Associated p-value
+    pval = 1 - erf(abs(z) / np.sqrt(2))
+
+    return [z, pval]
+
+
+def mann_kendall_test_sample(sample):
+    """
+    Same as above, but for a whole sample
+    Outputs are the normalized statistic Z and the associated p-value
+    """
+    # Local variables
+    n = sample.shape[0]
+    var = sample.shape[1]
+    x = np.argsort(
+        sample, axis=0
+    )  # Ranks of the values in the ensemble, for each variable
+    mk_res = np.zeros((var, var))
+    pval = np.zeros((var, var))
+
+    # MK test results
+    for i in range(var):
+        reorder_sample = np.zeros((n, var))
+        for j in range(n):
+            reorder_sample[j, :] = sample[x[j, i], :]
+        for v in np.arange(i + 1, var):
+            [mk_res[i, v], pval[i, v]] = mann_kendall_test(reorder_sample[:, v], 5)
+            [mk_res[v, i], pval[v, i]] = [mk_res[i, v], pval[i, v]]
+
+    return [mk_res, pval]
+
+
+from scipy.stats import pearsonr
+
+
+def pearson_test_sample(sample):
+    """
+    Pearson correlation test for a whole sample. Outputs are:
+    the Pearson statistic rho
+    the p-value pval
+    """
+    # Local variables
+    var = sample.shape[1]
+    rho = np.zeros((var, var))
+    pval = np.zeros((var, var))
+
+    # Pearson test results
+    for i in range(var):
+        for v in np.arange(i + 1, var):
+            [rho[i, v], pval[i, v]] = pearsonr(sample[:, i], sample[:, v])
+            [rho[v, i], pval[v, i]] = [rho[i, v], pval[i, v]]
+
+    return [rho, pval]
+
 
 def create_lhc(
     n_samples: int,
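A brief usage sketch for the two sample-wide tests added in this hunk, which take a 2D array of shape (n_samples, n_parameters) such as the values of a Latin hypercube sample; the import path is an assumption for this WIP module:

```python
import numpy as np
from invert4geom import uncertainty  # assumed import path; these helpers are new in this commit

rng = np.random.default_rng(0)
sample = rng.uniform(size=(50, 3))  # 50 draws of 3 parameters

# pairwise Pearson correlations and p-values between parameters; near-zero rho
# suggests the sampling introduced no spurious linear correlations
rho, rho_pval = uncertainty.pearson_test_sample(sample)

# pairwise Mann-Kendall statistics and p-values; being rank-based, this also
# flags monotonic but non-linear dependence between parameters
mk, mk_pval = uncertainty.mann_kendall_test_sample(sample)
```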
@@ -57,12 +148,13 @@ def create_lhc(
     parameter_dict : dict
         nested dictionary, with a dictionary of 'distribution', 'loc', 'scale' and
         optionally 'log' for each parameter to be sampled. Distributions can be
-        'uniform' or 'normal'. For 'uniform', 'loc' is the lower bound and 'scale' is
-        the range of the distribution. 'loc' + 'scale' = upper bound. For 'normal',
-        'loc' is the center (mean) of the distribution and 'scale' is the standard
-        deviation. If 'log' is True, the provided 'loc' and 'scale' values are the base
-        10 exponents. For example, a uniform distribution with loc=-4, scale=6 and
-        log=True would sample values between 1e-4 and 1e2.
+        'uniform', 'uniform_discrete', or 'normal'. For 'uniform' and
+        'uniform_discrete', 'loc' is the lower bound and 'scale' is the range of the
+        distribution ('loc' + 'scale' = upper bound). For 'normal', 'loc' is the center
+        (mean) of the distribution and 'scale' is the standard deviation. If 'log' is
+        True, the provided 'loc' and 'scale' values are the base 10 exponents. For
+        example, a uniform distribution with loc=-4, scale=6 and log=True would sample
+        values between 1e-4 and 1e2.
     random_state : int, optional
         random state to use for sampling, by default 1
     criterion : str, optional
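To illustrate the documented parameter_dict format, a hedged example of building one and passing it to create_lhc; the parameter names are made up, only the arguments visible in this diff (n_samples, parameter_dict, random_state) are used, and the return value is assumed to be the sampled values:

```python
import invert4geom

# each entry follows the documented 'distribution' / 'loc' / 'scale' / 'log' format
parameter_dict = {
    # uniform between 1500 and 2500 ('loc' + 'scale' = upper bound)
    "density_contrast": {"distribution": "uniform", "loc": 1500, "scale": 1000},
    # log-uniform between 1e-4 and 1e2 ('loc' and 'scale' are base-10 exponents)
    "solver_damping": {"distribution": "uniform", "loc": -4, "scale": 6, "log": True},
    # normal with mean -500 and standard deviation 50
    "zref": {"distribution": "normal", "loc": -500, "scale": 50},
}

# sketch of the call; keyword names follow the docstring shown above
sampled = invert4geom.create_lhc(
    n_samples=20,
    parameter_dict=parameter_dict,
    random_state=1,
)
```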
