Skip to content

Commit f11db43

Browse files
authored
error bound tests (#3104)
1 parent 7df0fca commit f11db43

File tree

2 files changed

+69
-21
lines changed

2 files changed

+69
-21
lines changed

python/tests/test_relatedness_vector.py

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,58 @@ def assert_pcs_equal(U, D, U_full, D_full, rtol=1e-5, atol=1e-8):
818818
assert found_it, f"{k}th singular vector {u} not found in {U_full}."
819819

820820

821+
def assert_errors_bound(pca_res, D, U, w=None):
822+
# Bounds on the error are from equation 1.11 in https://arxiv.org/pdf/0909.4061 -
823+
# this gives a bound on reconstruction error (i.e., operator norm between the GRM
824+
# and the low-diml approx). But since the (L2) operator norm is
825+
# |X| = sup_v |Xv|/|v|,
826+
# this implies bounds on singular values and vectors also:
827+
# If G v = lambda v, and we've got estimated singular vectors U and values diag(L),
828+
# then let v = \sum_i b_i u_i + delta be the projection of v into U,
829+
# and we have that
830+
# |lambda v - U L U* v|^2
831+
# = \sum_i b_i^2 (lambda - L_i)^2 + lambda^2 |delta|^2
832+
# < \epsilon^2 (where epsilon is the spectral norm bound error_bound)
833+
# so
834+
# |delta| < epsilon / lambda
835+
# since this is the amount by which the eigenvector v isn't hit by the columns of U.
836+
# Then also for each i that if b_i is not small then
837+
# |lambda - L_i| < epsilon
838+
# and there must be at least one b_i that is big (since sum_i b_i^2 = 1 - |delta|^2).
839+
# More concretely, let m = min_i |lambda - L_i|^2,
840+
# so that
841+
# epsilon^2 > \sum_i (lambda - L_i)^2 b_i^2 + lambda^2 |delta|^2
842+
# >= m * \sum_i b_i^2 + lambda^2 |delta|^2
843+
# = m * (1-|delta|^2) + lambda^2 |delta|^2.
844+
# Hence,
845+
# min_i |lambda-L_i|^2 = m < (epsilon^2 - lambda^2 |delta|^2) / (1- |delta|^2).
846+
# In summary: roughly, epsilon should be the bound on error in eigenvalues,
847+
# and epsilon / sigma[k+1] the L2 bound for eigenvectors
848+
# Below, the 'roughly/should be' translates into the factor of 5.
849+
850+
f = pca_res.factors
851+
ev = pca_res.eigenvalues
852+
rs = pca_res.range_sketch
853+
eps = pca_res.error_bound
854+
if w is not None:
855+
D, U = D[w], U[w]
856+
f, ev, rs, eps = f[w], ev[w], rs[w], eps[w]
857+
n = ev.shape[0]
858+
Sigma = U @ np.diag(D) @ U.T
859+
Q = rs[:, :n]
860+
err = np.linalg.svd(Sigma - Q @ Q.T @ Sigma).S[0]
861+
assert (
862+
err <= 5 * eps**2
863+
), "Reconstruction error should be smaller than the bound squared."
864+
assert (
865+
np.max(np.abs(ev - D[:n])) < 5 * eps
866+
), "Eigenvalue error should be smaller than error bound."
867+
for k in range(n):
868+
assert (
869+
np.sum((f[:, k] - U[:, k]) ** 2) < 5 * eps**2 / ev[-1]
870+
), "Factor error should be smaller than the bound squared."
871+
872+
821873
class TestPCA:
822874

823875
def verify_error_est(
@@ -885,20 +937,10 @@ def verify_error_est(
885937
time_windows=time_windows,
886938
)
887939
if windows is None:
888-
Sigma = U @ np.diag(D) @ U.T
889-
Q = pca_res.range_sketch[:, :num_components]
890-
err = np.linalg.svd(Sigma - Q @ Q.T @ Sigma).S[0]
891-
assert (
892-
err <= pca_res.error_bound
893-
), "Realized error should be smaller than the bound."
940+
assert_errors_bound(pca_res, D, U)
894941
else:
895942
for w in range(num_windows):
896-
Sigma = U[w] @ np.diag(D[w]) @ U[w].T
897-
Q = pca_res.range_sketch[w, :, :num_components]
898-
err = np.linalg.svd(Sigma - Q @ Q.T @ Sigma).S[0]
899-
assert (
900-
err <= pca_res.error_bound[w]
901-
), "Realized error should be smaller than the bound."
943+
assert_errors_bound(pca_res, D, U, w=w)
902944

903945
def verify_pca(
904946
self,
@@ -1303,15 +1345,19 @@ def test_individuals(self, centre, num_windows, ploidy):
13031345
@pytest.mark.parametrize("num_windows", (0, 2))
13041346
@pytest.mark.parametrize("ploidy", (1, 2, 3))
13051347
def test_err_individuals(self, centre, num_windows, ploidy):
1348+
# NOTE: this is a randomized test, so if things change under the
1349+
# hood it might start to fail for perfectly normal (ie unlucky) reasons.
1350+
# If so, it's probably better to replace the test with a simpler test,
1351+
# e.g., that error_bound is roughly the right order of magnitude.
13061352
ts = msprime.sim_ancestry(
1307-
20,
1353+
30,
13081354
ploidy=ploidy,
1309-
population_size=20,
1355+
population_size=30,
13101356
sequence_length=100,
13111357
recombination_rate=0.01,
1312-
random_seed=12345,
1358+
random_seed=12346,
13131359
)
1314-
individuals = [3, 0, 2, 5, 6, 15, 12, 11, 7, 17]
1360+
individuals = np.arange(30)
13151361
time_low, time_high = (ts.nodes_time.max() / 4, ts.nodes_time.max() / 2)
13161362
self.verify_error_est(
13171363
ts,

python/tskit/trees.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10588,9 +10588,11 @@ class PCAResult:
1058810588
"""
1058910589
error_bound: np.ndarray
1059010590
"""
10591-
An estimate of the error bounds for the eigenvalues (experimental).
10592-
This quantifies the average reconstruction error E|G-USVt| where G is the
10593-
genetic relatedness matrix, U is the PC factors (scores),
10594-
S is the diagonal eigenvalue matrix, and V is the loadings.
10595-
The norm is the matrix operator norm.
10591+
An estimate of the error resulting from the randomized algorithm (experimental).
10592+
Eigenvalues should be correct to within (roughly) this additive factor,
10593+
and factors should be correct to within (roughly) this factor divided by the
10594+
next-largest eigenvalue in the Euclidean norm. These estimates are obtained from
10595+
a bound on the expected L2 operator norm between the true GRM and its
10596+
low-dimensional approximation, from equation 1.11 in
10597+
https://arxiv.org/pdf/0909.4061 .
1059610598
"""

0 commit comments

Comments
 (0)