@@ -818,6 +818,58 @@ def assert_pcs_equal(U, D, U_full, D_full, rtol=1e-5, atol=1e-8):
818818 assert found_it , f"{ k } th singular vector { u } not found in { U_full } ."
819819
820820
821+ def assert_errors_bound (pca_res , D , U , w = None ):
822+ # Bounds on the error are from equation 1.11 in https://arxiv.org/pdf/0909.4061 -
823+ # this gives a bound on reconstruction error (i.e., operator norm between the GRM
824+ # and the low-diml approx). But since the (L2) operator norm is
825+ # |X| = sup_v |Xv|/|v|,
826+ # this implies bounds on singular values and vectors also:
827+ # If G v = lambda v, and we've got estimated singular vectors U and values diag(L),
828+ # then let v = \sum_i b_i u_i + delta be the projection of v into U,
829+ # and we have that
830+ # |lambda v - U L U* v|^2
831+ # = \sum_i b_i^2 (lambda - L_i)^2 + lambda^2 |delta|^2
832+ # < \epsilon^2 (where epsilon is the spectral norm bound error_bound)
833+ # so
834+ # |delta| < epsilon / lambda
835+ # since this is the amount by which the eigenvector v isn't hit by the columns of U.
836+ # Then also for each i that if b_i is not small then
837+ # |lambda - L_i| < epsilon
838+ # and there must be at least one b_i that is big (since sum_i b_i^2 = 1 - |delta|^2).
839+ # More concretely, let m = min_i |lambda - L_i|^2,
840+ # so that
841+ # epsilon^2 > \sum_i (lambda - L_i)^2 b_i^2 + lambda^2 |delta|^2
842+ # >= m * \sum_i b_i^2 + lambda^2 |delta|^2
843+ # = m * (1-|delta|^2) + lambda^2 |delta|^2.
844+ # Hence,
845+ # min_i |lambda-L_i|^2 = m < (epsilon^2 - lambda^2 |delta|^2) / (1- |delta|^2).
846+ # In summary: roughly, epsilon should be the bound on error in eigenvalues,
847+ # and epsilon / sigma[k+1] the L2 bound for eigenvectors
848+ # Below, the 'roughly/should be' translates into the factor of 5.
849+
850+ f = pca_res .factors
851+ ev = pca_res .eigenvalues
852+ rs = pca_res .range_sketch
853+ eps = pca_res .error_bound
854+ if w is not None :
855+ D , U = D [w ], U [w ]
856+ f , ev , rs , eps = f [w ], ev [w ], rs [w ], eps [w ]
857+ n = ev .shape [0 ]
858+ Sigma = U @ np .diag (D ) @ U .T
859+ Q = rs [:, :n ]
860+ err = np .linalg .svd (Sigma - Q @ Q .T @ Sigma ).S [0 ]
861+ assert (
862+ err <= 5 * eps ** 2
863+ ), "Reconstruction error should be smaller than the bound squared."
864+ assert (
865+ np .max (np .abs (ev - D [:n ])) < 5 * eps
866+ ), "Eigenvalue error should be smaller than error bound."
867+ for k in range (n ):
868+ assert (
869+ np .sum ((f [:, k ] - U [:, k ]) ** 2 ) < 5 * eps ** 2 / ev [- 1 ]
870+ ), "Factor error should be smaller than the bound squared."
871+
872+
821873class TestPCA :
822874
823875 def verify_error_est (
@@ -885,20 +937,10 @@ def verify_error_est(
885937 time_windows = time_windows ,
886938 )
887939 if windows is None :
888- Sigma = U @ np .diag (D ) @ U .T
889- Q = pca_res .range_sketch [:, :num_components ]
890- err = np .linalg .svd (Sigma - Q @ Q .T @ Sigma ).S [0 ]
891- assert (
892- err <= pca_res .error_bound
893- ), "Realized error should be smaller than the bound."
940+ assert_errors_bound (pca_res , D , U )
894941 else :
895942 for w in range (num_windows ):
896- Sigma = U [w ] @ np .diag (D [w ]) @ U [w ].T
897- Q = pca_res .range_sketch [w , :, :num_components ]
898- err = np .linalg .svd (Sigma - Q @ Q .T @ Sigma ).S [0 ]
899- assert (
900- err <= pca_res .error_bound [w ]
901- ), "Realized error should be smaller than the bound."
943+ assert_errors_bound (pca_res , D , U , w = w )
902944
903945 def verify_pca (
904946 self ,
@@ -1303,15 +1345,19 @@ def test_individuals(self, centre, num_windows, ploidy):
13031345 @pytest .mark .parametrize ("num_windows" , (0 , 2 ))
13041346 @pytest .mark .parametrize ("ploidy" , (1 , 2 , 3 ))
13051347 def test_err_individuals (self , centre , num_windows , ploidy ):
1348+ # NOTE: this is a randomized test, so if things change under the
1349+ # hood it might start to fail for perfectly normal (ie unlucky) reasons.
1350+ # If so, it's probably better to replace the test with a simpler test,
1351+ # e.g., that error_bound is roughly the right order of magnitude.
13061352 ts = msprime .sim_ancestry (
1307- 20 ,
1353+ 30 ,
13081354 ploidy = ploidy ,
1309- population_size = 20 ,
1355+ population_size = 30 ,
13101356 sequence_length = 100 ,
13111357 recombination_rate = 0.01 ,
1312- random_seed = 12345 ,
1358+ random_seed = 12346 ,
13131359 )
1314- individuals = [ 3 , 0 , 2 , 5 , 6 , 15 , 12 , 11 , 7 , 17 ]
1360+ individuals = np . arange ( 30 )
13151361 time_low , time_high = (ts .nodes_time .max () / 4 , ts .nodes_time .max () / 2 )
13161362 self .verify_error_est (
13171363 ts ,
0 commit comments