11# ' @importFrom stats dpois
22# ' @importFrom stats runif
3+ # ' @importFrom stats AIC
34# '
45# ' @title Estimate errors due to blocking in record linkage
56# '
1112# ' @param x Reference data (required if `n` and `N` are not provided).
1213# ' @param y Query data (required if `n` is not provided).
1314# ' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided).
15+ # ' It must contain a column named `y` storing the indices of the records in the query data set.
1416# ' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set
1517# ' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`).
1618# ' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`).
17- # ' @param G Number of classes in the finite mixture model.
19+ # ' @param G Integer or vector of integers. Number of classes in the finite mixture model.
20+ # ' If `G` is a vector, the optimal number of classes is selected from the provided values
21+ # ' based on the Akaike Information Criterion (AIC).
1822# ' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`).
1923# ' @param p Numeric vector of initial matching probabilities in each class of the mixture model
20- # ' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`).
24+ # ' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)` or `rep(runif(1, 0.5, 1), G)`,
25+ # ' depending on the parameter `equal_p`).
2126# ' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
2227# ' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`).
23- # ' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`).
24- # ' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`).
28+ # ' @param equal_p Logical, indicating whether the matching probabilities
29+ # ' `p` should be constrained to be equal across all latent classes (default `FALSE`).
30+ # ' @param tol Convergence tolerance for the EM algorithm (default `10^(-4)`).
31+ # ' @param maxiter Maximum number of iterations for the EM algorithm (default `100`).
2532# ' @param sample_size Bootstrap sample (from `n`) size used for calculations (if `NULL`, uses all data).
2633# '
2734# ' @details
28- # ' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
35+ # ' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources:
36+ # ' a register (reference data `x`) and a file (query data `y`).
2937# ' Assume that the register has no undercoverage,
30- # ' i.e. each record from the file corresponds to exactly one record from the same individual in the register.
38+ # ' i.e., each record from the file corresponds to exactly one record from the same individual in the register.
3139# ' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
32- # ' record \eqn{i} on the file. Assume that:\cr
40+ # ' record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file.
41+ # ' Let \eqn{v_i} denote record \eqn{i} from the file.
42+ # ' Assume that:\cr
3343# ' \itemize{
3444# ' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
3545# ' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
7181# ' }
7282# ' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}.
7383# '
84+ # ' @note
85+ # ' The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes
86+ # ' by setting `equal_p = TRUE`.
7487# '
75- # '
76- # ' @returns Returns a list containing:\cr
88+ # ' @returns Returns an object of class `est_block_error`, with a list containing:\cr
7789# ' \itemize{
7890# ' \item{`FPR` -- estimated false positive rate,}
7991# ' \item{`FNR` -- estimated false negative rate,}
92+ # ' \item{`G` -- number of classes used in the optimal model,}
93+ # ' \item{`log_lik` -- final log-likelihood value,}
94+ # ' \item{`equal_p` -- logical, indicating whether the matching probabilities were constrained,}
8095# ' \item{`iter` -- number of the EM algorithm iterations performed,}
81- # ' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.}
96+ # ' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations,}
97+ # ' \item{`AIC` -- Akaike Information Criterion value in the optimal model.}
8298# ' }
8399# '
84100# ' @references
92108# ' ## an example proposed by Dasylva and Goussanou (2021)
93109# ' ## we obtain results very close to those reported in the paper
94110# '
95- # ' set.seed(111 )
111+ # ' set.seed(11 )
96112# '
97113# ' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))
98114# '
99115# ' errors <- est_block_error(n = neighbors,
100116# ' N = 63155,
101- # ' G = 2 ,
117+ # ' G = 1:3 ,
102118# ' tol = 10^(-3),
103- # ' maxiter = 50 )
119+ # ' equal_p = TRUE )
104120# '
105121# ' errors
106122# '
@@ -114,6 +130,7 @@ est_block_error <- function(x = NULL,
114130 alpha = NULL ,
115131 p = NULL ,
116132 lambda = NULL ,
133+ equal_p = FALSE ,
117134 tol = 10 ^ (- 4 ),
118135 maxiter = 100 ,
119136 sample_size = NULL ) {
@@ -135,6 +152,29 @@ est_block_error <- function(x = NULL,
135152 n <- sample(n , size = sample_size , replace = TRUE )
136153 }
137154
155+ if (length(G ) > 1 ) {
156+
157+ G_cand <- sort(G )
158+ results_list <- list ()
159+ aic_values <- numeric (length(G_cand ))
160+
161+ for (i in seq_along(G_cand )) {
162+
163+ fit <- est_block_error(n = n , N = N , G = G_cand [i ],
164+ alpha = NULL , p = NULL , lambda = NULL ,
165+ equal_p = equal_p , tol = tol , maxiter = maxiter )
166+ results_list [[i ]] <- fit
167+ aic_values [i ] <- fit $ AIC
168+
169+ }
170+
171+ best_idx <- which.min(aic_values )
172+ best_model <- results_list [[best_idx ]]
173+
174+ return (best_model )
175+
176+ }
177+
138178 convergence <- FALSE
139179 m <- length(n )
140180
@@ -143,7 +183,13 @@ est_block_error <- function(x = NULL,
143183 }
144184
145185 if (is.null(p )) {
146- p <- runif(G , min = 0.5 , max = 1 )
186+ if (equal_p ) {
187+ p <- rep(runif(1 , min = 0.5 , max = 1 ), G )
188+ } else {
189+ p <- runif(G , min = 0.5 , max = 1 )
190+ }
191+ } else if (equal_p && length(p ) == G ) {
192+ p <- rep(mean(p ), G )
147193 }
148194
149195 if (is.null(lambda )) {
@@ -192,7 +238,11 @@ est_block_error <- function(x = NULL,
192238 # # M
193239
194240 alpha <- 1 / m * colSums(probs_c_n )
195- p <- colSums(E_c_n_M ) / (m * alpha )
241+ if (equal_p ) {
242+ p <- rep(sum(E_c_n_M ) / m , G )
243+ } else {
244+ p <- colSums(E_c_n_M ) / (m * alpha )
245+ }
196246 lambda <- colSums(E_c_n_U ) / (m * alpha )
197247
198248 # # check
@@ -215,13 +265,20 @@ est_block_error <- function(x = NULL,
215265 FNR <- 1 - sum(alpha * p )
216266 FPR <- sum(alpha * lambda ) / (N - 1 )
217267
218- return ( structure(
268+ res <- structure(
219269 list (
220- FPR = FPR ,
221- FNR = FNR ,
222- iter = l ,
223- convergence = convergence
224- ),
225- class = " est_block_error" ))
270+ FPR = FPR ,
271+ FNR = FNR ,
272+ G = G ,
273+ log_lik = log_lik_new ,
274+ equal_p = equal_p ,
275+ iter = l ,
276+ convergence = convergence
277+ ),
278+ class = " est_block_error" )
279+
280+ res $ AIC <- AIC(res )
281+
282+ return (res )
226283
227284}
0 commit comments