Skip to content

Commit cb4b770

Browse files
authored
Merge pull request #39 from ncn-foreigners/dev
Update of the `est_block_error` function and minor changes
2 parents 6b87557 + 0b7055d commit cb4b770

File tree

15 files changed

+160
-47
lines changed

15 files changed

+160
-47
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ inst/doc
88
misc
99
vignettes/.*R
1010
vignettes/.*html
11+
.DS_Store

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: blocking
22
Type: Package
33
Title: Various Blocking Methods for Entity Resolution
4-
Version: 1.0.1
4+
Version: 1.0.2
55
Authors@R:
66
c(person(given = "Maciej",
77
family = "Beręsewicz",
@@ -19,7 +19,7 @@ LazyData: true
1919
URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.ue.poznan.pl/blocking/
2020
BugReports: https://github.com/ncn-foreigners/blocking/issues
2121
Roxygen: list(markdown = TRUE)
22-
RoxygenNote: 7.3.2
22+
RoxygenNote: 7.3.3
2323
Imports:
2424
text2vec,
2525
tokenizers,

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
S3method(logLik,est_block_error)
34
S3method(print,blocking)
45
S3method(print,est_block_error)
56
export(blocking)
@@ -35,6 +36,7 @@ importFrom(mlpack,lsh)
3536
importFrom(readr,read_table)
3637
importFrom(rnndescent,rnnd_build)
3738
importFrom(rnndescent,rnnd_query)
39+
importFrom(stats,AIC)
3840
importFrom(stats,dist)
3941
importFrom(stats,dpois)
4042
importFrom(stats,runif)

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# development
22

3+
# version 1.0.2
4+
5+
+ Updated `est_block_error` function.
6+
37
# version 1.0.1
48

59
+ Fixed CRAN errors.

R/blocking.R

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,16 +127,16 @@ blocking <- function(x,
127127
"lsh" = NULL,
128128
"kd" = NULL)
129129

130-
stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported" =
130+
stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported." =
131131
is.character(x) | is.matrix(x) | inherits(x, "Matrix"))
132132

133133

134134
if (!is.null(ann_write)) {
135-
stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
135+
stopifnot("Path provided in the `ann_write` is incorrect." = file.exists(ann_write) )
136136
}
137137

138138
if (ann == "nnd") {
139-
stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`" =
139+
stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`." =
140140
distance %in% c("euclidean", "cosine","manhatan", "hamming"))
141141
}
142142

@@ -145,15 +145,18 @@ blocking <- function(x,
145145
}
146146

147147
if (ann == "hnsw") {
148-
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
148+
stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`." =
149149
distance %in% c("l2", "euclidean", "cosine", "ip"))
150150
}
151151

152152
if (ann == "annoy") {
153-
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
153+
stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`." =
154154
distance %in% c("euclidean", "manhatan", "hamming", "angular"))
155155
}
156156

157+
stopifnot("Algorithm should be `nnd, hnsw, annoy, lsh, kd`." =
158+
ann %in% c("nnd", "hnsw", "annoy", "lsh", "kd"))
159+
157160
if (!is.null(y)) {
158161
deduplication <- FALSE
159162
y_default <- FALSE
@@ -167,15 +170,15 @@ blocking <- function(x,
167170

168171
if (!is.null(true_blocks)) {
169172

170-
stopifnot("`true_blocks` should be a data.frame" = is.data.frame(true_blocks))
173+
stopifnot("`true_blocks` should be a data.frame." = is.data.frame(true_blocks))
171174

172175
if (deduplication == FALSE) {
173-
stopifnot("`true blocks` should be a data.frame with columns: x, y, block" =
176+
stopifnot("`true blocks` should be a data.frame with columns: x, y, block." =
174177
length(colnames(true_blocks)) == 3,
175178
all(colnames(true_blocks) == c("x", "y", "block")))
176179
}
177180
if (deduplication) {
178-
stopifnot("`true blocks` should be a data.frame with columns: x, block" =
181+
stopifnot("`true blocks` should be a data.frame with columns: x, block." =
179182
length(colnames(true_blocks)) == 2,
180183
all(colnames(true_blocks) == c("x", "block")))
181184
}

R/est_block_error.R

Lines changed: 79 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#' @importFrom stats dpois
22
#' @importFrom stats runif
3+
#' @importFrom stats AIC
34
#'
45
#' @title Estimate errors due to blocking in record linkage
56
#'
@@ -11,25 +12,34 @@
1112
#' @param x Reference data (required if `n` and `N` are not provided).
1213
#' @param y Query data (required if `n` is not provided).
1314
#' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided).
15+
#' It must contain a column named `y` storing the indices of the records in the query data set.
1416
#' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set
1517
#' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`).
1618
#' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`).
17-
#' @param G Number of classes in the finite mixture model.
19+
#' @param G Integer or vector of integers. Number of classes in the finite mixture model.
20+
#' If `G` is a vector, the optimal number of classes is selected from the provided values
21+
#' based on the Akaike Information Criterion (AIC).
1822
#' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`).
1923
#' @param p Numeric vector of initial matching probabilities in each class of the mixture model
20-
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`).
24+
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)` or `rep(runif(1, 0.5, 1), G)`,
25+
#' depending on the parameter `equal_p`).
2126
#' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
2227
#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`).
23-
#' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`).
24-
#' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`).
28+
#' @param equal_p Logical, indicating whether the matching probabilities
29+
#' `p` should be constrained to be equal across all latent classes (default `FALSE`).
30+
#' @param tol Convergence tolerance for the EM algorithm (default `10^(-4)`).
31+
#' @param maxiter Maximum number of iterations for the EM algorithm (default `100`).
2532
#' @param sample_size Bootstrap sample (from `n`) size used for calculations (if `NULL`, uses all data).
2633
#'
2734
#' @details
28-
#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
35+
#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources:
36+
#' a register (reference data `x`) and a file (query data `y`).
2937
#' Assume that the register has no undercoverage,
30-
#' i.e. each record from the file corresponds to exactly one record from the same individual in the register.
38+
#' i.e., each record from the file corresponds to exactly one record from the same individual in the register.
3139
#' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
32-
#' record \eqn{i} on the file. Assume that:\cr
40+
#' record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file.
41+
#' Let \eqn{v_i} denote record \eqn{i} from the file.
42+
#' Assume that:\cr
3343
#' \itemize{
3444
#' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
3545
#' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
@@ -71,14 +81,20 @@
7181
#' }
7282
#' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}.
7383
#'
84+
#' @note
85+
#' The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes
86+
#' by setting `equal_p = TRUE`.
7487
#'
75-
#'
76-
#' @returns Returns a list containing:\cr
88+
#' @returns Returns an object of class `est_block_error`, with a list containing:\cr
7789
#' \itemize{
7890
#' \item{`FPR` -- estimated false positive rate,}
7991
#' \item{`FNR` -- estimated false negative rate,}
92+
#' \item{`G` -- number of classes used in the optimal model,}
93+
#' \item{`log_lik` -- final log-likelihood value,}
94+
#' \item{`equal_p` -- logical, indicating whether the matching probabilities were constrained,}
8095
#' \item{`iter` -- number of the EM algorithm iterations performed,}
81-
#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.}
96+
#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations,}
97+
#' \item{`AIC` -- Akaike Information Criterion value in the optimal model.}
8298
#' }
8399
#'
84100
#' @references
@@ -92,15 +108,15 @@
92108
#' ## an example proposed by Dasylva and Goussanou (2021)
93109
#' ## we obtain results very close to those reported in the paper
94110
#'
95-
#' set.seed(111)
111+
#' set.seed(11)
96112
#'
97113
#' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))
98114
#'
99115
#' errors <- est_block_error(n = neighbors,
100116
#' N = 63155,
101-
#' G = 2,
117+
#' G = 1:3,
102118
#' tol = 10^(-3),
103-
#' maxiter = 50)
119+
#' equal_p = TRUE)
104120
#'
105121
#' errors
106122
#'
@@ -114,6 +130,7 @@ est_block_error <- function(x = NULL,
114130
alpha = NULL,
115131
p = NULL,
116132
lambda = NULL,
133+
equal_p = FALSE,
117134
tol = 10^(-4),
118135
maxiter = 100,
119136
sample_size = NULL) {
@@ -135,6 +152,29 @@ est_block_error <- function(x = NULL,
135152
n <- sample(n, size = sample_size, replace = TRUE)
136153
}
137154

155+
if (length(G) > 1) {
156+
157+
G_cand <- sort(G)
158+
results_list <- list()
159+
aic_values <- numeric(length(G_cand))
160+
161+
for (i in seq_along(G_cand)) {
162+
163+
fit <- est_block_error(n = n, N = N, G = G_cand[i],
164+
alpha = NULL, p = NULL, lambda = NULL,
165+
equal_p = equal_p, tol = tol, maxiter = maxiter)
166+
results_list[[i]] <- fit
167+
aic_values[i] <- fit$AIC
168+
169+
}
170+
171+
best_idx <- which.min(aic_values)
172+
best_model <- results_list[[best_idx]]
173+
174+
return(best_model)
175+
176+
}
177+
138178
convergence <- FALSE
139179
m <- length(n)
140180

@@ -143,7 +183,13 @@ est_block_error <- function(x = NULL,
143183
}
144184

145185
if (is.null(p)) {
146-
p <- runif(G, min = 0.5, max = 1)
186+
if (equal_p) {
187+
p <- rep(runif(1, min = 0.5, max = 1), G)
188+
} else {
189+
p <- runif(G, min = 0.5, max = 1)
190+
}
191+
} else if (equal_p && length(p) == G) {
192+
p <- rep(mean(p), G)
147193
}
148194

149195
if (is.null(lambda)) {
@@ -192,7 +238,11 @@ est_block_error <- function(x = NULL,
192238
## M
193239

194240
alpha <- 1 / m * colSums(probs_c_n)
195-
p <- colSums(E_c_n_M) / (m * alpha)
241+
if (equal_p) {
242+
p <- rep(sum(E_c_n_M) / m, G)
243+
} else {
244+
p <- colSums(E_c_n_M) / (m * alpha)
245+
}
196246
lambda <- colSums(E_c_n_U) / (m * alpha)
197247

198248
## check
@@ -215,13 +265,20 @@ est_block_error <- function(x = NULL,
215265
FNR <- 1 - sum(alpha * p)
216266
FPR <- sum(alpha * lambda) / (N - 1)
217267

218-
return(structure(
268+
res <- structure(
219269
list(
220-
FPR = FPR,
221-
FNR = FNR,
222-
iter = l,
223-
convergence = convergence
224-
),
225-
class = "est_block_error"))
270+
FPR = FPR,
271+
FNR = FNR,
272+
G = G,
273+
log_lik = log_lik_new,
274+
equal_p = equal_p,
275+
iter = l,
276+
convergence = convergence
277+
),
278+
class = "est_block_error")
279+
280+
res$AIC <- AIC(res)
281+
282+
return(res)
226283

227284
}

R/methods.R

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ print.blocking <- function(x, ...) {
3434
#' @exportS3Method
3535
print.est_block_error <- function(x, ...) {
3636

37-
cat("FPR: ", x$FPR, "\n")
38-
cat("FNR: ", x$FNR, "\n")
37+
cat("Estimated FPR: ", x$FPR, "\n")
38+
cat("Estimated FNR: ", x$FNR, "\n")
39+
cat("Number of classes in the model: ", x$G, "\n")
3940

4041
cat("========================================================\n")
4142

@@ -46,3 +47,20 @@ print.est_block_error <- function(x, ...) {
4647
}
4748
}
4849

50+
#' @method logLik est_block_error
51+
#' @exportS3Method
52+
logLik.est_block_error <- function(object, ...) {
53+
54+
val <- object$log_lik
55+
if (object$equal_p) {
56+
k <- 2 * object$G
57+
} else {
58+
k <- 3 * object$G - 1
59+
}
60+
61+
attr(val, "df") <- k
62+
class(val) <- "logLik"
63+
64+
val
65+
66+
}

data/RLdata500.rda

-32 Bytes
Binary file not shown.

data/census.rda

-38.9 KB
Binary file not shown.

data/cis.rda

-32 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)