ncn-foreigners
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 2 deletions b/‎DESCRIPTION‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 4 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/blocking.R‎
Lines changed: 11 additions & 8 deletions b/‎R/blocking.R‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎R/est_block_error.R‎
Lines changed: 79 additions & 22 deletions b/‎R/est_block_error.R‎
Lines changed: 79 additions & 22 deletions
diff --git a/‎R/methods.R‎
Lines changed: 20 additions & 2 deletions b/‎R/methods.R‎
Lines changed: 20 additions & 2 deletions
diff --git a/‎data/RLdata500.rda‎
-32 Bytes b/‎data/RLdata500.rda‎
-32 Bytes
diff --git a/‎data/census.rda‎
-38.9 KB b/‎data/census.rda‎
-38.9 KB
diff --git a/‎data/cis.rda‎
-32 Bytes b/‎data/cis.rda‎
-32 Bytes
@@ -8,3 +8,4 @@ inst/doc
 misc
 vignettes/.*R
 vignettes/.*html
+.DS_Store
@@ -1,7 +1,7 @@
 Package: blocking
 Type: Package
 Title: Various Blocking Methods for Entity Resolution
-Version: 1.0.1
+Version: 1.0.2
 Authors@R: 
   c(person(given = "Maciej",
              family = "Beręsewicz",
@@ -19,7 +19,7 @@ LazyData: true
 URL: https://github.com/ncn-foreigners/blocking, https://ncn-foreigners.ue.poznan.pl/blocking/
 BugReports: https://github.com/ncn-foreigners/blocking/issues
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Imports: 
     text2vec,
     tokenizers,
 
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(logLik,est_block_error)
 S3method(print,blocking)
 S3method(print,est_block_error)
 export(blocking)
@@ -35,6 +36,7 @@ importFrom(mlpack,lsh)
 importFrom(readr,read_table)
 importFrom(rnndescent,rnnd_build)
 importFrom(rnndescent,rnnd_query)
+importFrom(stats,AIC)
 importFrom(stats,dist)
 importFrom(stats,dpois)
 importFrom(stats,runif)
 
@@ -1,5 +1,9 @@
 # development
 
+# version 1.0.2
+
++ Updated `est_block_error` function.
+
 # version 1.0.1
 
 + Fixed CRAN errors.
 
@@ -127,16 +127,16 @@ blocking <- function(x,
                                             "lsh" = NULL,
                                             "kd" = NULL)
 
-  stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported" =
+  stopifnot("Only character, dense or sparse (dgCMatrix) matrix x is supported." =
               is.character(x) | is.matrix(x) | inherits(x, "Matrix"))
 
 
   if (!is.null(ann_write)) {
-    stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
+    stopifnot("Path provided in the `ann_write` is incorrect." = file.exists(ann_write) )
   }
 
   if (ann == "nnd") {
-    stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`" =
+    stopifnot("Distance for NND should be `euclidean, cosine, manhatan, hamming`." =
                 distance %in% c("euclidean", "cosine","manhatan", "hamming"))
   }
 
@@ -145,15 +145,18 @@ blocking <- function(x,
   }
 
   if (ann == "hnsw") {
-    stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`" =
+    stopifnot("Distance for HNSW should be `l2, euclidean, cosine, ip`." =
                 distance %in% c("l2", "euclidean", "cosine", "ip"))
   }
 
   if (ann == "annoy") {
-    stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`" =
+    stopifnot("Distance for Annoy should be `euclidean, manhatan, hamming, angular`." =
                 distance %in% c("euclidean", "manhatan", "hamming", "angular"))
   }
 
+  stopifnot("Algorithm should be `nnd, hnsw, annoy, lsh, kd`." =
+              ann %in% c("nnd", "hnsw", "annoy", "lsh", "kd"))
+
   if (!is.null(y)) {
     deduplication <- FALSE
     y_default <- FALSE
@@ -167,15 +170,15 @@ blocking <- function(x,
 
   if (!is.null(true_blocks)) {
 
-    stopifnot("`true_blocks` should be a data.frame" = is.data.frame(true_blocks))
+    stopifnot("`true_blocks` should be a data.frame." = is.data.frame(true_blocks))
 
     if (deduplication == FALSE) {
-      stopifnot("`true blocks` should be a data.frame with columns: x, y, block" =
+      stopifnot("`true blocks` should be a data.frame with columns: x, y, block." =
                   length(colnames(true_blocks)) == 3,
                 all(colnames(true_blocks) == c("x", "y", "block")))
     }
     if (deduplication) {
-      stopifnot("`true blocks` should be a data.frame with columns: x, block" =
+      stopifnot("`true blocks` should be a data.frame with columns: x, block." =
                   length(colnames(true_blocks)) == 2,
                 all(colnames(true_blocks) == c("x", "block")))
     }
 
@@ -1,5 +1,6 @@
 #' @importFrom stats dpois
 #' @importFrom stats runif
+#' @importFrom stats AIC
 #'
 #' @title Estimate errors due to blocking in record linkage
 #'
@@ -11,25 +12,34 @@
 #' @param x Reference data (required if `n` and `N` are not provided).
 #' @param y Query data (required if `n` is not provided).
 #' @param blocking_result `data.frame` or `data.table` containing blocking results (required if `n` is not provided).
+#' It must contain a column named `y` storing the indices of the records in the query data set.
 #' @param n Integer vector of numbers of accepted pairs formed by each record in the query data set
 #' with records in the reference data set, based on blocking criteria (if `NULL`, derived from `blocking_result`).
 #' @param N Total number of records in the reference data set (if `NULL`, derived as `length(x)`).
-#' @param G Number of classes in the finite mixture model.
+#' @param G Integer or vector of integers. Number of classes in the finite mixture model.
+#' If `G` is a vector, the optimal number of classes is selected from the provided values
+#' based on the Akaike Information Criterion (AIC).
 #' @param alpha Numeric vector of initial class proportions (length `G`; if `NULL`, initialized as `rep(1/G, G)`).
 #' @param p Numeric vector of initial matching probabilities in each class of the mixture model
-#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)`).
+#' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.5, 1)` or `rep(runif(1, 0.5, 1), G)`,
+#' depending on the parameter `equal_p`).
 #' @param lambda Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
 #' (length `G`; if `NULL`, randomly initialized from `runif(G, 0.1, 2)`).
-#' @param tol Convergence tolerance for the EM algorithm (default `10^(-6)`).
-#' @param maxiter Maximum number of iterations for the EM algorithm (default `1000`).
+#' @param equal_p Logical, indicating whether the matching probabilities
+#' `p` should be constrained to be equal across all latent classes (default `FALSE`).
+#' @param tol Convergence tolerance for the EM algorithm (default `10^(-4)`).
+#' @param maxiter Maximum number of iterations for the EM algorithm (default `100`).
 #' @param sample_size Bootstrap sample (from `n`) size used for calculations (if `NULL`, uses all data).
 #'
 #' @details
-#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
+#' Consider a large finite population that comprises of \eqn{N} individuals, and two duplicate-free data sources:
+#' a register (reference data `x`) and a file (query data `y`).
 #' Assume that the register has no undercoverage,
-#' i.e. each record from the file corresponds to exactly one record from the same individual in the register.
+#' i.e., each record from the file corresponds to exactly one record from the same individual in the register.
 #' Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
-#' record \eqn{i} on the file. Assume that:\cr
+#' record \eqn{i} on the file, for \eqn{i=1,2,\ldots,m}, where \eqn{m} is the number of records in the file.
+#' Let \eqn{v_i} denote record \eqn{i} from the file.
+#' Assume that:\cr
 #' \itemize{
 #' \item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
 #' \item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
@@ -71,14 +81,20 @@
 #' }
 #' where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}.
 #'
+#' @note
+#' The matching probabilities \eqn{p_g} can be constrained to be equal across all latent classes
+#' by setting `equal_p = TRUE`.
 #'
-#'
-#' @returns Returns a list containing:\cr
+#' @returns Returns an object of class `est_block_error`, with a list containing:\cr
 #' \itemize{
 #' \item{`FPR` -- estimated false positive rate,}
 #' \item{`FNR` -- estimated false negative rate,}
+#' \item{`G` -- number of classes used in the optimal model,}
+#' \item{`log_lik` -- final log-likelihood value,}
+#' \item{`equal_p` -- logical, indicating whether the matching probabilities were constrained,}
 #' \item{`iter` -- number of the EM algorithm iterations performed,}
-#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations.}
+#' \item{`convergence` -- logical, indicating whether the EM algorithm converged within `maxiter` iterations,}
+#' \item{`AIC` -- Akaike Information Criterion value in the optimal model.}
 #' }
 #'
 #' @references
@@ -92,15 +108,15 @@
 #' ## an example proposed by Dasylva and Goussanou (2021)
 #' ## we obtain results very close to those reported in the paper
 #'
-#' set.seed(111)
+#' set.seed(11)
 #'
 #' neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))
 #'
 #' errors <- est_block_error(n = neighbors,
 #'                           N = 63155,
-#'                           G = 2,
+#'                           G = 1:3,
 #'                           tol = 10^(-3),
-#'                           maxiter = 50)
+#'                           equal_p = TRUE)
 #'
 #' errors
 #'
@@ -114,6 +130,7 @@ est_block_error <- function(x = NULL,
                             alpha = NULL,
                             p = NULL,
                             lambda = NULL,
+                            equal_p = FALSE,
                             tol = 10^(-4),
                             maxiter = 100,
                             sample_size = NULL) {
@@ -135,6 +152,29 @@ est_block_error <- function(x = NULL,
     n <- sample(n, size = sample_size, replace = TRUE)
   }
 
+  if (length(G) > 1) {
+
+    G_cand <- sort(G)
+    results_list <- list()
+    aic_values <- numeric(length(G_cand))
+
+    for (i in seq_along(G_cand)) {
+
+      fit <- est_block_error(n = n, N = N, G = G_cand[i],
+                             alpha = NULL, p = NULL, lambda = NULL,
+                             equal_p = equal_p, tol = tol, maxiter = maxiter)
+      results_list[[i]] <- fit
+      aic_values[i] <- fit$AIC
+
+    }
+
+    best_idx <- which.min(aic_values)
+    best_model <- results_list[[best_idx]]
+
+    return(best_model)
+
+  }
+
   convergence <- FALSE
   m <- length(n)
 
@@ -143,7 +183,13 @@ est_block_error <- function(x = NULL,
   }
 
   if (is.null(p)) {
-    p <- runif(G, min = 0.5, max = 1)
+    if (equal_p) {
+      p <- rep(runif(1, min = 0.5, max = 1), G)
+    } else {
+      p <- runif(G, min = 0.5, max = 1)
+    }
+  } else if (equal_p && length(p) == G) {
+    p <- rep(mean(p), G)
   }
 
   if (is.null(lambda)) {
@@ -192,7 +238,11 @@ est_block_error <- function(x = NULL,
     ## M
 
     alpha <- 1 / m * colSums(probs_c_n)
-    p <- colSums(E_c_n_M) / (m * alpha)
+    if (equal_p) {
+      p <- rep(sum(E_c_n_M) / m, G)
+    } else {
+      p <- colSums(E_c_n_M) / (m * alpha)
+    }
     lambda <- colSums(E_c_n_U) / (m * alpha)
 
     ## check
@@ -215,13 +265,20 @@ est_block_error <- function(x = NULL,
   FNR <- 1 - sum(alpha * p)
   FPR <- sum(alpha * lambda) / (N - 1)
 
-  return(structure(
+  res <- structure(
     list(
-    FPR = FPR,
-    FNR = FNR,
-    iter = l,
-    convergence = convergence
-  ),
-  class = "est_block_error"))
+      FPR = FPR,
+      FNR = FNR,
+      G = G,
+      log_lik = log_lik_new,
+      equal_p = equal_p,
+      iter = l,
+      convergence = convergence
+    ),
+    class = "est_block_error")
+
+  res$AIC <- AIC(res)
+
+  return(res)
 
 }
@@ -34,8 +34,9 @@ print.blocking <- function(x, ...) {
 #' @exportS3Method
 print.est_block_error <- function(x, ...) {
 
-  cat("FPR: ", x$FPR, "\n")
-  cat("FNR: ", x$FNR, "\n")
+  cat("Estimated FPR: ", x$FPR, "\n")
+  cat("Estimated FNR: ", x$FNR, "\n")
+  cat("Number of classes in the model: ", x$G, "\n")
 
   cat("========================================================\n")
 
@@ -46,3 +47,20 @@ print.est_block_error <- function(x, ...) {
   }
 }
 
+#' @method logLik est_block_error
+#' @exportS3Method
+logLik.est_block_error <- function(object, ...) {
+
+  val <- object$log_lik
+  if (object$equal_p) {
+    k <- 2 * object$G
+  } else {
+    k <- 3 * object$G - 1
+  }
+
+  attr(val, "df") <- k
+  class(val) <- "logLik"
+
+  val
+
+}