easystats · rempsyc · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -281,6 +281,7 @@ export(data_write)
 export(degroup)
 export(demean)
 export(describe_distribution)
+export(describe_missing)
 export(detrend)
 export(display)
 export(distribution_coef_var)

diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,10 @@ BREAKING CHANGES
 * `data_to_wide()` no longer removes empty columns that were created after
   widening data frames, to behave similarly to `tidyr::pivot_wider()` (#645).
 
+NEW FUNCTIONS
+
+* `describe_missing()`, to report on missing values in a data frame.
+
 CHANGES
 
 * Due to changes in the package `insight`, `data_tabulate()` no longer prints

diff --git a/R/describe_missing.R b/R/describe_missing.R
@@ -0,0 +1,112 @@
+#' @title Describe Missing Values in Data According to Guidelines
+#'
+#' @description Provides a detailed description of missing values in a data frame.
+#' This function reports both absolute and percentage missing values of specified
+#' variables.
+#'
+#' @inheritParams extract_column_names
+#' @param by Optional character vector, indicating the names of one or more
+#' variables in the data frame. If supplied, the data will be split by these
+#' variables and missing values are summarised once per group, pooled over
+#' all remaining (selected) variables. Useful for survey data by first
+#' reshaping the data to the long format. The grouping variables must still
+#' be present after `select` and `exclude` were applied. When several
+#' variables are given, group labels are the value combinations joined by
+#' `"."`.
+#' @param sort Logical. Whether to sort the result from highest to lowest
+#' percentage of missing data. The `"Total"` row is always appended last.
+#' @param ... Arguments passed down to `data_select()` when `select` or
+#' `exclude` is supplied.
+#' @return A data frame with one row per variable (or one row per group, if
+#' `by` is supplied), followed by a `"Total"` row that pools all analyzed
+#' variables. Grouping variables named in `by` are never counted. The data
+#' frame has the following columns:
+#'  - `variable`: Variables selected (group labels, if `by` is supplied).
+#'  - `n_missing`: Number of missing values.
+#'  - `missing_percent`: Percentage of missing values.
+#'  - `complete_percent`: Percentage of non-missing values.
+#'
+#' @export
+#' @examples
+#' describe_missing(airquality)
+#'
+#' # Survey data
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   openness_1 = fun(), openness_2 = fun(), openness_3 = fun(),
+#'   extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(),
+#'   agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun()
+#' )
+#'
+#' df_long <- reshape_longer(
+#'   df,
+#'   select = -1,
+#'   names_sep = "_",
+#'   names_to = c("dimension", "item")
+#' )
+#'
+#' describe_missing(
+#'   df_long,
+#'   select = -c(1, 3),
+#'   by = "dimension"
+#' )
+#'
+describe_missing <- function(data,
+                             select = NULL,
+                             exclude = NULL,
+                             ignore_case = FALSE,
+                             regex = FALSE,
+                             verbose = TRUE,
+                             by = NULL,
+                             sort = FALSE,
+                             ...) {
+  # Column selection is only delegated when the caller asked for it
+  if (!is.null(select) || !is.null(exclude)) {
+    data <- data_select(
+      data = data,
+      select = select,
+      exclude = exclude,
+      ignore_case = ignore_case,
+      regex = regex,
+      verbose = verbose,
+      ...
+    )
+  }
+  if (is.null(by)) {
+    # Every (selected) column is analyzed and contributes to the total
+    analyzed <- data
+    na_list <- lapply(names(data), function(x) {
+      .describe_missing(data[, x, drop = FALSE])
+    })
+  } else {
+    # `by` may name several columns, so collapse the check to one logical;
+    # a vector condition inside `if ()` is an error in recent R versions
+    if (!all(by %in% names(data))) {
+      stop("The 'by' column does not exist in the data.", call. = FALSE)
+    }
+    # Grouping columns only define the groups. They are removed up front so
+    # that neither the per-group rows nor the total count their cells.
+    analyzed <- data[, setdiff(names(data), by), drop = FALSE]
+    # Splitting on a list of columns groups by their value combinations;
+    # `drop = TRUE` skips combinations that do not occur in the data
+    grouped_data <- split(analyzed, data[by], drop = TRUE)
+    na_list <- lapply(names(grouped_data), function(group_name) {
+      # One pooled row per group; it is labelled with the group, not with a
+      # column name, so per-column rows would be indistinguishable
+      group_na_df <- .describe_missing(grouped_data[[group_name]])
+      group_na_df$variable <- group_name
+      group_na_df
+    })
+  }
+  na_df <- do.call(rbind, na_list)
+  if (isTRUE(sort)) {
+    na_df <- na_df[order(-na_df$missing_percent), ]
+  }
+  # The total is added after sorting so that it always stays the last row
+  na_df_tot <- .describe_missing(analyzed)
+  na_df_tot$variable <- "Total"
+  rbind(na_df, na_df_tot)
+}
+
+# Internal helper: summarises missingness of a data frame in a single row.
+# All cells of `data` are pooled (rows x columns); the row is labelled with
+# the name of the first column, which callers may overwrite afterwards.
+.describe_missing <- function(data) {
+  na_count <- sum(is.na(data))
+  cell_count <- nrow(data) * ncol(data)
+  # share of missing cells, expressed in percent with two decimals
+  pct_missing <- round(na_count / cell_count * 100, 2)
+  pct_complete <- 100 - pct_missing
+  first_name <- names(data)[1]
+  data.frame(
+    variable = first_name,
+    n_missing = na_count,
+    missing_percent = pct_missing,
+    complete_percent = pct_complete
+  )
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -75,6 +75,7 @@ labelling
 leptokurtic
 lm
 lme
+macOS
 meaned
 mesokurtic
 midhinge
@@ -88,6 +89,8 @@ platykurtic
 poorman
 pre
 px
+quartile
+quartiles
 readr
 readxl
 relevel

diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml
@@ -66,6 +66,7 @@ reference:
       - data_tabulate
       - data_peek
       - data_seek
+      - describe_missing
       - means_by_group
       - contains("distribution")
       - kurtosis

diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md
@@ -0,0 +1,61 @@
+# describe_missing
+
+    Code
+      describe_missing(airquality2)
+    Output
+        variable n_missing missing_percent complete_percent
+      1  Solar.R         7            4.58            95.42
+      2     Wind         0            0.00           100.00
+      3     Temp         0            0.00           100.00
+      4    Month         0            0.00           100.00
+      5      Day         0            0.00           100.00
+      6    Ozone        37           24.18            75.82
+      7    Total        44            4.79            95.21
+
+---
+
+    Code
+      describe_missing(airquality2, sort = TRUE)
+    Output
+         variable n_missing missing_percent complete_percent
+      6     Ozone        37           24.18            75.82
+      1   Solar.R         7            4.58            95.42
+      2      Wind         0            0.00           100.00
+      3      Temp         0            0.00           100.00
+      4     Month         0            0.00           100.00
+      5       Day         0            0.00           100.00
+      11    Total        44            4.79            95.21
+
+---
+
+    Code
+      describe_missing(airquality2, select = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1    Ozone        37           24.18            75.82
+      2      Day         0            0.00           100.00
+      3    Month         0            0.00           100.00
+      4     Temp         0            0.00           100.00
+      5    Total        37            6.05            93.95
+
+---
+
+    Code
+      describe_missing(airquality2, exclude = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1  Solar.R         7            4.58            95.42
+      2     Wind         0            0.00           100.00
+      3    Total         7            2.29            97.71
+
+---
+
+    Code
+      describe_missing(df_long, select = -c(1, 3), by = "dimension")
+    Output
+             variable n_missing missing_percent complete_percent
+      1 agreeableness        10           23.81            76.19
+      2  extroversion        17           40.48            59.52
+      3      openness        11           26.19            73.81
+      4         Total        38           15.08            84.92
+
diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R
@@ -0,0 +1,43 @@
+# Snapshot tests for describe_missing(): plain call, sorting, column
+# selection/exclusion, and grouped output via `by`.
+# NOTE: expect_snapshot() stores the call text alongside the printed result,
+# so the calls below must not be reformatted or have variables renamed
+# without regenerating the snapshot file.
+test_that("describe_missing", {
+  # Move the first column of `airquality` to the end, so the column with the
+  # most missing values comes last and `sort = TRUE` visibly reorders rows
+  airquality2 <- cbind(airquality[2:6], airquality[1])
+
+  # Default: one row per column plus the "Total" row
+  expect_snapshot(describe_missing(airquality2))
+
+  # Rows ordered by decreasing percentage of missing values
+  expect_snapshot(describe_missing(airquality2, sort = TRUE))
+
+  # Column range given as a string for `select`
+  expect_snapshot(describe_missing(
+    airquality2,
+    select = "Ozone:Temp"
+  ))
+
+  # Same column range, but removed via `exclude`
+  expect_snapshot(describe_missing(
+    airquality2,
+    exclude = "Ozone:Temp"
+  ))
+
+  # Testing the 'by' argument for survey scales
+  # The seed fixes the random draws; the order of the fun() calls below
+  # therefore determines the data and must stay as is
+  set.seed(15)
+  # 11 draws with replacement from NA and 1:10, followed by three NAs
+  # (14 values per item)
+  fun <- function() {
+    c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+  }
+  # `ID` has two values and is recycled to the 14 rows of the item columns
+  df <- data.frame(
+    ID = c("idz", NA),
+    openness_1 = fun(), openness_2 = fun(), openness_3 = fun(),
+    extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(),
+    agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun(),
+    stringsAsFactors = FALSE
+  )
+
+  # Pivot and group using datawizard
+  # All columns except the first are reshaped; their names are split at "_"
+  # into "dimension" and "item"
+  df_long <- reshape_longer(df,
+    select = -1,
+    names_sep = "_",
+    names_to = c("dimension", "item")
+  )
+
+  # Run describe_missing with 'by' argument
+  # Columns 1 and 3 are dropped (presumably `ID` and `item` -- confirm against
+  # the reshape_longer() output), leaving `dimension` and the value column
+  expect_snapshot(describe_missing(
+    df_long,
+    select = -c(1, 3), by = "dimension"
+  ))
+})