-
-
Notifications
You must be signed in to change notification settings - Fork 17
Suggestion of new function: describe_missing()
#561
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f879900
ab9f006
218b7f4
ebaeb68
c3c1302
357dbbc
0c25fef
fbdd26d
72041f5
835b3bb
0e83588
e8d393d
f26f247
ceebf8b
b389a39
1f36678
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| #' @title Describe Missing Values in Data According to Guidelines | ||
| #' | ||
| #' @description Provides a detailed description of missing values in a data frame. | ||
| #' This function reports both absolute and percentage missing values of specified | ||
| #' variables. | ||
| #' | ||
| #' @inheritParams extract_column_names | ||
| #' @param by Optional character string, indicating the names of one or more | ||
| #' variables in the data frame. If supplied, the data will be split by these | ||
| #' variables and summary statistics will be computed for each group. Useful | ||
| #' for survey data once the data have been reshaped to the long format. | ||
rempsyc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| #' @param sort Logical. Whether to sort the result from highest to lowest | ||
| #' percentage of missing data. | ||
rempsyc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| #' @return A data frame with the following columns: | ||
| #' - `variable`: Variables selected. | ||
| #' - `n_missing`: Number of missing values. | ||
| #' - `missing_percent`: Percentage of missing values. | ||
| #' - `complete_percent`: Percentage of non-missing values. | ||
|
Comment on lines
+14
to
+18
Member
Should it also have the total number of observations for better comparison? Although this number would be repeated for all rows... |
||
| #' @param ... Arguments passed down to other functions. Currently not used. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we actually need If we keep it, it should be positioned before
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need it, but I had added it because I pass it to
So I can either keep it or remove it, as you wish. |
||
| #' | ||
| #' @export | ||
| #' @examples | ||
| #' describe_missing(airquality) | ||
| #' | ||
| #' # Survey data | ||
| #' set.seed(15) | ||
| #' fun <- function() { | ||
| #' c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) | ||
| #' } | ||
| #' df <- data.frame( | ||
| #' ID = c("idz", NA), | ||
| #' openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), | ||
| #' extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), | ||
| #' agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() | ||
| #' ) | ||
| #' | ||
| #' df_long <- reshape_longer( | ||
| #' df, | ||
| #' select = -1, | ||
| #' names_sep = "_", | ||
| #' names_to = c("dimension", "item") | ||
| #' ) | ||
| #' | ||
| #' describe_missing( | ||
| #' df_long, | ||
| #' select = -c(1, 3), | ||
| #' by = "dimension" | ||
| #' ) | ||
|
Comment on lines
+44
to
+48
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This fails with an unclear message if there are more than one variable in The way this argument works is also not very clear to me. For instance, I'd find it more natural if the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Additionally, the current implementation means that > describe_missing(df_long, by = "dimension")
variable n_missing missing_percent complete_percent
1 agreeableness 21 50.00 50.00
2 agreeableness 0 0.00 100.00
3 agreeableness 10 23.81 76.19
4 extroversion 21 50.00 50.00
5 extroversion 0 0.00 100.00
6 extroversion 17 40.48 59.52
7 openness 21 50.00 50.00
8 openness 0 0.00 100.00
9 openness 11 26.19 73.81
10 Total 101 20.04 79.96 |
||
| #' | ||
describe_missing <- function(data,
                             select = NULL,
                             exclude = NULL,
                             ignore_case = FALSE,
                             regex = FALSE,
                             verbose = TRUE,
                             by = NULL,
                             sort = FALSE,
                             ...) {
  # Summarise missing values per variable (or per group when `by` is given)
  # and append an overall "Total" row. Returns a data frame with the columns
  # `variable`, `n_missing`, `missing_percent` and `complete_percent`.

  # Optionally narrow the data down to the requested columns first.
  has_selection <- !is.null(select) || !is.null(exclude)
  if (has_selection) {
    data <- data_select(
      data = data,
      select = select,
      exclude = exclude,
      ignore_case = ignore_case,
      regex = regex,
      verbose = verbose,
      ...
    )
  }

  if (is.null(by)) {
    # One summary row per (remaining) column.
    na_list <- lapply(names(data), function(x) {
      .describe_missing(data[, x, drop = FALSE])
    })
  } else {
    # Validate `by` as a whole vector: a scalar `if (!by %in% ...)` errors with
    # "the condition has length > 1" as soon as several columns are supplied.
    by <- unique(as.character(by))
    if (length(by) == 0L || anyNA(by)) {
      stop("`by` must contain at least one column name.", call. = FALSE)
    }
    unknown <- setdiff(by, names(data))
    if (length(unknown) > 0L) {
      stop(
        "The 'by' column does not exist in the data: ",
        toString(unknown),
        if (has_selection) {
          ". Check that `select`/`exclude` do not drop it."
        } else {
          "."
        },
        call. = FALSE
      )
    }

    # Columns to analyse: everything except the grouping column(s).
    cols_to_analyze <- setdiff(names(data), by)
    if (length(cols_to_analyze) == 0L) {
      # Without this guard, rbind() over an empty list returns NULL and the
      # subsequent `$<-` would silently build a malformed result.
      stop(
        "No variables left to describe besides the 'by' column.",
        call. = FALSE
      )
    }

    # Split on the combination of all grouping columns. `drop = TRUE` avoids
    # zero-row groups (unused factor levels or combinations that never occur),
    # which would otherwise yield NaN percentages.
    grouping <- lapply(by, function(b) data[[b]])
    grouped_data <- split(data, grouping, drop = TRUE)

    na_list <- lapply(names(grouped_data), function(group_name) {
      group <- grouped_data[[group_name]]
      group_na_df <- do.call(rbind, lapply(cols_to_analyze, function(x) {
        .describe_missing(group[, x, drop = FALSE])
      }))
      # With a single analysed column the group name alone identifies the row.
      # With several columns, keep the column name so rows stay distinguishable.
      group_na_df$variable <- if (length(cols_to_analyze) == 1L) {
        group_name
      } else {
        paste0(group_name, ": ", cols_to_analyze)
      }
      group_na_df
    })
  }

  na_df <- do.call(rbind, na_list)

  # Sort before adding the total so that "Total" always stays in the last row.
  if (isTRUE(sort)) {
    na_df <- na_df[order(-na_df$missing_percent), ]
  }

  # NOTE(review): when `by` is used, the total is still computed over all
  # columns of `data`, i.e. the grouping column(s) count towards the
  # denominator. Kept as-is to match the recorded snapshots; confirm whether
  # the total should be restricted to the analysed columns.
  na_df_tot <- .describe_missing(data)
  na_df_tot$variable <- "Total"
  rbind(na_df, na_df_tot)
}
|
|
||
# Summarise missingness over all cells of `data`.
#
# Returns a one-row data frame with the name of the first column of `data`
# (`variable`), the number of missing cells (`n_missing`), and the percentage
# of missing / non-missing cells rounded to two decimals (`missing_percent`,
# `complete_percent`). For data without any cells (zero rows or zero columns)
# the percentages are undefined and reported as `NA` instead of `NaN`.
.describe_missing <- function(data) {
  n_missing <- sum(is.na(data))
  # Multiply as doubles: integer `nrow * ncol` overflows to NA on huge data.
  n_cells <- as.numeric(nrow(data)) * ncol(data)
  missing_percent <- if (n_cells > 0) {
    round(n_missing / n_cells * 100, 2)
  } else {
    NA_real_
  }
  data.frame(
    variable = names(data)[1],
    n_missing = n_missing,
    missing_percent = missing_percent,
    complete_percent = 100 - missing_percent,
    # Keep `variable` a character column on R < 4.0 as well.
    stringsAsFactors = FALSE
  )
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| # describe_missing | ||
|
|
||
| Code | ||
| describe_missing(airquality2) | ||
| Output | ||
| variable n_missing missing_percent complete_percent | ||
| 1 Solar.R 7 4.58 95.42 | ||
| 2 Wind 0 0.00 100.00 | ||
| 3 Temp 0 0.00 100.00 | ||
| 4 Month 0 0.00 100.00 | ||
| 5 Day 0 0.00 100.00 | ||
| 6 Ozone 37 24.18 75.82 | ||
| 7 Total 44 4.79 95.21 | ||
|
|
||
| --- | ||
|
|
||
| Code | ||
| describe_missing(airquality2, sort = TRUE) | ||
| Output | ||
| variable n_missing missing_percent complete_percent | ||
| 6 Ozone 37 24.18 75.82 | ||
| 1 Solar.R 7 4.58 95.42 | ||
| 2 Wind 0 0.00 100.00 | ||
| 3 Temp 0 0.00 100.00 | ||
| 4 Month 0 0.00 100.00 | ||
| 5 Day 0 0.00 100.00 | ||
| 11 Total 44 4.79 95.21 | ||
|
|
||
| --- | ||
|
|
||
| Code | ||
| describe_missing(airquality2, select = "Ozone:Temp") | ||
| Output | ||
| variable n_missing missing_percent complete_percent | ||
| 1 Ozone 37 24.18 75.82 | ||
| 2 Day 0 0.00 100.00 | ||
| 3 Month 0 0.00 100.00 | ||
| 4 Temp 0 0.00 100.00 | ||
| 5 Total 37 6.05 93.95 | ||
|
|
||
| --- | ||
|
|
||
| Code | ||
| describe_missing(airquality2, exclude = "Ozone:Temp") | ||
| Output | ||
| variable n_missing missing_percent complete_percent | ||
| 1 Solar.R 7 4.58 95.42 | ||
| 2 Wind 0 0.00 100.00 | ||
| 3 Total 7 2.29 97.71 | ||
|
|
||
| --- | ||
|
|
||
| Code | ||
| describe_missing(df_long, select = -c(1, 3), by = "dimension") | ||
| Output | ||
| variable n_missing missing_percent complete_percent | ||
| 1 agreeableness 10 23.81 76.19 | ||
| 2 extroversion 17 40.48 59.52 | ||
| 3 openness 11 26.19 73.81 | ||
| 4 Total 38 15.08 84.92 | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
test_that("describe_missing", {
  # Move Ozone (the column with most NAs) to the end so that `sort = TRUE`
  # visibly reorders the output.
  airquality2 <- airquality[c(2:6, 1L)]

  # Default call, sorted call, and column selection / exclusion.
  expect_snapshot(describe_missing(airquality2))
  expect_snapshot(describe_missing(airquality2, sort = TRUE))
  expect_snapshot(describe_missing(airquality2, select = "Ozone:Temp"))
  expect_snapshot(describe_missing(airquality2, exclude = "Ozone:Temp"))

  # Simulated survey data for the `by` argument: each item holds 11 values
  # drawn with replacement from NA and 1..10, followed by three trailing NAs.
  set.seed(15)
  make_item <- function() {
    drawn <- sample(c(NA, 1:10), replace = TRUE)
    c(drawn, NA, NA, NA)
  }
  survey_wide <- data.frame(
    ID = c("idz", NA),
    openness_1 = make_item(),
    openness_2 = make_item(),
    openness_3 = make_item(),
    extroversion_1 = make_item(),
    extroversion_2 = make_item(),
    extroversion_3 = make_item(),
    agreeableness_1 = make_item(),
    agreeableness_2 = make_item(),
    agreeableness_3 = make_item(),
    stringsAsFactors = FALSE
  )

  # Reshape to long format so that the scale name becomes a grouping column.
  df_long <- reshape_longer(
    survey_wide,
    select = -1,
    names_sep = "_",
    names_to = c("dimension", "item")
  )

  # Missingness of the item values, reported separately for each dimension.
  expect_snapshot(describe_missing(df_long, select = -c(1, 3), by = "dimension"))
})
Uh oh!
There was an error while loading. Please reload this page.