# Set the `repos` option to a fixed CRAN mirror; install.packages() reads this
# option by default when choosing where to download packages from.
options(repos = "https://cran.rstudio.com/")
Load required packages
# Install naniar and visdat only when they are not already available, so that
# re-running the script does not download and reinstall them every time.
# (Download messages are therefore printed only on a machine that lacks them.)
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
if (!requireNamespace("visdat", quietly = TRUE)) {
  install.packages("visdat")
}
library(rmarkdown)
library("naniar")
library("visdat")
Look at structure of dataset
# Compact structural summary of the built-in esoph data frame.
utils::str(datasets::esoph)
## 'data.frame': 88 obs. of 5 variables:
## $ agegp : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ alcgp : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
## $ tobgp : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
## $ ncases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ncontrols: num 40 10 6 5 27 7 4 7 2 1 ...
Visualize entire dataset
# Whole-data-frame overview plots from visdat, called through the package
# namespace so the origin of each function is explicit.
visdat::vis_dat(esoph)
visdat::vis_miss(esoph)
library(ggplot2)
# Scatter plot of alcohol group against age group; the aesthetic mapping is
# supplied on the point layer instead of on the plot object.
ggplot(data = esoph) +
  geom_point(mapping = aes(x = agegp, y = alcgp))
library(naniar)
# Same axes as the plain geom_point() scatter plot, but drawn with naniar's
# geom_miss_point() so that rows with a missing x or y value would still be
# displayed (in a separate colour) instead of being dropped from the plot.
ggplot(esoph,
       aes(x = agegp,
           y = alcgp)) +
  geom_miss_point()
Visualizing the missings in the data.
# Plot the amount of missingness per variable, then print the shadow matrix
# (one `<name>_NA` indicator column per original column).
naniar::gg_miss_var(esoph)
naniar::as_shadow(esoph)
## # A tibble: 88 × 5
## agegp_NA alcgp_NA tobgp_NA ncases_NA ncontrols_NA
## <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA
## 10 !NA !NA !NA !NA !NA
## # ℹ 78 more rows
# Build the data plus its shadow columns in two ways so the results can be
# compared further down.
es_shadow <- naniar::bind_shadow(esoph)
es_nab <- naniar::nabular(esoph)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Column-wise preview of the bind_shadow() result.
es_shadow %>% glimpse()
## Rows: 88
## Columns: 10
## $ agegp <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 2…
## $ alcgp <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 40-79,…
## $ tobgp <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29, 30+,…
## $ ncases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ ncontrols <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 0, 1, 2, 60, 13, 7…
## $ agegp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ alcgp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ tobgp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncases_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncontrols_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
# Column-wise preview of the nabular() result.
es_nab %>% glimpse()
## Rows: 88
## Columns: 10
## $ agegp <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 2…
## $ alcgp <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 40-79,…
## $ tobgp <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29, 30+,…
## $ ncases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ ncontrols <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 0, 1, 2, 60, 13, 7…
## $ agegp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ alcgp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ tobgp_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncases_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncontrols_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
# Check that both construction routes yield the same object.
all.equal(target = es_shadow, current = es_nab)
## [1] TRUE
Plotting the distribution of agegp, split by whether alcgp is missing or not.
# agegp is an ordered factor, so its distribution is shown as counts per level
# (bars) rather than as a kernel density estimate, which is intended for
# continuous variables. Bars are split by the alcgp_NA shadow column, i.e. by
# whether alcgp is missing or not.
ggplot(es_shadow,
       aes(x = agegp,
           fill = alcgp_NA)) +
  geom_bar(position = "dodge")
Numerical summaries of missing values in esoph
# Distinct counts: for the whole data frame, then for the agegp column.
esoph %>% n_distinct()
## [1] 88
esoph[["agegp"]] %>% n_distinct()
## [1] 6
# Number of missing values: whole data frame, then agegp only.
esoph %>% n_miss()
## [1] 0
esoph[["agegp"]] %>% n_miss()
## [1] 0
# Number of non-missing values: whole data frame, then agegp only.
esoph %>% n_complete()
## [1] 440
esoph[["agegp"]] %>% n_complete()
## [1] 88
Proportion and percentage of cases (rows) in the dataset that contain a missing value
# Share of cases (rows) with a missing value, as a proportion and as a percent.
esoph %>% prop_miss_case()
## [1] 0
esoph %>% pct_miss_case()
## [1] 0
Tabulating the number of missing values in a row.
# Tabulate how many cases have 0, 1, 2, ... missing values.
esoph %>% miss_case_table()
## # A tibble: 1 × 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 88 100
Returns the percent and proportion of variables that contain a missing value.
# Proportion of variables that contain a missing value.
esoph %>% prop_miss_var()
## [1] 0
# Per-variable missing counts and percentages.
esoph %>% miss_var_summary()
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 agegp 0 0
## 2 alcgp 0 0
## 3 tobgp 0 0
## 4 ncases 0 0
## 5 ncontrols 0 0
# Tabulation of variables by their number of missing values.
esoph %>% miss_var_table()
## # A tibble: 1 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 5 100
# Runs of consecutive missing / complete values in agegp.
esoph %>% miss_var_run(agegp)
## # A tibble: 1 × 2
## run_length is_na
## <int> <chr>
## 1 88 complete
Determining the number of missing values over a specified repeating span of rows for a variable of a data frame.
# Missingness of agegp counted within consecutive spans of 100 rows.
esoph %>% miss_var_span(agegp, span_every = 100)
## # A tibble: 1 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 0 88 0 1 88
# Per-variable missingness summary, written as a direct call.
miss_var_summary(esoph)
## # A tibble: 5 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 agegp 0 0
## 2 alcgp 0 0
## 3 tobgp 0 0
## 4 ncases 0 0
## 5 ncontrols 0 0
Viewing the number of missing values of alcgp within each age group (agegp) of the esoph data
# Summarise missingness within each agegp group and keep only the rows that
# describe alcgp. Verbs are namespace-qualified to make their origin explicit.
esoph %>%
  dplyr::group_by(agegp) %>%
  naniar::miss_var_summary() %>%
  dplyr::filter(variable == "alcgp")
## # A tibble: 6 × 4
## # Groups: agegp [6]
## agegp variable n_miss pct_miss
## <ord> <chr> <int> <dbl>
## 1 25-34 alcgp 0 0
## 2 35-44 alcgp 0 0
## 3 45-54 alcgp 0 0
## 4 55-64 alcgp 0 0
## 5 65-74 alcgp 0 0
## 6 75+ alcgp 0 0
Calculating the proportion of missing values in each row
# Append the per-row proportion of missing values and show the first rows.
head(add_prop_miss(esoph))
## agegp alcgp tobgp ncases ncontrols prop_miss_all
## 1 25-34 0-39g/day 0-9g/day 0 40 0
## 2 25-34 0-39g/day 10-19 0 10 0
## 3 25-34 0-39g/day 20-29 0 6 0
## 4 25-34 0-39g/day 30+ 0 5 0
## 5 25-34 40-79 0-9g/day 0 27 0
## 6 25-34 40-79 10-19 0 7 0
Using a decision tree to predict which variables and their values are important for predicting the proportion of missingness
library("rpart")
# Install rpart.plot only when it is not already available, so re-running the
# script does not download it again.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library("rpart.plot")
# Fit a regression tree for the per-row proportion of missing values and plot
# it. model = TRUE stores the model frame in the fitted object; without it,
# prp() cannot recover the data that was piped in as `.` and emits the
# "Cannot retrieve the data used to build the model" warning.
esoph %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss = ")