# Set the repository used by install.packages() for this session.
options(list(repos = "https://cran.rstudio.com/"))

R Markdown

Load required packages

# Install naniar and visdat only when they are not already available, so that
# re-running the script does not download them again on every run.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
if (!requireNamespace("visdat", quietly = TRUE)) {
  install.packages("visdat")
}

# Attach the packages used in the rest of the script.
library(rmarkdown)
library(naniar)
library(visdat)

Look at structure of dataset

# Print a compact description of the esoph data frame (columns and types).
str(object = esoph)
## 'data.frame':    88 obs. of  5 variables:
##  $ agegp    : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ alcgp    : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ tobgp    : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ ncases   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ncontrols: num  40 10 6 5 27 7 4 7 2 1 ...

Visualize entire dataset

# Whole-dataset overview plots from visdat.
vis_dat(x = esoph)

vis_miss(x = esoph)

# Point plot of alcohol group against age group.
library(ggplot2)
ggplot(data = esoph, mapping = aes(x = agegp, y = alcgp)) +
  geom_point()

# NOTE(review): this repeats the plot above unchanged. naniar is attached again
# but no naniar geom is used; possibly geom_miss_point() was intended -- confirm.
library(naniar)
ggplot(data = esoph, mapping = aes(x = agegp, y = alcgp)) +
  geom_point()

Visualizing the missing values in the data.

# Plot missingness per variable for esoph.
gg_miss_var(x = esoph)

# Shadow form of esoph: one `<name>_NA` column per original variable, as shown
# in the printed tibble.
as_shadow(data = esoph)
## # A tibble: 88 × 5
##    agegp_NA alcgp_NA tobgp_NA ncases_NA ncontrols_NA
##    <fct>    <fct>    <fct>    <fct>     <fct>       
##  1 !NA      !NA      !NA      !NA       !NA         
##  2 !NA      !NA      !NA      !NA       !NA         
##  3 !NA      !NA      !NA      !NA       !NA         
##  4 !NA      !NA      !NA      !NA       !NA         
##  5 !NA      !NA      !NA      !NA       !NA         
##  6 !NA      !NA      !NA      !NA       !NA         
##  7 !NA      !NA      !NA      !NA       !NA         
##  8 !NA      !NA      !NA      !NA       !NA         
##  9 !NA      !NA      !NA      !NA       !NA         
## 10 !NA      !NA      !NA      !NA       !NA         
## # ℹ 78 more rows
# Build the data with its shadow columns attached in two ways; the
# all.equal() comparison further down reports the two results as equal.
es_nab <- nabular(data = esoph)
es_shadow <- bind_shadow(data = esoph)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Column-wise preview of the bind_shadow() result.
es_shadow %>% glimpse()
## Rows: 88
## Columns: 10
## $ agegp        <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 2…
## $ alcgp        <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 40-79,…
## $ tobgp        <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29, 30+,…
## $ ncases       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ ncontrols    <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 0, 1, 2, 60, 13, 7…
## $ agegp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ alcgp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ tobgp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncases_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncontrols_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
# Column-wise preview of the nabular() result.
es_nab %>% glimpse()
## Rows: 88
## Columns: 10
## $ agegp        <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 2…
## $ alcgp        <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 40-79,…
## $ tobgp        <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29, 30+,…
## $ ncases       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ ncontrols    <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 0, 1, 2, 60, 13, 7…
## $ agegp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ alcgp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ tobgp_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncases_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
## $ ncontrols_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !N…
# Check that bind_shadow() and nabular() produced the same object.
all.equal(target = es_shadow, current = es_nab)
## [1] TRUE

Plotting the distribution of agegp, split by whether alcgp is missing or not.

# agegp is an ordered factor, so its distribution is shown as counts per level;
# a kernel density is only meaningful for a continuous variable. Bars are split
# by the alcgp_NA shadow column, i.e. by whether alcgp is missing.
ggplot(es_shadow,
       aes(x = agegp,
           fill = alcgp_NA)) +
  geom_bar(position = "dodge")

Numerical summaries of missing values in esoph

# Distinct counts: for the whole data frame, then for the agegp column alone.
n_distinct(esoph)
## [1] 88
n_distinct(esoph[["agegp"]])
## [1] 6
# Number of missing values: whole data frame, then agegp.
n_miss(esoph)
## [1] 0
n_miss(esoph[["agegp"]])
## [1] 0
# Number of complete (non-missing) values: whole data frame, then agegp.
n_complete(esoph)
## [1] 440
n_complete(esoph[["agegp"]])
## [1] 88

Proportion and percentage of missing values in dataset

# Case-wise (row-wise) missingness, first as a proportion, then as a percentage.
prop_miss_case(data = esoph)
## [1] 0
pct_miss_case(data = esoph)
## [1] 0

Tabulating the number of missing values in a row.

# Tabulate how many rows have each number of missing values.
miss_case_table(data = esoph)
## # A tibble: 1 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0      88       100

Returns the percent and proportion of variables that contain a missing value.

# Proportion of variables that contain a missing value.
prop_miss_var(data = esoph)
## [1] 0
# Per-variable count and percentage of missing values.
miss_var_summary(data = esoph)
## # A tibble: 5 × 3
##   variable  n_miss pct_miss
##   <chr>      <int>    <dbl>
## 1 agegp          0        0
## 2 alcgp          0        0
## 3 tobgp          0        0
## 4 ncases         0        0
## 5 ncontrols      0        0
# Tabulate how many variables have each number of missing values.
miss_var_table(data = esoph)
## # A tibble: 1 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      5      100
# Runs of consecutive missing / complete values in agegp.
miss_var_run(data = esoph, var = agegp)
## # A tibble: 1 × 2
##   run_length is_na   
##        <int> <chr>   
## 1         88 complete

Determining the number of missing values over a specified repeating span of rows in a variable of a data frame.

# Missingness of agegp per span of rows. span_every = 100 is larger than the
# 88 rows of esoph, so the result has a single span.
miss_var_span(data = esoph, var = agegp, span_every = 100)
## # A tibble: 1 × 6
##   span_counter n_miss n_complete prop_miss prop_complete n_in_span
##          <int>  <int>      <int>     <dbl>         <dbl>     <int>
## 1            1      0         88         0             1        88
# Per-variable missing-value summary for esoph.
miss_var_summary(esoph)
## # A tibble: 5 × 3
##   variable  n_miss pct_miss
##   <chr>      <int>    <dbl>
## 1 agegp          0        0
## 2 alcgp          0        0
## 3 tobgp          0        0
## 4 ncases         0        0
## 5 ncontrols      0        0

Viewing the number of missing values of alcgp within each agegp group of the esoph data

# Missing-value summary computed within each agegp level, keeping only the
# rows that describe alcgp.
group_by(esoph, agegp) %>%
  miss_var_summary() %>%
  filter(variable %in% "alcgp")
## # A tibble: 6 × 4
## # Groups:   agegp [6]
##   agegp variable n_miss pct_miss
##   <ord> <chr>     <int>    <dbl>
## 1 25-34 alcgp         0        0
## 2 35-44 alcgp         0        0
## 3 45-54 alcgp         0        0
## 4 55-64 alcgp         0        0
## 5 65-74 alcgp         0        0
## 6 75+   alcgp         0        0

Calculating the proportion of missing values in each row

# Append the per-row proportion of missing values (prop_miss_all) and show the
# first rows.
head(add_prop_miss(esoph))
##   agegp     alcgp    tobgp ncases ncontrols prop_miss_all
## 1 25-34 0-39g/day 0-9g/day      0        40             0
## 2 25-34 0-39g/day    10-19      0        10             0
## 3 25-34 0-39g/day    20-29      0         6             0
## 4 25-34 0-39g/day      30+      0         5             0
## 5 25-34     40-79 0-9g/day      0        27             0
## 6 25-34     40-79    10-19      0         7             0

Using a decision tree to predict which variables and their values are important for predicting the proportion of missingness

# Install rpart.plot only when it is not already available, so that re-running
# the script does not download it again on every run.
library(rpart)
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library(rpart.plot)

# Fit a regression tree for the per-row proportion of missing values on all
# other columns and plot it. model = TRUE keeps the model frame in the fitted
# object; without it prp() warned that it could not retrieve the data used to
# build the model, and the warning itself recommends this setting.
esoph %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss = ")