Dealing with Missing Data in `R`

DataCamp: Statistics with `R`

Bonnie Cooper

library( dplyr )
library( ggplot2 )
library( gridExtra )
library( tidyverse )
library( naniar )
library( simputation )

Why care about missing data?

Introduction to missing data

“The best thing to do with missing data is to not have any.” -Gertrude M. Cox

Working with real-world data == working with missing data
Missing Data can have unexpected effects on your analysis
Bad imputation can lead to poor estimates and decisions

Checking for missing values with any_na()

x <- c( 1, NA, 3, NA, NA, 5 )
baset0 <- Sys.time()
any( is.na( x ) )

## [1] TRUE

timeint_base <- Sys.time() - baset0

naniart0 <- Sys.time()
any_na( x )

## [1] TRUE

timeint_naniar <- Sys.time() - naniart0

base2t0 <- Sys.time()
anyNA( x )

## [1] TRUE

timeint_base2 <- Sys.time() - base2t0

res <- paste( 'Comparing runtime for methods to detect NA:\nbase R runtime:', timeint_base,
              '\nanyNA() base r runtime', timeint_base2,
              '\nany_na() naniar runtime:', timeint_naniar)
cat( res, sep = '\n')

## Comparing runtime for methods to detect NA:
## base R runtime: 0.00213027000427246 
## anyNA() base r runtime 0.0020744800567627 
## any_na() naniar runtime: 0.00191235542297363

#return a boolean vector that tests for NAs
are_na( x )

## [1] FALSE  TRUE FALSE  TRUE  TRUE FALSE

#return the number of NAs
n_miss( x )

## [1] 3

#return the proportion of NAs
prop_miss( x )

## [1] 0.5

Generally, operations with NA values returns an NA value:

heights <- data.frame( 'Sophie' = 165, 'Dan' = 177, 'Fred' = NA )
sum( heights )

## [1] NA

Important Distinctions:

NaN == Not a Number. is evaluated the same as NA
NULL == empty. is Not the same as NA
Inf == Infinity. is Not the same as NA

r1 <- any_na( NaN )
r2 <- any_na( NULL )
r3 <- any_na( Inf )
r4 <- any_na( 0 )

res <- paste( 'When tested any_na()::\nNaN evaluates:', r1,
              '\nNULL evaluates:', r2,
              '\nInf evaluates:', r3,
              '\n0 evaluates:', r4 )
cat( res, sep='\n' )

## When tested any_na()::
## NaN evaluates: TRUE 
## NULL evaluates: FALSE 
## Inf evaluates: FALSE 
## 0 evaluates: FALSE

Conditional Statement Behaviors to look out for:

r1 <- NA | TRUE
r2 <- NA | FALSE
r3 <- NA | NaN
r4 <- NaN | NA

res <- paste( 'Conditional Statement Behaviors to be aware of::\nNA | TRUE evaluates:', r1,
              '\nNA | FALSE evaluates:', r2,
              '\nNA | NaN evaluates:', r3,
              '\nNaN | NA evaluates:', r4 )
cat( res, sep='\n' )

## Conditional Statement Behaviors to be aware of::
## NA | TRUE evaluates: TRUE 
## NA | FALSE evaluates: NA 
## NA | NaN evaluates: NA 
## NaN | NA evaluates: NA

# Create x, a vector, with values NA, NaN, Inf, ".", and "missing"
x <- c(NA, NaN, Inf, ".", "missing")

# Use any_na() and are_na() on to explore the missings
any_na(x)

## [1] TRUE

are_na(x)

## [1]  TRUE FALSE FALSE FALSE FALSE

dat_hw_url <- 'https://raw.githubusercontent.com/SmilodonCub/ReadingLearningTinkering/master/DataCamp/Statistics_with_R/dat_hw.csv'
dat_hw <- read.csv( dat_hw_url ) %>%
  select( -X )
head( dat_hw )

##      weight     height
## 1        NA  2.3881462
## 2  91.20470  1.0014508
## 3  81.57915         NA
## 4  76.84886         NA
## 5 111.01731 -0.2412422
## 6  90.15135  2.5207375

# Use n_miss() to count the total number of missing values in dat_hw
n_miss(dat_hw)

## [1] 30

# Use n_miss() on dat_hw$weight to count the total number of missing values
n_miss(dat_hw$weight)

## [1] 15

# Use n_complete() on dat_hw to count the total number of complete values
n_complete(dat_hw)

## [1] 170

# Use n_complete() on dat_hw$weight to count the total number of complete values
n_complete(dat_hw$weight)

## [1] 85

# Use prop_miss() and prop_complete() on dat_hw to count the total number of missing values in each of the variables
prop_miss(dat_hw)

## [1] 0.15

prop_complete(dat_hw)

## [1] 0.85

Why care about missing values?

Introduction to missingness summaries

Basic summary missingness:

n_miss( x )

## [1] 1

n_complete( x )

## [1] 4

Dataframe summaries of missingness:

miss_var_summary(): summarize the number of missing in each variable/feature/column

miss_x_cols <- miss_var_summary( dat_hw )
glimpse( miss_x_cols )

## Rows: 2
## Columns: 3
## $ variable <chr> "weight", "height"
## $ n_miss   <int> 15, 15
## $ pct_miss <dbl> 15, 15

miss_x_cols <- miss_var_summary( airquality )
glimpse( miss_x_cols )

## Rows: 6
## Columns: 3
## $ variable <chr> "Ozone", "Solar.R", "Wind", "Temp", "Month", "Day"
## $ n_miss   <int> 37, 7, 0, 0, 0, 0
## $ pct_miss <dbl> 24.183007, 4.575163, 0.000000, 0.000000, 0.000000, 0.000000

miss_case_summary: each case is a row in the dataframe. info on missing values by row.

dim( miss_case_summary( dat_hw ) )

## [1] 100   3

head( miss_case_summary( airquality ) )

## # A tibble: 6 x 3
##    case n_miss pct_miss
##   <int>  <int>    <dbl>
## 1     5      2     33.3
## 2    27      2     33.3
## 3     6      1     16.7
## 4    10      1     16.7
## 5    11      1     16.7
## 6    25      1     16.7

Missing Data Tabulations:

miss_var_table() returns a dataframe with info on the variables missing data as well as the percentage of variables affected by missing data

miss_var_table( dat_hw )

## # A tibble: 1 x 3
##   n_miss_in_var n_vars pct_vars
## *         <int>  <int>    <dbl>
## 1            15      2      100

can be interpretted as: 2 variables are missing 15 observations each. 100% of the variables in the dataframe are affected this way

miss_var_table( airquality )

## # A tibble: 3 x 3
##   n_miss_in_var n_vars pct_vars
## *         <int>  <int>    <dbl>
## 1             0      4     66.7
## 2             7      1     16.7
## 3            37      1     16.7

can be interpretted as: 66.6% of the features in this dataframe (total of 4 features) are missing 0 observations. One variables (16.6% of features) is missing 7 observations while another variable (16.6% of features) is missing 37 observations.

miss_case_table(): returns the same information but by cases (rows)

miss_case_table( dat_hw )

## # A tibble: 2 x 3
##   n_miss_in_case n_cases pct_cases
## *          <int>   <int>     <dbl>
## 1              0      70        70
## 2              1      30        30

can be interpretted as: 70% of rows (70 rows) are missing 0 observations. 30% of rows (30 rows) are missing 1 observation.

miss_case_table( airquality )

## # A tibble: 3 x 3
##   n_miss_in_case n_cases pct_cases
## *          <int>   <int>     <dbl>
## 1              0     111     72.5 
## 2              1      40     26.1 
## 3              2       2      1.31

can be interpretted as: 72.5% of rows (111 rows) are missing 0 observations. 26.1% of rows (40 rows) are missing 1 observation. 1.3% or rows (2 rows) are missing 2 observations.

Other useful functions:

miss_var_span() summarizes missing data by span of data (good for time series analysis e.g. weekly spans of 7)
miss_var_run() summarizes runs of missing data. good for finding unusual patterns of missing data. returns runs of complete and missing data. great for sinding systemic sampling error.

Using summaries with group_by():

airquality %>%
  group_by( Month ) %>%
  miss_var_summary()

## # A tibble: 25 x 4
## # Groups:   Month [5]
##    Month variable n_miss pct_miss
##    <int> <chr>     <int>    <dbl>
##  1     5 Ozone         5     16.1
##  2     5 Solar.R       4     12.9
##  3     5 Wind          0      0  
##  4     5 Temp          0      0  
##  5     5 Day           0      0  
##  6     6 Ozone        21     70  
##  7     6 Solar.R       0      0  
##  8     6 Wind          0      0  
##  9     6 Temp          0      0  
## 10     6 Day           0      0  
## # … with 15 more rows

glimpse( pedestrian )

## Rows: 37,700
## Columns: 9
## $ hourly_counts <int> 883, 597, 294, 183, 118, 68, 47, 52, 120, 333, 761, 1352…
## $ date_time     <dttm> 2016-01-01 00:00:00, 2016-01-01 01:00:00, 2016-01-01 02…
## $ year          <int> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ month         <ord> January, January, January, January, January, January, Ja…
## $ month_day     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ week_day      <ord> Friday, Friday, Friday, Friday, Friday, Friday, Friday, …
## $ hour          <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ sensor_id     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ sensor_name   <chr> "Bourke Street Mall (South)", "Bourke Street Mall (South…

miss_var_table( pedestrian )

## # A tibble: 2 x 3
##   n_miss_in_var n_vars pct_vars
## *         <int>  <int>    <dbl>
## 1             0      8     88.9
## 2          2548      1     11.1

# Calculate the summaries for each run of missingness for the variable, hourly_counts
miss_var_run(pedestrian, var = hourly_counts)

## # A tibble: 35 x 2
##    run_length is_na   
##         <int> <chr>   
##  1       6628 complete
##  2          1 missing 
##  3       5250 complete
##  4        624 missing 
##  5       3652 complete
##  6          1 missing 
##  7       1290 complete
##  8        744 missing 
##  9       7420 complete
## 10          1 missing 
## # … with 25 more rows

# Calculate the summaries for each span of missingness, 
# for a span of 4000, for the variable hourly_counts
miss_var_span(pedestrian, var = hourly_counts, span_every = 4000)

## # A tibble: 10 x 5
##    span_counter n_miss n_complete prop_miss prop_complete
##  *        <int>  <int>      <dbl>     <dbl>         <dbl>
##  1            1      0       4000   0               1    
##  2            2      1       3999   0.00025         1.00 
##  3            3    121       3879   0.0302          0.970
##  4            4    503       3497   0.126           0.874
##  5            5    745       3255   0.186           0.814
##  6            6      0       4000   0               1    
##  7            7      1       3999   0.00025         1.00 
##  8            8      0       4000   0               1    
##  9            9    745       3255   0.186           0.814
## 10           10    432       3568   0.108           0.892

# For each `month` variable, calculate the run of missingness for hourly_counts
pedestrian %>% group_by(month) %>% miss_var_run(hourly_counts)

## # A tibble: 51 x 3
## # Groups:   month [12]
##    month    run_length is_na   
##    <ord>         <int> <chr>   
##  1 January        2976 complete
##  2 February       2784 complete
##  3 March          2976 complete
##  4 April           888 complete
##  5 April           552 missing 
##  6 April          1440 complete
##  7 May             744 complete
##  8 May              72 missing 
##  9 May            2160 complete
## 10 June           2880 complete
## # … with 41 more rows

# For each `month` variable, calculate the span of missingness 
# of a span of 2000, for the variable hourly_counts
pedestrian %>% group_by(month) %>% miss_var_span(var = hourly_counts, span_every = 2000)

## # A tibble: 25 x 6
## # Groups:   month [12]
##    month    span_counter n_miss n_complete prop_miss prop_complete
##    <ord>           <int>  <int>      <dbl>     <dbl>         <dbl>
##  1 January             1      0       2000     0             1    
##  2 January             2      0       2000     0             1    
##  3 February            1      0       2000     0             1    
##  4 February            2      0       2000     0             1    
##  5 March               1      0       2000     0             1    
##  6 March               2      0       2000     0             1    
##  7 April               1    552       1448     0.276         0.724
##  8 April               2      0       2000     0             1    
##  9 May                 1     72       1928     0.036         0.964
## 10 May                 2      0       2000     0             1    
## # … with 15 more rows

How do we visual missing values?

naniar missing data visualization methods.

Overview of missingness: a type of heatmap for missing data. black == missing. also provides basic stats of proportions of missingness.

vis_miss( airquality )

vis_miss( dat_hw )

vis_miss( airquality )

vis_miss( airquality, cluster = TRUE)

vis_miss( dat_hw, cluster = TRUE)

Looking at missing observations in both variables and cases

varp <- gg_miss_var( airquality )
casep <- gg_miss_case( airquality )
grid.arrange( varp, casep, ncol = 2 )

varp <- gg_miss_var( dat_hw )
casep <- gg_miss_case( dat_hw )
grid.arrange( varp, casep, ncol = 2 )

faceting a gg_miss_var() plot acts like visualizing a group_by

gg_miss_var( airquality, facet = Month )

Visualizing missingness patterns

gg_miss_upset() shows co-occuring missing observations across variables

gg_miss_upset( airquality )

gg_miss_fct(): visualizing factors for missingness. again, kinda like visualizing missing group_by result. Gives a heat map view for each feature (y-axis) and each montt (x-axis) where color intensity is the number of missing observations.

gg_miss_fct( x = airquality, fct = Month )

gg_miss_span() visualizes the number of missing observations for a given span.

gg_miss_span( pedestrian, hourly_counts, span_every = 3000)

# Visualize all of the missingness in the `riskfactors`  dataset
vm <- vis_miss(riskfactors) +
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# Visualize and cluster all of the missingness in the `riskfactors` dataset
vmc <-vis_miss(riskfactors, cluster = TRUE) +
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# visualize and sort the columns by missingness in the `riskfactors` dataset
vms <- vis_miss(riskfactors, sort_miss = TRUE) +
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# visualize cluster and sort the columns by missingness in the `riskfactors` dataset
vmcs <- vis_miss(riskfactors, sort_miss = TRUE, cluster = TRUE ) +
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

grid.arrange( vm, vmc, vms, vmcs, ncol = 2 )

# Visualize the number of missings in cases using `gg_miss_case()`
caserf <- gg_miss_case(riskfactors)

# Explore the number of missings in cases using `gg_miss_case()` 
# and facet by the variable `education`
fct_caserf <- gg_miss_case(riskfactors, facet = education)

grid.arrange( caserf, fct_caserf, ncol = 2 )

# Visualize the number of missings in variables using `gg_miss_var()`
varrf <- gg_miss_var(riskfactors)

# Explore the number of missings in variables using `gg_miss_var()` 
# and facet by the variable `education`
fct_varrf <- gg_miss_var(riskfactors, facet = education)

grid.arrange( varrf, fct_varrf, ncol = 2 )

# With the riskfactors dataset, explore how the missingness changes across the marital variable using gg_miss_fct()
gg_miss_fct(x = riskfactors, fct = marital)

# Using the pedestrian dataset, explore how the missingness of hourly_counts changes over a span of 3000 
gg_miss_span(pedestrian, var = hourly_counts, span_every = 3000)

# Using the pedestrian dataset, explore the impact of month by faceting by month
# and explore how missingness changes for a span of 1000
gg_miss_span(pedestrian, var = hourly_counts , span_every = 1000, facet = month)

Wrangling and tidying up missing values.

Searching for an replacing missing values

Assumptions with missing data: finding missing values and labelling with NA

In a perfect word, missing data is labelled NA. However, it may be the case that it is coded as ‘missing’, ‘Not Available’, ‘N/A’ or some other permutation.

miss_search_count(): Searching for missing values

employees_url <- 'https://raw.githubusercontent.com/ChaitanyaBaweja/Programming-Tutorials/master/Missing-Data-Pandas/employees.csv'
employees <- read.csv( employees_url )
glimpse( employees )

## Rows: 1,000
## Columns: 6
## $ First.Name        <chr> "Douglas", "Thomas", "Maria", "Jerry", "Larry", "Den…
## $ Gender            <chr> "Male", "Male", "Female", "Male", "Male", "n.a.", "F…
## $ Salary            <chr> "97308", "61933", "130590", NA, "101004", "115163", …
## $ Bonus..           <chr> "6.945", "NaN", "11.858", "9.34", "1.389", "10.125",…
## $ Senior.Management <chr> "TRUE", "TRUE", "FALSE", "TRUE", "TRUE", "FALSE", "T…
## $ Team              <chr> "Marketing", "", "Finance", "Finance", "Client Servi…

#explicitly search for strange NA mislabellings
employees %>%
  miss_scan_count( search = list('n.a', 'na') )

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name           58
## 2 Gender                1
## 3 Salary                1
## 4 Bonus..               0
## 5 Senior.Management     2
## 6 Team                106

employees %>%
  miss_scan_count( search = common_na_strings )

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name          998
## 2 Gender              999
## 3 Salary              999
## 4 Bonus..            1000
## 5 Senior.Management  1000
## 6 Team               1000

print( common_na_strings )

##  [1] "NA"     "N A"    "N/A"    "NA "    " NA"    "N /A"   "N / A"  " N / A"
##  [9] "N / A " "na"     "n a"    "n/a"    "na "    " na"    "n /a"   "n / a" 
## [17] " a / a" "n / a " "NULL"   "null"   ""       "\\?"    "\\*"    "\\."

replace_with_na(): replace specified values with NA

replace_with_na_all(): all variables
replace_with_na_at(): a subset of selected variables
replace_with_na_if(): a subset of variables that fulfill some condition

#replace all instances of 'N/A' and 'N/a' from the feature `grade`
employees %>%
  replace_with_na( replace = list( Team = c('n.a', 'na') ) ) %>%
  miss_scan_count( search = common_na_strings )

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name          998
## 2 Gender              999
## 3 Salary              999
## 4 Bonus..            1000
## 5 Senior.Management  1000
## 6 Team                997

#replace any instance of -99 with `NA`
employees %>%
  replace_with_na_all( condition = ~.x == "" ) %>%
  miss_scan_count( search = common_na_strings )

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name          931
## 2 Gender              852
## 3 Salary              998
## 4 Bonus..            1000
## 5 Senior.Management   933
## 6 Team                957

#replace multiple with `NA`
employees %>%
  replace_with_na_all( condition = ~.x %in% common_na_strings ) %>%
  miss_scan_count( search = common_na_strings )

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name          931
## 2 Gender              852
## 3 Salary              998
## 4 Bonus..            1000
## 5 Senior.Management   933
## 6 Team                954

# Explore the strange missing values "N/A"
miss_scan_count(data = pedestrian, search = list("N/A") )

## # A tibble: 9 x 2
##   Variable          n
##   <chr>         <int>
## 1 hourly_counts     0
## 2 date_time         0
## 3 year              0
## 4 month             0
## 5 month_day         0
## 6 week_day          0
## 7 hour              0
## 8 sensor_id         0
## 9 sensor_name       0

# Explore the strange missing values "missing"
miss_scan_count(data = pedestrian, search = list("missing") )

## # A tibble: 9 x 2
##   Variable          n
##   <chr>         <int>
## 1 hourly_counts     0
## 2 date_time         0
## 3 year              0
## 4 month             0
## 5 month_day         0
## 6 week_day          0
## 7 hour              0
## 8 sensor_id         0
## 9 sensor_name       0

# Explore the strange missing values "na"
miss_scan_count(data = pedestrian, search = list('na' ) )

## # A tibble: 9 x 2
##   Variable          n
##   <chr>         <int>
## 1 hourly_counts     0
## 2 date_time         0
## 3 year              0
## 4 month             0
## 5 month_day         0
## 6 week_day          0
## 7 hour              0
## 8 sensor_id         0
## 9 sensor_name       0

# Explore the strange missing values " " (a single space)
miss_scan_count(data = pedestrian, search = list(' ') )

## # A tibble: 9 x 2
##   Variable          n
##   <chr>         <int>
## 1 hourly_counts     0
## 2 date_time     37700
## 3 year              0
## 4 month             0
## 5 month_day         0
## 6 week_day          0
## 7 hour              0
## 8 sensor_id         0
## 9 sensor_name   37700

# Explore all of the strange missing values, "N/A", "missing", "na", " "
miss_scan_count(data = pedestrian, search = list("N/A", "missing", "na", " "))

## # A tibble: 9 x 2
##   Variable          n
##   <chr>         <int>
## 1 hourly_counts     0
## 2 date_time     37700
## 3 year              0
## 4 month             0
## 5 month_day         0
## 6 week_day          0
## 7 hour              0
## 8 sensor_id         0
## 9 sensor_name   37700

# Print the top of the pacman data using `head()`
glimpse(employees)

## Rows: 1,000
## Columns: 6
## $ First.Name        <chr> "Douglas", "Thomas", "Maria", "Jerry", "Larry", "Den…
## $ Gender            <chr> "Male", "Male", "Female", "Male", "Male", "n.a.", "F…
## $ Salary            <chr> "97308", "61933", "130590", NA, "101004", "115163", …
## $ Bonus..           <chr> "6.945", "NaN", "11.858", "9.34", "1.389", "10.125",…
## $ Senior.Management <chr> "TRUE", "TRUE", "FALSE", "TRUE", "TRUE", "FALSE", "T…
## $ Team              <chr> "Marketing", "", "Finance", "Finance", "Client Servi…

# Replace the strange missing values "N/A", "na", and  
# "missing" with `NA` for the variables, year, and score
emp_clean <- replace_with_na(data = employees, replace = list(Team = c("", "na", "n.a","NaN"),
                                Bonus.. = c("", "na", "n.a","NaN")))
                                        
# Test if `pacman_clean` still has these values in it?
miss_scan_count(emp_clean, search = list("", "na", "n.a", "NaN"))

## # A tibble: 6 x 2
##   Variable              n
##   <chr>             <int>
## 1 First.Name          998
## 2 Gender              999
## 3 Salary              999
## 4 Bonus..             997
## 5 Senior.Management  1000
## 6 Team                954

# Use `replace_with_na_at()` to replace with NA
rp1 <- replace_with_na_at(employees,
                   .vars = c('First.Name', 'Gender', 'Team'), 
                   ~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp1 )

##   First.Name Gender Salary Bonus.. Senior.Management            Team
## 1    Douglas   Male  97308   6.945              TRUE       Marketing
## 2     Thomas   Male  61933     NaN              TRUE            <NA>
## 3      Maria Female 130590  11.858             FALSE         Finance
## 4      Jerry   Male   <NA>    9.34              TRUE         Finance
## 5      Larry   Male 101004   1.389              TRUE Client Services
## 6     Dennis   n.a. 115163  10.125             FALSE           Legal

# Use `replace_with_na_if()` to replace with NA the character values using `is.character`
rp2 <- replace_with_na_if(employees,
                   .predicate = is.character, 
                   ~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp2 )

##   First.Name Gender Salary Bonus.. Senior.Management            Team
## 1    Douglas   Male  97308   6.945              TRUE       Marketing
## 2     Thomas   Male  61933    <NA>              TRUE            <NA>
## 3      Maria Female 130590  11.858             FALSE         Finance
## 4      Jerry   Male   <NA>    9.34              TRUE         Finance
## 5      Larry   Male 101004   1.389              TRUE Client Services
## 6     Dennis   n.a. 115163  10.125             FALSE           Legal

# Use `replace_with_na_all()` to replace with NA
rp3 <- replace_with_na_all(employees, condition = ~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp3 )

## # A tibble: 6 x 6
##   First.Name Gender Salary Bonus.. Senior.Management Team           
##   <chr>      <chr>  <chr>  <chr>   <chr>             <chr>          
## 1 Douglas    Male   97308  6.945   TRUE              Marketing      
## 2 Thomas     Male   61933  <NA>    TRUE              <NA>           
## 3 Maria      Female 130590 11.858  FALSE             Finance        
## 4 Jerry      Male   <NA>   9.34    TRUE              Finance        
## 5 Larry      Male   101004 1.389   TRUE              Client Services
## 6 Dennis     n.a.   115163 10.125  FALSE             Legal

Filling down missing values

Explicitly Missing: they are missing in the data and indicated with NA or something else.
Implicitly Missing: Not shown in the data, but implied (e.g. a missing level)

name <- c( 'jesse', 'jesse', 'jesse', 'jesse', 'andy',  'andy',  'andy',  'nic',   'nic',
           'dan',   'dan',   'alex', 'alex',  'alex',  'alex')
time <- c( 'morning', 'afternoon',  'evening', 'late_night', 'morning', 'afternoon',
           'late_night', 'afternoon', 'late_night', 'morning', 'evening', 'morning', 
           'afternoon',  'evening', 'late_night' )
value <- c(6678, 800060, 475528, 143533, 425115, 587468, 111000, 588532, 915533, 388148,
           180912, 552670,  98355, 266055, 121056)

frogger <- data.frame( 'name' = name, 'time' = time, 'value' = value )

glimpse( frogger )

## Rows: 15
## Columns: 3
## $ name  <chr> "jesse", "jesse", "jesse", "jesse", "andy", "andy", "andy", "nic…
## $ time  <chr> "morning", "afternoon", "evening", "late_night", "morning", "aft…
## $ value <dbl> 6678, 800060, 475528, 143533, 425115, 587468, 111000, 588532, 91…

# Use `complete()` on the `time` and `name` variables to  
# make implicit missing values explicit
frogger_tidy <- frogger %>% complete(time, name)
frogger_tidy

## # A tibble: 20 x 3
##    time       name   value
##    <chr>      <chr>  <dbl>
##  1 afternoon  alex   98355
##  2 afternoon  andy  587468
##  3 afternoon  dan       NA
##  4 afternoon  jesse 800060
##  5 afternoon  nic   588532
##  6 evening    alex  266055
##  7 evening    andy      NA
##  8 evening    dan   180912
##  9 evening    jesse 475528
## 10 evening    nic       NA
## 11 late_night alex  121056
## 12 late_night andy  111000
## 13 late_night dan       NA
## 14 late_night jesse 143533
## 15 late_night nic   915533
## 16 morning    alex  552670
## 17 morning    andy  425115
## 18 morning    dan   388148
## 19 morning    jesse   6678
## 20 morning    nic       NA

# Use `fill()` to fill down the name variable in the frogger dataset
frogger %>% tidyr::fill(name)

##     name       time  value
## 1  jesse    morning   6678
## 2  jesse  afternoon 800060
## 3  jesse    evening 475528
## 4  jesse late_night 143533
## 5   andy    morning 425115
## 6   andy  afternoon 587468
## 7   andy late_night 111000
## 8    nic  afternoon 588532
## 9    nic late_night 915533
## 10   dan    morning 388148
## 11   dan    evening 180912
## 12  alex    morning 552670
## 13  alex  afternoon  98355
## 14  alex    evening 266055
## 15  alex late_night 121056

frogger %>% 
  fill(name) %>%
  complete(name,time)

## # A tibble: 20 x 3
##    name  time        value
##    <chr> <chr>       <dbl>
##  1 alex  afternoon   98355
##  2 alex  evening    266055
##  3 alex  late_night 121056
##  4 alex  morning    552670
##  5 andy  afternoon  587468
##  6 andy  evening        NA
##  7 andy  late_night 111000
##  8 andy  morning    425115
##  9 dan   afternoon      NA
## 10 dan   evening    180912
## 11 dan   late_night     NA
## 12 dan   morning    388148
## 13 jesse afternoon  800060
## 14 jesse evening    475528
## 15 jesse late_night 143533
## 16 jesse morning      6678
## 17 nic   afternoon  588532
## 18 nic   evening        NA
## 19 nic   late_night 915533
## 20 nic   morning        NA

Missing data dependence

MCAR: Missing Completely at Random.
- missingness has no association with any data you have observed or not observed
- Imputation is advisable
- deleting observations may reduce the sample size, limiting inference, but will not bias.
MAR: Missing at Random
- missingness depends on data observed, but not data unobserved
- Should be imputing data
- deletion is not advisable and may lead to bias
MNAR: Missing Not at Random
- missingness of the response is related to an unobserved value relevant to the assessment of interest.
- data will be biased by deletion and imputation
- inference can be limited, proceed with caution

glimpse( oceanbuoys )

## Rows: 736
## Columns: 8
## $ year       <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997,…
## $ latitude   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude  <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -110,…
## $ sea_temp_c <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, 28.…
## $ air_temp_c <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, 27.…
## $ humidity   <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76.9,…
## $ wind_ew    <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3.6,…
## $ wind_ns    <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1.8,…

# Arrange by year
oceanbuoys %>% arrange(year) %>% vis_miss()

# Arrange by latitude
oceanbuoys %>% arrange(latitude) %>% vis_miss()

# Arrange by wind_ew (wind east west)
oceanbuoys %>% arrange(wind_ew) %>% vis_miss()

gg_miss_var( oceanbuoys, facet = year)

Testing missing relationships

Tools to explore missing data dependence

as_shadow() to explore missingness

Coordinated names: shadow matrix inherits feature labels the ’_NA’
Clear Values: binary missing or !missing

bind_shadow() or nabular() to bind the shadow mat with the data == nabular data (a mix of NA and tabular data). This format is useful to do things like calculate summary statistics based on the missingness of a feature

airquality %>%
  bind_shadow() %>%
  group_by( Ozone_NA ) %>%
  summarise( mean = mean( Wind ) )

## # A tibble: 2 x 2
##   Ozone_NA  mean
## * <fct>    <dbl>
## 1 !NA       9.86
## 2 NA       10.3

Create Nabular Data

# Create shadow matrix data with `as_shadow()`
obs <- as_shadow( oceanbuoys )
head( obs )

## # A tibble: 6 x 8
##   year_NA latitude_NA longitude_NA sea_temp_c_NA air_temp_c_NA humidity_NA
##   <fct>   <fct>       <fct>        <fct>         <fct>         <fct>      
## 1 !NA     !NA         !NA          !NA           !NA           !NA        
## 2 !NA     !NA         !NA          !NA           !NA           !NA        
## 3 !NA     !NA         !NA          !NA           !NA           !NA        
## 4 !NA     !NA         !NA          !NA           !NA           !NA        
## 5 !NA     !NA         !NA          !NA           !NA           !NA        
## 6 !NA     !NA         !NA          !NA           !NA           !NA        
## # … with 2 more variables: wind_ew_NA <fct>, wind_ns_NA <fct>

# Create nabular data by binding the shadow to the data with `bind_shadow()`
bob <- bind_shadow( oceanbuoys )
dim( bob )

## [1] 736  16

# Bind only the variables with missing values by using bind_shadow(only_miss = TRUE)
bob_om <- bind_shadow( oceanbuoys, only_miss = TRUE)
dim( bob_om )

## [1] 736  11

Use nabular data to calculate some summary statistics about other features:

# `bind_shadow()` and `group_by()` humidity missingness (`humidity_NA`)
oceanbuoys %>%
  bind_shadow() %>%
  group_by( humidity_NA ) %>% 
  summarize(wind_ew_mean = mean(wind_ew), # calculate mean of wind_ew
            wind_ew_sd = sd(wind_ew)) # calculate standard deviation of wind_ew

## # A tibble: 2 x 3
##   humidity_NA wind_ew_mean wind_ew_sd
## * <fct>              <dbl>      <dbl>
## 1 !NA                -3.78       1.90
## 2 NA                 -3.30       2.31

# Repeat this, but calculating summaries for wind north south (`wind_ns`).
oceanbuoys %>%
  bind_shadow() %>%
  group_by(humidity_NA) %>%
  summarize(wind_ns_mean = mean(wind_ns),
            wind_ns_sd = sd(wind_ns))

## # A tibble: 2 x 3
##   humidity_NA wind_ns_mean wind_ns_sd
## * <fct>              <dbl>      <dbl>
## 1 !NA                 2.78       2.06
## 2 NA                  1.66       2.23

Add information about to summarize the missingness of a dataset

# How many NAs are in a feature?

# Summarize wind_ew by the missingness of `air_temp_c_NA`
oceanbuoys %>% 
  bind_shadow() %>%
  group_by(air_temp_c_NA) %>%
  summarize(wind_ew_mean = mean(wind_ew),
            wind_ew_sd = sd(wind_ew),
            n_obs = n())

## # A tibble: 2 x 4
##   air_temp_c_NA wind_ew_mean wind_ew_sd n_obs
## * <fct>                <dbl>      <dbl> <int>
## 1 !NA                  -3.91       1.85   655
## 2 NA                   -2.17       2.14    81

# Summarize wind_ew by missingness of `air_temp_c_NA` and `humidity_NA`
oceanbuoys %>% 
  bind_shadow() %>%
  group_by(air_temp_c_NA, humidity_NA) %>%
  summarize(wind_ew_mean = mean(wind_ew),
            wind_ew_sd = sd(wind_ew),
            n_obs = n())

## `summarise()` has grouped output by 'air_temp_c_NA'. You can override using the `.groups` argument.

## # A tibble: 4 x 5
## # Groups:   air_temp_c_NA [2]
##   air_temp_c_NA humidity_NA wind_ew_mean wind_ew_sd n_obs
##   <fct>         <fct>              <dbl>      <dbl> <int>
## 1 !NA           !NA                -4.01       1.74   565
## 2 !NA           NA                 -3.24       2.31    90
## 3 NA            !NA                -2.06       2.08    78
## 4 NA            NA                 -4.97       1.74     3

Visualizing missingness across one variable

Exploring conditional missings w/ggplot

How to use nabular data to explore how values change according to other values going missing
ggplot2 visualizations:
- density plots
- box plots
- etc.

ggplot( airquality,
        aes( x = Temp ) ) +
  geom_density()

Create nabular data:

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Temp,
               color = Ozone_NA ) ) +
  geom_density()

The values of Temperature do not change much when data for Ozone are present or NA

Here is a feceted versions:

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Temp ) ) +
  geom_density() +
  facet_wrap( ~Ozone_NA )

Another look with facetted scatter plots. This gives an idea of how sparce NA data is compared to when the feature is present.

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Temp,
               y = Wind ) ) +
  geom_point() +
  facet_wrap( ~Ozone_NA )

Can make the same point, perhaps more obvious, with a box plot.

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Ozone_NA,
               y = Temp ) ) +
  geom_boxplot()

This shows how close the medians of the two distributions are.

Visualizing missingness with color:

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Temp,
               y = Wind,
               color = Ozone_NA ) ) +
  geom_point()

Visualize the missingness of two features

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Temp,
               color = Ozone_NA ) ) +
  geom_density() +
  facet_wrap( ~ Solar.R_NA )

There doesn’t appear to much much difference in the distributions of Temperature when Solar.R info is in a given record. However, when Solar.R is missing, the temperatures are low.

Now to take a look at oceanbuoys

# First explore the missingness structure of `oceanbuoys` using `vis_miss()`
vmob <- vis_miss(oceanbuoys) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# Explore the distribution of `wind_ew` for the missingness  
# of `air_temp_c_NA` using  `geom_density()`
bsob <- bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = wind_ew, 
             color = air_temp_c_NA)) + 
  geom_density()

# Explore the distribution of sea temperature for the  
# missingness of humidity (humidity_NA) using  `geom_density()`
bsob2 <- bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = sea_temp_c,
             color = humidity_NA)) + 
  geom_density()

grid.arrange( vmob, bsob, bsob2, ncol = 1 )

## Warning: Removed 3 rows containing non-finite values (stat_density).

# Explore the distribution of wind east west (wind_ew) for the missingness of air temperature 
# using geom_density() and faceting by the missingness of air temperature (air_temp_c_NA).
ob1 <- oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = wind_ew)) + 
  geom_density() + 
  facet_wrap(~air_temp_c_NA)

# Build upon this visualization by coloring by the missingness of humidity (humidity_NA).
ob2 <- oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = wind_ew,
             color = humidity_NA)) + 
  geom_density() + 
  facet_wrap(~air_temp_c_NA)

grid.arrange( ob1, ob2, ncol = 1 )

# Explore the distribution of wind east west (`wind_ew`) for  
# the missingness of air temperature using  `geom_boxplot()`
ob1 <- oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = air_temp_c_NA,
             y = wind_ew)) + 
  geom_boxplot()

# Build upon this visualization by faceting by the missingness of humidity (`humidity_NA`).
ob2 <- oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = air_temp_c_NA,
             y = wind_ew)) + 
  geom_boxplot() + 
  facet_wrap(~humidity_NA)

grid.arrange( ob1, ob2, ncol = 1 )

Visualizing missingness across two variables

The problem with visualizing missing data in 2D (e.g. scatterplot) is that rows with missing values are removed. ggplot2 is very kind to give a warning when rows are dropped:

ggplot( airquality,
        aes( x = Ozone,
             y = Solar.R ) ) +
  geom_point()

## Warning: Removed 42 rows containing missing values (geom_point).

geom_miss_point() visualizes missing data by placing them in the margins of a figure

ggplot( airquality,
        aes( x = Ozone,
             y = Solar.R ) ) +
  geom_miss_point()

ggplot( airquality,
        aes( x = Wind,
             y = Ozone ) ) +
  geom_miss_point() +
  facet_wrap( ~ Month )

Visualize missingness with another missing variable

airquality %>%
  bind_shadow() %>%
  ggplot( aes( x = Wind,
               y = Ozone ) ) +
  geom_miss_point() +
  facet_wrap( ~ Solar.R_NA )

# Explore the missingness in wind and air temperature, and  
# display the missingness using `geom_miss_point()`
ob1 <- ggplot(oceanbuoys,
       aes(x = wind_ew,
           y = air_temp_c)) + 
  geom_miss_point()

# Explore the missingness in humidity and air temperature,  
# and display the missingness using `geom_miss_point()`
ob2 <- ggplot(oceanbuoys,
       aes(x = humidity,
           y = air_temp_c)) + 
  geom_miss_point()

grid.arrange( ob1, ob2, ncol = 2 )

# Explore the missingness in wind and air temperature, and display the 
# missingness using `geom_miss_point()`. Facet by year to explore this further.
ob1 <- ggplot(oceanbuoys,
       aes(x = wind_ew,
           y = air_temp_c)) + 
  geom_miss_point() + 
  facet_wrap(~year)

# Explore the missingness in humidity and air temperature, and display the 
# missingness using `geom_miss_point()` Facet by year to explore this further.
ob2 <- ggplot(oceanbuoys,
       aes(x = humidity,
           y = air_temp_c)) + 
  geom_miss_point() + 
  facet_wrap(~year)

grid.arrange( ob1, ob2, ncol = 1 )

# Use geom_miss_point() and facet_wrap to explore how the missingness  
# in wind_ew and air_temp_c is different for missingness of humidity
bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = wind_ew,
           y = air_temp_c)) + 
  geom_miss_point() + 
  facet_wrap(~humidity_NA)

# Use geom_miss_point() and facet_grid to explore how the missingness in wind_ew and air_temp_c 
# is different for missingness of humidity AND by year - by using `facet_grid(humidity_NA ~ year)`
bind_shadow(oceanbuoys) %>%
  ggplot(aes(x = wind_ew,
             y = air_temp_c)) + 
  geom_miss_point() + 
  facet_grid(humidity_NA~year)

Connecting the dots (Imputation)

Filling in the blanks

Performing and tracking imputation

Using imputations to understand data structure. Visualizing and exploring imputed values

Imputing data to explore missingness
tracking missing values
visualize imputed values against data

impute_below() imputes below to minimum vaue in the variable

impute_below( c( 5,6,7,NA,9,10 ) )

## [1]  5.00000  6.00000  7.00000  4.40271  9.00000 10.00000

impute below to satisfy a conditional

summary( pedestrian )

##  hourly_counts       date_time                        year          month      
##  Min.   :    0.0   Min.   :2016-01-01 00:00:00   Min.   :2016   October: 5540  
##  1st Qu.:   72.0   1st Qu.:2016-04-08 04:00:00   1st Qu.:2016   January: 2976  
##  Median :  277.0   Median :2016-07-15 08:00:00   Median :2016   March  : 2976  
##  Mean   :  701.8   Mean   :2016-07-09 04:46:33   Mean   :2016   May    : 2976  
##  3rd Qu.:  878.0   3rd Qu.:2016-10-11 21:00:00   3rd Qu.:2016   July   : 2976  
##  Max.   :11273.0   Max.   :2016-12-31 23:00:00   Max.   :2016   August : 2976  
##  NA's   :2548                                                   (Other):17280  
##    month_day          week_day         hour        sensor_id    
##  Min.   : 1.00   Sunday   :5396   Min.   : 0.0   Min.   : 2.00  
##  1st Qu.: 8.00   Monday   :5376   1st Qu.: 6.0   1st Qu.: 2.00  
##  Median :16.00   Tuesday  :5328   Median :12.0   Median : 7.00  
##  Mean   :15.75   Wednesday:5328   Mean   :11.5   Mean   :11.15  
##  3rd Qu.:23.00   Thursday :5352   3rd Qu.:18.0   3rd Qu.:13.00  
##  Max.   :31.00   Friday   :5424   Max.   :23.0   Max.   :23.00  
##                  Saturday :5496                                 
##  sensor_name       
##  Length:37700      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

pedestrian_imp <- impute_below_if( pedestrian, is.numeric )
summary( pedestrian_imp )

##  hourly_counts       date_time                        year          month      
##  Min.   :-1409.0   Min.   :2016-01-01 00:00:00   Min.   :2016   October: 5540  
##  1st Qu.:   44.0   1st Qu.:2016-04-08 04:00:00   1st Qu.:2016   January: 2976  
##  Median :  243.0   Median :2016-07-15 08:00:00   Median :2016   March  : 2976  
##  Mean   :  578.1   Mean   :2016-07-09 04:46:33   Mean   :2016   May    : 2976  
##  3rd Qu.:  804.0   3rd Qu.:2016-10-11 21:00:00   3rd Qu.:2016   July   : 2976  
##  Max.   :11273.0   Max.   :2016-12-31 23:00:00   Max.   :2016   August : 2976  
##                                                                 (Other):17280  
##    month_day          week_day         hour        sensor_id    
##  Min.   : 1.00   Sunday   :5396   Min.   : 0.0   Min.   : 2.00  
##  1st Qu.: 8.00   Monday   :5376   1st Qu.: 6.0   1st Qu.: 2.00  
##  Median :16.00   Tuesday  :5328   Median :12.0   Median : 7.00  
##  Mean   :15.75   Wednesday:5328   Mean   :11.5   Mean   :11.15  
##  3rd Qu.:23.00   Thursday :5352   3rd Qu.:18.0   3rd Qu.:13.00  
##  Max.   :31.00   Friday   :5424   Max.   :23.0   Max.   :23.00  
##                  Saturday :5496                                 
##  sensor_name       
##  Length:37700      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Specify features to be imputed

glimpse( oceanbuoys )

## Rows: 736
## Columns: 8
## $ year       <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997,…
## $ latitude   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude  <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -110,…
## $ sea_temp_c <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, 28.…
## $ air_temp_c <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, 27.…
## $ humidity   <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76.9,…
## $ wind_ew    <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3.6,…
## $ wind_ns    <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1.8,…

miss_var_summary( oceanbuoys )

## # A tibble: 8 x 3
##   variable   n_miss pct_miss
##   <chr>       <int>    <dbl>
## 1 humidity       93   12.6  
## 2 air_temp_c     81   11.0  
## 3 sea_temp_c      3    0.408
## 4 year            0    0    
## 5 latitude        0    0    
## 6 longitude       0    0    
## 7 wind_ew         0    0    
## 8 wind_ns         0    0

oceanbuoys_imp <- impute_below_at( oceanbuoys, vars( humidity, air_temp_c, sea_temp_c ) )
miss_var_summary( oceanbuoys_imp )

## # A tibble: 8 x 3
##   variable   n_miss pct_miss
##   <chr>       <int>    <dbl>
## 1 year            0        0
## 2 latitude        0        0
## 3 longitude       0        0
## 4 sea_temp_c      0        0
## 5 air_temp_c      0        0
## 6 humidity        0        0
## 7 wind_ew         0        0
## 8 wind_ns         0        0

glimpse( dat_hw )

## Rows: 100
## Columns: 2
## $ weight <dbl> NA, 91.20470, 81.57915, 76.84886, 111.01731, 90.15135, 63.14240…
## $ height <dbl> 2.3881462, 1.0014508, NA, NA, -0.2412422, 2.5207375, 1.4016896,…

dat_hw_imp <- impute_below_all( dat_hw )

ob1 <- ggplot(dat_hw_imp,
       aes(x = weight,
           y = height)) + 
  geom_point()

ob1

Tracking missing values can be handles by using bind_shadows():

dat_hw_imp <- bind_shadow( dat_hw ) %>% impute_below_all()
head( dat_hw_imp )

## # A tibble: 6 x 4
##   weight height weight_NA height_NA
##    <dbl>  <dbl> <fct>     <fct>    
## 1   40.3  2.39  NA        !NA      
## 2   91.2  1.00  !NA       !NA      
## 3   81.6 -1.51  !NA       NA       
## 4   76.8 -1.65  !NA       NA       
## 5  111.  -0.241 !NA       !NA      
## 6   90.2  2.52  !NA       !NA

aq_imp <- airquality %>%
  bind_shadow() %>%
  impute_below_all() %>%
  ggplot( aes( x = Ozone,
               fill = Ozone_NA ) ) +
  geom_histogram()
aq_imp

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Split the plot by missingness:

aq_imp <- airquality %>%
  bind_shadow() %>%
  impute_below_all() %>%
  ggplot( aes( x = Ozone,
               fill = Ozone_NA ) ) +
  geom_histogram() +
  facet_wrap( ~ Solar.R_NA )
aq_imp

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualize imputed value against data values using scatter plots:

aq_imp <- airquality %>%
  bind_shadow() %>%
  add_label_shadow() %>%
  impute_below_all() %>%
  ggplot( aes( x = Ozone,
               y = Solar.R,
               color = any_missing ) ) +
  geom_point()
aq_imp

This successfully recreates the figure rendered by a call to geom_miss_point()

# Impute the oceanbuoys data below the range using `impute_below`.
ocean_imp <- impute_below_all(oceanbuoys)

# Visualize the new missing values
ggplot(ocean_imp, 
       aes(x = wind_ew, y = air_temp_c)) +  
  geom_point()

# Impute and track data with `bind_shadow`, `impute_below_all`, and `add_label_shadow`
ocean_imp_track <-  bind_shadow(oceanbuoys) %>% impute_below_all() %>% add_label_shadow()

# Look at the imputed values
ggplot(ocean_imp_track, aes(x = wind_ew, y = air_temp_c, color = any_missing ) ) +
geom_point()

ocean_imp_track

## # A tibble: 736 x 17
##     year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##    <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
##  1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
##  2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
##  3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
##  4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
##  5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
##  6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
##  7  1997        0      -110       28.0       27.0     76.5   -2       3.5 
##  8  1997        0      -110       28.0       27.1     78.3   -3.70    4.5 
##  9  1997        0      -110       28.0       27.2     78.6   -4.20    5   
## 10  1997        0      -110       28.0       27.2     76.9   -3.60    3.5 
## # … with 726 more rows, and 9 more variables: year_NA <fct>, latitude_NA <fct>,
## #   longitude_NA <fct>, sea_temp_c_NA <fct>, air_temp_c_NA <fct>,
## #   humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>, any_missing <chr>

# Impute and track the missing values
ocean_imp_track <- bind_shadow(oceanbuoys) %>% 
  impute_below_all() %>% 
  add_label_shadow()

# Visualize the missingness in wind and air temperature,  
# coloring missing air temp values with air_temp_c_NA
ggplot(ocean_imp_track, 
       aes(x = wind_ew, y = air_temp_c, color = air_temp_c_NA)) + 
  geom_point()

# Visualize humidity and air temp, coloring any missing cases using the variable any_missing
ggplot(ocean_imp_track, 
       aes(x = humidity, y = air_temp_c, color = any_missing)) +  
  geom_point()

# Explore the values of air_temp_c, visualizing the amount of missings with `air_temp_c_NA`.
p <- ggplot(ocean_imp_track, aes(x = air_temp_c, fill = air_temp_c_NA)) +  geom_histogram()

# Expore the missings in humidity using humidity_NA
p2 <- ggplot(ocean_imp_track,  aes(x = humidity, fill = humidity_NA)) + geom_histogram()

# Explore the missings in air_temp_c according to year, using `facet_wrap(~year)`.
p + facet_wrap(~year)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Explore the missings in humidity according to year, using `facet_wrap(~year)`.
p2 + facet_wrap(~year)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What makes good imputation?

To understand good imputations, let’s spend some time taking a look at bad imputations:
Imputation by the mean value of the data is particularly bad.

Imputing by the mean is bad because it artifucially increases the mean while decreasing the variance of the dataset as the following excersizes demonstrate:

aq_imp <- airquality %>%
  bind_shadow(only_miss = TRUE) %>% #bind only features with missing values
  add_label_shadow() %>%
  impute_mean_all() %>%
  ggplot( aes( x = Ozone_NA,
               y = Ozone ) ) +
  geom_boxplot()

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

aq_imp

The median is lower for the ‘not missing’ group than the NA group.

aq_imp <- airquality %>%
  bind_shadow(only_miss = TRUE) %>% #bind only features with missing values
  add_label_shadow() %>%
  impute_mean_all() %>%
  ggplot( aes( x = Ozone,
               y = Solar.R,
               color = any_missing ) ) +
  geom_point()

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

aq_imp

Exploring imputations for many variables:

aq_imp <- airquality %>%
  bind_shadow() %>% #bind only features with missing values
  impute_mean_all()

aq_imp_long <- shadow_long( aq_imp,
                            Ozone,
                            Solar.R )
head( aq_imp_long )

## # A tibble: 6 x 4
##   variable value variable_NA value_NA
##   <chr>    <dbl> <chr>       <chr>   
## 1 Ozone     41   Ozone_NA    !NA     
## 2 Ozone     36   Ozone_NA    !NA     
## 3 Ozone     12   Ozone_NA    !NA     
## 4 Ozone     18   Ozone_NA    !NA     
## 5 Ozone     42.1 Ozone_NA    NA      
## 6 Ozone     28   Ozone_NA    !NA

…and now to visualize:

ggplot( aq_imp_long,
        aes( x = value,
             fill = value_NA ) ) +
  geom_histogram() +
  facet_wrap( ~ variable )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Impute the mean value and track the imputations 
ocean_imp_mean <- bind_shadow(oceanbuoys) %>% 
  impute_mean_all() %>% 
  add_label_shadow()

# Explore the mean values in humidity in the imputed dataset
ggplot(ocean_imp_mean, 
       aes(x = humidity_NA, y = humidity)) + 
  geom_boxplot()

# Explore the values in air temperature in the imputed dataset
ggplot(ocean_imp_mean, 
       aes(x = air_temp_c_NA, y = air_temp_c)) + 
  geom_boxplot()

# Explore imputations in air temperature and humidity,  
# coloring by the variable, any_missing
ggplot(ocean_imp_mean, 
       aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point()

# Explore imputations in air temperature and humidity,  
# coloring by the variable, any_missing, and faceting by year
ggplot(ocean_imp_mean, 
       aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point() +  
  facet_wrap(~year)

# Gather the imputed data 
ocean_imp_mean_gather <- shadow_long(ocean_imp_mean,
                                     humidity,
                                     air_temp_c)
# Inspect the data
head( ocean_imp_mean_gather )

## # A tibble: 6 x 4
##   variable   value       variable_NA   value_NA
##   <chr>      <chr>       <chr>         <chr>   
## 1 air_temp_c 27.14999962 air_temp_c_NA !NA     
## 2 air_temp_c 27.02000046 air_temp_c_NA !NA     
## 3 air_temp_c 27          air_temp_c_NA !NA     
## 4 air_temp_c 26.93000031 air_temp_c_NA !NA     
## 5 air_temp_c 26.84000015 air_temp_c_NA !NA     
## 6 air_temp_c 26.94000053 air_temp_c_NA !NA

# Explore the imputations in a histogram 

ggplot( ocean_imp_mean_gather,
        aes( x = as.numeric(value),
             fill = value_NA ) ) +
  geom_histogram() +
  facet_wrap( ~variable )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Performing imputations

Imputing using a linear regression model.
Try simputation
Using impute_lm from simputation is a powerfl way to impute values for a dataset. However, the model used for imputation needs the same regorous evaluation that statistical model approaches demand.

oceanbuoys_lmimp <- bind_shadow( oceanbuoys, only_miss = TRUE ) %>%
  add_label_shadow() %>%
  impute_lm( humidity ~ air_temp_c + wind_ew )
head( oceanbuoys_lmimp )

## # A tibble: 6 x 12
##    year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
##   <dbl>    <dbl>     <dbl>      <dbl>      <dbl>    <dbl>   <dbl>   <dbl>
## 1  1997        0      -110       27.6       27.1     79.6   -6.40    5.40
## 2  1997        0      -110       27.5       27.0     75.8   -5.30    5.30
## 3  1997        0      -110       27.6       27       76.5   -5.10    4.5 
## 4  1997        0      -110       27.6       26.9     76.2   -4.90    2.5 
## 5  1997        0      -110       27.6       26.8     76.4   -3.5     4.10
## 6  1997        0      -110       27.8       26.9     76.7   -4.40    1.60
## # … with 4 more variables: sea_temp_c_NA <fct>, air_temp_c_NA <fct>,
## #   humidity_NA <fct>, any_missing <chr>

airquality_type <- airquality %>% mutate( Solar.R = as.double( Solar.R ),
                                           Ozone = as.double( Ozone ) )
aq_imp_lm <- airquality_type %>% 
  bind_shadow() %>%
  add_label_shadow() %>%
  impute_lm( Solar.R ~ Wind + Temp + Month ) %>%
  impute_lm( Ozone ~ Wind + Temp + Month )
head( aq_imp_lm )

## # A tibble: 6 x 13
##   Ozone Solar.R  Wind  Temp Month   Day Ozone_NA Solar.R_NA Wind_NA Temp_NA
##   <dbl>   <dbl> <dbl> <int> <int> <int> <fct>    <fct>      <fct>   <fct>  
## 1 41       190    7.4    67     5     1 !NA      !NA        !NA     !NA    
## 2 36       118    8      72     5     2 !NA      !NA        !NA     !NA    
## 3 12       149   12.6    74     5     3 !NA      !NA        !NA     !NA    
## 4 18       313   11.5    62     5     4 !NA      !NA        !NA     !NA    
## 5 -9.04    138.  14.3    56     5     5 NA       NA         !NA     !NA    
## 6 28       178.  14.9    66     5     6 !NA      NA         !NA     !NA    
## # … with 3 more variables: Month_NA <fct>, Day_NA <fct>, any_missing <chr>

The bind_shadow() & add_labels_missings() methods are important for tracking which values were imputed. The imputed values for the last example are visualized below:

ggplot( aq_imp_lm,
        aes( x = Solar.R,
             y = Ozone,
             color = any_missing ) ) +
  geom_point()

Build other linear model imputations and compare the results:

aq_imp_lm_small <- airquality_type %>% 
  bind_shadow() %>%
  add_label_shadow() %>%
  impute_lm( Solar.R ~ Wind + Temp ) %>%
  impute_lm( Ozone ~ Wind + Temp )

aq_imp_lm_large <- airquality_type %>% 
  bind_shadow() %>%
  add_label_shadow() %>%
  impute_lm( Solar.R ~ Wind + Temp + Month + Day ) %>%
  impute_lm( Ozone ~ Wind + Temp + Month + Day )

Bind the models, so that we can build visualizations:

bound_models <- bind_rows( small = aq_imp_lm_small,
                           medium = aq_imp_lm,
                           large = aq_imp_lm_large,
                           .id = 'imp_model' )
head( bound_models )

## # A tibble: 6 x 14
##   imp_model Ozone Solar.R  Wind  Temp Month   Day Ozone_NA Solar.R_NA Wind_NA
##   <chr>     <dbl>   <dbl> <dbl> <int> <int> <int> <fct>    <fct>      <fct>  
## 1 small      41      190    7.4    67     5     1 !NA      !NA        !NA    
## 2 small      36      118    8      72     5     2 !NA      !NA        !NA    
## 3 small      12      149   12.6    74     5     3 !NA      !NA        !NA    
## 4 small      18      313   11.5    62     5     4 !NA      !NA        !NA    
## 5 small     -11.7    127.  14.3    56     5     5 NA       NA         !NA    
## 6 small      28      160.  14.9    66     5     6 !NA      NA         !NA    
## # … with 4 more variables: Temp_NA <fct>, Month_NA <fct>, Day_NA <fct>,
## #   any_missing <chr>

bound_models_gather <- bound_models %>%
  select( Ozone, Solar.R, any_missing, imp_model ) %>%
  gather( key = 'variable', value = 'value', -any_missing, -imp_model )
head( bound_models_gather )

## # A tibble: 6 x 4
##   any_missing imp_model variable value
##   <chr>       <chr>     <chr>    <dbl>
## 1 Not Missing small     Ozone     41  
## 2 Not Missing small     Ozone     36  
## 3 Not Missing small     Ozone     12  
## 4 Not Missing small     Ozone     18  
## 5 Missing     small     Ozone    -11.7
## 6 Missing     small     Ozone     28

ggplot( bound_models_gather,
        aes( x = imp_model,
             y = value,
             color = imp_model ) ) +
  geom_boxplot() +
  facet_wrap( ~variable )

# Impute humidity and air temperature using wind_ew and wind_ns, and track missing values
ocean_imp_lm_wind <- oceanbuoys %>% 
    bind_shadow() %>%
    impute_lm(air_temp_c ~ wind_ew + wind_ns) %>% 
    impute_lm(humidity ~ wind_ew + wind_ns) %>%
    add_label_shadow()
    
# Plot the imputed values for air_temp_c and humidity, colored by missingness
ggplot(ocean_imp_lm_wind, 
       aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point()

# Bind the models together 
bound_models <- bind_rows(mean = ocean_imp_mean,
                          lm_wind = ocean_imp_lm_wind,
                          .id = "imp_model")

# Inspect the values of air_temp and humidity as a scatter plot
ggplot(bound_models, 
       aes(x = air_temp_c, 
           y = humidity, 
           color = any_missing)) +
  geom_point() + 
  facet_wrap(~imp_model)

# Build a model adding year to the outcome
ocean_imp_lm_wind_year <- bind_shadow(oceanbuoys) %>%
  impute_lm(air_temp_c ~ wind_ew + wind_ns + year) %>%
  impute_lm(humidity ~ wind_ew + wind_ns + year) %>%
  add_label_shadow()

# Bind the mean, lm_wind, and lm_wind_year models together
bound_models <- bind_rows(mean = ocean_imp_mean,
                          lm_wind = ocean_imp_lm_wind,
                          lm_wind_year = ocean_imp_lm_wind_year,
                          .id = "imp_model")
bound_models$imp_model_f <- factor( bound_models$imp_model, levels = c('mean','lm_wind','lm_wind_year'))

# Explore air_temp and humidity, coloring by any missings, and faceting by imputation model
ggplot(bound_models, aes(x = air_temp_c, y = humidity, color = any_missing)) + 
  geom_point() + facet_wrap(~imp_model_f)

Evaluating imputations and models

Assessing inference from imputed data in a modelling context.

Compare the imputated data with a Complete case analysis (only uses rows with no missing values)

#Complete Case Analysis.
aq_cc <- airquality %>%
  na.omit() %>%
  bind_shadow() %>%
  add_label_shadow()
#dim( aq_cc )

#Impute the data with a linear model
aq_imp_lm <- bind_shadow( airquality_type ) %>%
  add_label_shadow() %>%
  impute_lm( Ozone ~ Temp + Wind + Month + Day ) %>%
  impute_lm( Solar.R ~ Temp + Wind + Month + Day )
#dim( aq_imp_lm )

#Bind the different datasets together
bound_models <- bind_rows( cc = aq_cc,
                           imp_lm = aq_imp_lm,
                           .id = 'imp_model' )
head( bound_models )

## # A tibble: 6 x 14
##   imp_model Ozone Solar.R  Wind  Temp Month   Day Ozone_NA Solar.R_NA Wind_NA
##   <chr>     <dbl>   <dbl> <dbl> <int> <int> <int> <fct>    <fct>      <fct>  
## 1 cc           41     190   7.4    67     5     1 !NA      !NA        !NA    
## 2 cc           36     118   8      72     5     2 !NA      !NA        !NA    
## 3 cc           12     149  12.6    74     5     3 !NA      !NA        !NA    
## 4 cc           18     313  11.5    62     5     4 !NA      !NA        !NA    
## 5 cc           23     299   8.6    65     5     7 !NA      !NA        !NA    
## 6 cc           19      99  13.8    59     5     8 !NA      !NA        !NA    
## # … with 4 more variables: Temp_NA <fct>, Month_NA <fct>, Day_NA <fct>,
## #   any_missing <chr>

Now that the data is formatted, fit a linear model to each of the datasets

model_summary <- bound_models %>%
  group_by( imp_model ) %>%
  nest() %>% #colapses the data such that each row represents a dataset
  mutate( mod = map( data,
                     ~lm( Temp ~ Ozone + Solar.R + Wind + Temp + Day + Month,
                          data = . ) ), #fit a linear model to each row
          res = map( mod, residuals ), #get the residuals
          pred = map( mod, predict ), #get a model prediction
          tidy = map( mod, broom::tidy ) ) #get the coefficients too
model_summary

## # A tibble: 2 x 6
## # Groups:   imp_model [2]
##   imp_model data                mod    res         pred        tidy            
##   <chr>     <list>              <list> <list>      <list>      <list>          
## 1 cc        <tibble [111 × 13]> <lm>   <dbl [111]> <dbl [111]> <tibble [6 × 5]>
## 2 imp_lm    <tibble [153 × 13]> <lm>   <dbl [153]> <dbl [153]> <tibble [6 × 5]>

Explore the results from both approaches to fit a linear model to the data (with & w/out imputation)

model_summary %>%
  select( imp_model,
          tidy ) %>%
  unnest(cols = c( tidy ) )

## # A tibble: 12 x 6
## # Groups:   imp_model [2]
##    imp_model term        estimate std.error statistic  p.value
##    <chr>     <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 cc        (Intercept) 57.3       4.50      12.7    5.52e-23
##  2 cc        Ozone        0.165     0.0239     6.92   3.66e-10
##  3 cc        Solar.R      0.0108    0.00699    1.55   1.24e- 1
##  4 cc        Wind        -0.174     0.212     -0.821  4.13e- 1
##  5 cc        Day         -0.0892    0.0677    -1.32   1.91e- 1
##  6 cc        Month        2.04      0.409      4.99   2.42e- 6
##  7 imp_lm    (Intercept) 54.7       3.59      15.2    5.21e-32
##  8 imp_lm    Ozone        0.196     0.0205     9.53   4.52e-17
##  9 imp_lm    Solar.R      0.0102    0.00577    1.76   7.97e- 2
## 10 imp_lm    Wind        -0.00642   0.172     -0.0374 9.70e- 1
## 11 imp_lm    Day         -0.112     0.0538    -2.08   3.92e- 2
## 12 imp_lm    Month        2.11      0.340      6.21   5.09e- 9

model_summary %>%
  select( imp_model,
          res ) %>%
  unnest(cols = c( res ) ) %>%
  ggplot( aes( x = res,
               fill = imp_model ) ) +
  geom_histogram( position = 'dodge' )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Explore the predictions in the data

model_summary %>%
  select( imp_model,
          pred ) %>%
  unnest(cols = c( pred ) ) %>%
  ggplot( aes( x = pred,
               fill = imp_model ) ) +
  geom_histogram( position = 'dodge' )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

and for the oceanbouys dataset

#Complete Case Analysis.
ocean_cc <- oceanbuoys %>%
  na.omit() %>%
  bind_shadow() %>%
  add_label_shadow()

# Create an imputed dataset using a linear models
ocean_imp_lm_all <- bind_shadow(oceanbuoys) %>%
  add_label_shadow() %>%
  impute_lm(sea_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
  impute_lm(air_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
  impute_lm(humidity ~ wind_ew + wind_ns + year + latitude + longitude)

# Bind the datasets
bound_models <- bind_rows(cc = ocean_cc,
                          imp_lm_wind = ocean_imp_lm_wind,
                          imp_lm_all = ocean_imp_lm_all,
                          .id = "imp_model")
# Look at the models
glimpse( bound_models )

## Rows: 2,037
## Columns: 18
## $ imp_model     <chr> "cc", "cc", "cc", "cc", "cc", "cc", "cc", "cc", "cc", "c…
## $ year          <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 19…
## $ latitude      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude     <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -1…
## $ sea_temp_c    <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, …
## $ air_temp_c    <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, …
## $ humidity      <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76…
## $ wind_ew       <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3…
## $ wind_ns       <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1…
## $ year_NA       <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ latitude_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ longitude_NA  <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ sea_temp_c_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ air_temp_c_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ humidity_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ wind_ew_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ wind_ns_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ any_missing   <chr> "Not Missing", "Not Missing", "Not Missing", "Not Missin…

# Create the model summary for each dataset
model_summary <- bound_models %>% 
  group_by(imp_model) %>%
  nest() %>%
  mutate(mod = map(data, ~lm(sea_temp_c ~ air_temp_c + humidity + year, data = .)),
         res = map(mod, residuals),
         pred = map(mod, predict),
         tidy = map(mod, broom::tidy ))

# Explore the coefficients in the model
model_summary %>% 
    select(imp_model,tidy) %>% 
    unnest(cols = c( tidy ))

## # A tibble: 12 x 6
## # Groups:   imp_model [3]
##    imp_model   term          estimate std.error statistic   p.value
##    <chr>       <chr>            <dbl>     <dbl>     <dbl>     <dbl>
##  1 cc          (Intercept)  -735.      45.9        -16.0  8.19e- 48
##  2 cc          air_temp_c      0.864    0.0231      37.4  2.64e-154
##  3 cc          humidity        0.0341   0.00390      8.74 2.69e- 17
##  4 cc          year            0.369    0.0232      15.9  3.46e- 47
##  5 imp_lm_wind (Intercept) -1742.      56.1        -31.0  1.83e-135
##  6 imp_lm_wind air_temp_c      0.365    0.0279      13.1  2.73e- 35
##  7 imp_lm_wind humidity        0.0225   0.00690      3.26 1.17e-  3
##  8 imp_lm_wind year            0.880    0.0283      31.1  6.79e-136
##  9 imp_lm_all  (Intercept)  -697.      51.8        -13.5  5.04e- 37
## 10 imp_lm_all  air_temp_c      0.890    0.0255      35.0  2.90e-158
## 11 imp_lm_all  humidity        0.0127   0.00463      2.75 6.03e-  3
## 12 imp_lm_all  year            0.351    0.0262      13.4  1.12e- 36

The imp_lm_all model gives the highest estimate for air_temp_c

Final Lesson

Some Other Datasets to play with

ozoneNA_url <- 'https://raw.githubusercontent.com/njtierney/user2018-missing-data-tutorial/master/ozoneNA.csv'
ecological_url <- 'https://raw.githubusercontent.com/njtierney/user2018-missing-data-tutorial/master/ecological.csv'

Dealing with Missing Data in R

DataCamp: Statistics with R