library( dplyr )
library( ggplot2 )
library( gridExtra )
library( tidyverse )
library( naniar )
library( simputation )Why care about missing data?
Introduction to missing data
“The best thing to do with missing data is to not have any.” -Gertrude M. Cox
- Working with real-world data == working with missing data
- Missing Data can have unexpected effects on your analysis
- Bad imputation can lead to poor estimates and decisions
Checking for missing values with any_na()
x <- c( 1, NA, 3, NA, NA, 5 )
baset0 <- Sys.time()
any( is.na( x ) )## [1] TRUE
timeint_base <- Sys.time() - baset0
naniart0 <- Sys.time()
any_na( x )## [1] TRUE
timeint_naniar <- Sys.time() - naniart0
base2t0 <- Sys.time()
anyNA( x )## [1] TRUE
timeint_base2 <- Sys.time() - base2t0
res <- paste( 'Comparing runtime for methods to detect NA:\nbase R runtime:', timeint_base,
'\nanyNA() base r runtime', timeint_base2,
'\nany_na() naniar runtime:', timeint_naniar)
cat( res, sep = '\n')## Comparing runtime for methods to detect NA:
## base R runtime: 0.00213027000427246
## anyNA() base r runtime 0.0020744800567627
## any_na() naniar runtime: 0.00191235542297363
#return a boolean vector that tests for NAs
are_na( x )## [1] FALSE TRUE FALSE TRUE TRUE FALSE
#return the number of NAs
n_miss( x )## [1] 3
#return the proportion of NAs
prop_miss( x )## [1] 0.5
Generally, operations with NA values returns an NA value:
heights <- data.frame( 'Sophie' = 165, 'Dan' = 177, 'Fred' = NA )
sum( heights )## [1] NA
Important Distinctions:
NaN== Not a Number. is evaluated the same asNANULL== empty. is Not the same asNAInf== Infinity. is Not the same asNA
r1 <- any_na( NaN )
r2 <- any_na( NULL )
r3 <- any_na( Inf )
r4 <- any_na( 0 )
res <- paste( 'When tested any_na()::\nNaN evaluates:', r1,
'\nNULL evaluates:', r2,
'\nInf evaluates:', r3,
'\n0 evaluates:', r4 )
cat( res, sep='\n' )## When tested any_na()::
## NaN evaluates: TRUE
## NULL evaluates: FALSE
## Inf evaluates: FALSE
## 0 evaluates: FALSE
Conditional Statement Behaviors to look out for:
r1 <- NA | TRUE
r2 <- NA | FALSE
r3 <- NA | NaN
r4 <- NaN | NA
res <- paste( 'Conditional Statement Behaviors to be aware of::\nNA | TRUE evaluates:', r1,
'\nNA | FALSE evaluates:', r2,
'\nNA | NaN evaluates:', r3,
'\nNaN | NA evaluates:', r4 )
cat( res, sep='\n' )## Conditional Statement Behaviors to be aware of::
## NA | TRUE evaluates: TRUE
## NA | FALSE evaluates: NA
## NA | NaN evaluates: NA
## NaN | NA evaluates: NA
# Create x, a vector, with values NA, NaN, Inf, ".", and "missing"
x <- c(NA, NaN, Inf, ".", "missing")
# Use any_na() and are_na() on to explore the missings
any_na(x)## [1] TRUE
are_na(x)## [1] TRUE FALSE FALSE FALSE FALSE
dat_hw_url <- 'https://raw.githubusercontent.com/SmilodonCub/ReadingLearningTinkering/master/DataCamp/Statistics_with_R/dat_hw.csv'
dat_hw <- read.csv( dat_hw_url ) %>%
select( -X )
head( dat_hw )## weight height
## 1 NA 2.3881462
## 2 91.20470 1.0014508
## 3 81.57915 NA
## 4 76.84886 NA
## 5 111.01731 -0.2412422
## 6 90.15135 2.5207375
# Use n_miss() to count the total number of missing values in dat_hw
n_miss(dat_hw)## [1] 30
# Use n_miss() on dat_hw$weight to count the total number of missing values
n_miss(dat_hw$weight)## [1] 15
# Use n_complete() on dat_hw to count the total number of complete values
n_complete(dat_hw)## [1] 170
# Use n_complete() on dat_hw$weight to count the total number of complete values
n_complete(dat_hw$weight)## [1] 85
# Use prop_miss() and prop_complete() on dat_hw to count the total number of missing values in each of the variables
prop_miss(dat_hw)## [1] 0.15
prop_complete(dat_hw)## [1] 0.85
Why care about missing values?
Introduction to missingness summaries
Basic summary missingness:
n_miss( x )## [1] 1
n_complete( x )## [1] 4
Dataframe summaries of missingness:
miss_var_summary(): summarize the number of missing in each variable/feature/column
miss_x_cols <- miss_var_summary( dat_hw )
glimpse( miss_x_cols )## Rows: 2
## Columns: 3
## $ variable <chr> "weight", "height"
## $ n_miss <int> 15, 15
## $ pct_miss <dbl> 15, 15
miss_x_cols <- miss_var_summary( airquality )
glimpse( miss_x_cols )## Rows: 6
## Columns: 3
## $ variable <chr> "Ozone", "Solar.R", "Wind", "Temp", "Month", "Day"
## $ n_miss <int> 37, 7, 0, 0, 0, 0
## $ pct_miss <dbl> 24.183007, 4.575163, 0.000000, 0.000000, 0.000000, 0.000000
miss_case_summary: each case is a row in the dataframe. info on missing values by row.
dim( miss_case_summary( dat_hw ) )## [1] 100 3
head( miss_case_summary( airquality ) )## # A tibble: 6 x 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 5 2 33.3
## 2 27 2 33.3
## 3 6 1 16.7
## 4 10 1 16.7
## 5 11 1 16.7
## 6 25 1 16.7
Missing Data Tabulations:
miss_var_table() returns a dataframe with info on the variables missing data as well as the percentage of variables affected by missing data
miss_var_table( dat_hw )## # A tibble: 1 x 3
## n_miss_in_var n_vars pct_vars
## * <int> <int> <dbl>
## 1 15 2 100
can be interpretted as: 2 variables are missing 15 observations each. 100% of the variables in the dataframe are affected this way
miss_var_table( airquality )## # A tibble: 3 x 3
## n_miss_in_var n_vars pct_vars
## * <int> <int> <dbl>
## 1 0 4 66.7
## 2 7 1 16.7
## 3 37 1 16.7
can be interpretted as: 66.6% of the features in this dataframe (total of 4 features) are missing 0 observations. One variables (16.6% of features) is missing 7 observations while another variable (16.6% of features) is missing 37 observations.
miss_case_table(): returns the same information but by cases (rows)
miss_case_table( dat_hw )## # A tibble: 2 x 3
## n_miss_in_case n_cases pct_cases
## * <int> <int> <dbl>
## 1 0 70 70
## 2 1 30 30
can be interpretted as: 70% of rows (70 rows) are missing 0 observations. 30% of rows (30 rows) are missing 1 observation.
miss_case_table( airquality )## # A tibble: 3 x 3
## n_miss_in_case n_cases pct_cases
## * <int> <int> <dbl>
## 1 0 111 72.5
## 2 1 40 26.1
## 3 2 2 1.31
can be interpretted as: 72.5% of rows (111 rows) are missing 0 observations. 26.1% of rows (40 rows) are missing 1 observation. 1.3% or rows (2 rows) are missing 2 observations.
Other useful functions:
miss_var_span()summarizes missing data by span of data (good for time series analysis e.g. weekly spans of 7)miss_var_run()summarizes runs of missing data. good for finding unusual patterns of missing data. returns runs of complete and missing data. great for sinding systemic sampling error.
Using summaries with group_by():
airquality %>%
group_by( Month ) %>%
miss_var_summary()## # A tibble: 25 x 4
## # Groups: Month [5]
## Month variable n_miss pct_miss
## <int> <chr> <int> <dbl>
## 1 5 Ozone 5 16.1
## 2 5 Solar.R 4 12.9
## 3 5 Wind 0 0
## 4 5 Temp 0 0
## 5 5 Day 0 0
## 6 6 Ozone 21 70
## 7 6 Solar.R 0 0
## 8 6 Wind 0 0
## 9 6 Temp 0 0
## 10 6 Day 0 0
## # … with 15 more rows
glimpse( pedestrian )## Rows: 37,700
## Columns: 9
## $ hourly_counts <int> 883, 597, 294, 183, 118, 68, 47, 52, 120, 333, 761, 1352…
## $ date_time <dttm> 2016-01-01 00:00:00, 2016-01-01 01:00:00, 2016-01-01 02…
## $ year <int> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ month <ord> January, January, January, January, January, January, Ja…
## $ month_day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ week_day <ord> Friday, Friday, Friday, Friday, Friday, Friday, Friday, …
## $ hour <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ sensor_id <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ sensor_name <chr> "Bourke Street Mall (South)", "Bourke Street Mall (South…
miss_var_table( pedestrian )## # A tibble: 2 x 3
## n_miss_in_var n_vars pct_vars
## * <int> <int> <dbl>
## 1 0 8 88.9
## 2 2548 1 11.1
# Calculate the summaries for each run of missingness for the variable, hourly_counts
miss_var_run(pedestrian, var = hourly_counts)## # A tibble: 35 x 2
## run_length is_na
## <int> <chr>
## 1 6628 complete
## 2 1 missing
## 3 5250 complete
## 4 624 missing
## 5 3652 complete
## 6 1 missing
## 7 1290 complete
## 8 744 missing
## 9 7420 complete
## 10 1 missing
## # … with 25 more rows
# Calculate the summaries for each span of missingness,
# for a span of 4000, for the variable hourly_counts
miss_var_span(pedestrian, var = hourly_counts, span_every = 4000)## # A tibble: 10 x 5
## span_counter n_miss n_complete prop_miss prop_complete
## * <int> <int> <dbl> <dbl> <dbl>
## 1 1 0 4000 0 1
## 2 2 1 3999 0.00025 1.00
## 3 3 121 3879 0.0302 0.970
## 4 4 503 3497 0.126 0.874
## 5 5 745 3255 0.186 0.814
## 6 6 0 4000 0 1
## 7 7 1 3999 0.00025 1.00
## 8 8 0 4000 0 1
## 9 9 745 3255 0.186 0.814
## 10 10 432 3568 0.108 0.892
# For each `month` variable, calculate the run of missingness for hourly_counts
pedestrian %>% group_by(month) %>% miss_var_run(hourly_counts)## # A tibble: 51 x 3
## # Groups: month [12]
## month run_length is_na
## <ord> <int> <chr>
## 1 January 2976 complete
## 2 February 2784 complete
## 3 March 2976 complete
## 4 April 888 complete
## 5 April 552 missing
## 6 April 1440 complete
## 7 May 744 complete
## 8 May 72 missing
## 9 May 2160 complete
## 10 June 2880 complete
## # … with 41 more rows
# For each `month` variable, calculate the span of missingness
# of a span of 2000, for the variable hourly_counts
pedestrian %>% group_by(month) %>% miss_var_span(var = hourly_counts, span_every = 2000)## # A tibble: 25 x 6
## # Groups: month [12]
## month span_counter n_miss n_complete prop_miss prop_complete
## <ord> <int> <int> <dbl> <dbl> <dbl>
## 1 January 1 0 2000 0 1
## 2 January 2 0 2000 0 1
## 3 February 1 0 2000 0 1
## 4 February 2 0 2000 0 1
## 5 March 1 0 2000 0 1
## 6 March 2 0 2000 0 1
## 7 April 1 552 1448 0.276 0.724
## 8 April 2 0 2000 0 1
## 9 May 1 72 1928 0.036 0.964
## 10 May 2 0 2000 0 1
## # … with 15 more rows
How do we visual missing values?
naniar missing data visualization methods.
Overview of missingness: a type of heatmap for missing data. black == missing. also provides basic stats of proportions of missingness.
vis_miss( airquality )vis_miss( dat_hw )vis_miss( airquality )vis_miss( airquality, cluster = TRUE)vis_miss( dat_hw, cluster = TRUE)Looking at missing observations in both variables and cases
varp <- gg_miss_var( airquality )
casep <- gg_miss_case( airquality )
grid.arrange( varp, casep, ncol = 2 )varp <- gg_miss_var( dat_hw )
casep <- gg_miss_case( dat_hw )
grid.arrange( varp, casep, ncol = 2 )faceting a gg_miss_var() plot acts like visualizing a group_by
gg_miss_var( airquality, facet = Month )Visualizing missingness patterns
gg_miss_upset() shows co-occuring missing observations across variables
gg_miss_upset( airquality )
gg_miss_fct(): visualizing factors for missingness. again, kinda like visualizing missing group_by result. Gives a heat map view for each feature (y-axis) and each montt (x-axis) where color intensity is the number of missing observations.
gg_miss_fct( x = airquality, fct = Month )
gg_miss_span() visualizes the number of missing observations for a given span.
gg_miss_span( pedestrian, hourly_counts, span_every = 3000)# Visualize all of the missingness in the `riskfactors` dataset
vm <- vis_miss(riskfactors) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# Visualize and cluster all of the missingness in the `riskfactors` dataset
vmc <-vis_miss(riskfactors, cluster = TRUE) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# visualize and sort the columns by missingness in the `riskfactors` dataset
vms <- vis_miss(riskfactors, sort_miss = TRUE) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# visualize cluster and sort the columns by missingness in the `riskfactors` dataset
vmcs <- vis_miss(riskfactors, sort_miss = TRUE, cluster = TRUE ) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
grid.arrange( vm, vmc, vms, vmcs, ncol = 2 )# Visualize the number of missings in cases using `gg_miss_case()`
caserf <- gg_miss_case(riskfactors)
# Explore the number of missings in cases using `gg_miss_case()`
# and facet by the variable `education`
fct_caserf <- gg_miss_case(riskfactors, facet = education)
grid.arrange( caserf, fct_caserf, ncol = 2 )# Visualize the number of missings in variables using `gg_miss_var()`
varrf <- gg_miss_var(riskfactors)
# Explore the number of missings in variables using `gg_miss_var()`
# and facet by the variable `education`
fct_varrf <- gg_miss_var(riskfactors, facet = education)
grid.arrange( varrf, fct_varrf, ncol = 2 )# With the riskfactors dataset, explore how the missingness changes across the marital variable using gg_miss_fct()
gg_miss_fct(x = riskfactors, fct = marital)# Using the pedestrian dataset, explore how the missingness of hourly_counts changes over a span of 3000
gg_miss_span(pedestrian, var = hourly_counts, span_every = 3000)# Using the pedestrian dataset, explore the impact of month by faceting by month
# and explore how missingness changes for a span of 1000
gg_miss_span(pedestrian, var = hourly_counts , span_every = 1000, facet = month)Wrangling and tidying up missing values.
Searching for an replacing missing values
Assumptions with missing data: finding missing values and labelling with NA
In a perfect word, missing data is labelled NA. However, it may be the case that it is coded as ‘missing’, ‘Not Available’, ‘N/A’ or some other permutation.
miss_search_count(): Searching for missing values
employees_url <- 'https://raw.githubusercontent.com/ChaitanyaBaweja/Programming-Tutorials/master/Missing-Data-Pandas/employees.csv'
employees <- read.csv( employees_url )
glimpse( employees )## Rows: 1,000
## Columns: 6
## $ First.Name <chr> "Douglas", "Thomas", "Maria", "Jerry", "Larry", "Den…
## $ Gender <chr> "Male", "Male", "Female", "Male", "Male", "n.a.", "F…
## $ Salary <chr> "97308", "61933", "130590", NA, "101004", "115163", …
## $ Bonus.. <chr> "6.945", "NaN", "11.858", "9.34", "1.389", "10.125",…
## $ Senior.Management <chr> "TRUE", "TRUE", "FALSE", "TRUE", "TRUE", "FALSE", "T…
## $ Team <chr> "Marketing", "", "Finance", "Finance", "Client Servi…
#explicitly search for strange NA mislabellings
employees %>%
miss_scan_count( search = list('n.a', 'na') )## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 58
## 2 Gender 1
## 3 Salary 1
## 4 Bonus.. 0
## 5 Senior.Management 2
## 6 Team 106
employees %>%
miss_scan_count( search = common_na_strings )## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 998
## 2 Gender 999
## 3 Salary 999
## 4 Bonus.. 1000
## 5 Senior.Management 1000
## 6 Team 1000
print( common_na_strings )## [1] "NA" "N A" "N/A" "NA " " NA" "N /A" "N / A" " N / A"
## [9] "N / A " "na" "n a" "n/a" "na " " na" "n /a" "n / a"
## [17] " a / a" "n / a " "NULL" "null" "" "\\?" "\\*" "\\."
replace_with_na(): replace specified values with NA
replace_with_na_all(): all variablesreplace_with_na_at(): a subset of selected variablesreplace_with_na_if(): a subset of variables that fulfill some condition
#replace all instances of 'N/A' and 'N/a' from the feature `grade`
employees %>%
replace_with_na( replace = list( Team = c('n.a', 'na') ) ) %>%
miss_scan_count( search = common_na_strings )## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 998
## 2 Gender 999
## 3 Salary 999
## 4 Bonus.. 1000
## 5 Senior.Management 1000
## 6 Team 997
#replace any instance of -99 with `NA`
employees %>%
replace_with_na_all( condition = ~.x == "" ) %>%
miss_scan_count( search = common_na_strings )## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 931
## 2 Gender 852
## 3 Salary 998
## 4 Bonus.. 1000
## 5 Senior.Management 933
## 6 Team 957
#replace multiple with `NA`
employees %>%
replace_with_na_all( condition = ~.x %in% common_na_strings ) %>%
miss_scan_count( search = common_na_strings )## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 931
## 2 Gender 852
## 3 Salary 998
## 4 Bonus.. 1000
## 5 Senior.Management 933
## 6 Team 954
# Explore the strange missing values "N/A"
miss_scan_count(data = pedestrian, search = list("N/A") )## # A tibble: 9 x 2
## Variable n
## <chr> <int>
## 1 hourly_counts 0
## 2 date_time 0
## 3 year 0
## 4 month 0
## 5 month_day 0
## 6 week_day 0
## 7 hour 0
## 8 sensor_id 0
## 9 sensor_name 0
# Explore the strange missing values "missing"
miss_scan_count(data = pedestrian, search = list("missing") )## # A tibble: 9 x 2
## Variable n
## <chr> <int>
## 1 hourly_counts 0
## 2 date_time 0
## 3 year 0
## 4 month 0
## 5 month_day 0
## 6 week_day 0
## 7 hour 0
## 8 sensor_id 0
## 9 sensor_name 0
# Explore the strange missing values "na"
miss_scan_count(data = pedestrian, search = list('na' ) )## # A tibble: 9 x 2
## Variable n
## <chr> <int>
## 1 hourly_counts 0
## 2 date_time 0
## 3 year 0
## 4 month 0
## 5 month_day 0
## 6 week_day 0
## 7 hour 0
## 8 sensor_id 0
## 9 sensor_name 0
# Explore the strange missing values " " (a single space)
miss_scan_count(data = pedestrian, search = list(' ') )## # A tibble: 9 x 2
## Variable n
## <chr> <int>
## 1 hourly_counts 0
## 2 date_time 37700
## 3 year 0
## 4 month 0
## 5 month_day 0
## 6 week_day 0
## 7 hour 0
## 8 sensor_id 0
## 9 sensor_name 37700
# Explore all of the strange missing values, "N/A", "missing", "na", " "
miss_scan_count(data = pedestrian, search = list("N/A", "missing", "na", " "))## # A tibble: 9 x 2
## Variable n
## <chr> <int>
## 1 hourly_counts 0
## 2 date_time 37700
## 3 year 0
## 4 month 0
## 5 month_day 0
## 6 week_day 0
## 7 hour 0
## 8 sensor_id 0
## 9 sensor_name 37700
# Print the top of the pacman data using `head()`
glimpse(employees)## Rows: 1,000
## Columns: 6
## $ First.Name <chr> "Douglas", "Thomas", "Maria", "Jerry", "Larry", "Den…
## $ Gender <chr> "Male", "Male", "Female", "Male", "Male", "n.a.", "F…
## $ Salary <chr> "97308", "61933", "130590", NA, "101004", "115163", …
## $ Bonus.. <chr> "6.945", "NaN", "11.858", "9.34", "1.389", "10.125",…
## $ Senior.Management <chr> "TRUE", "TRUE", "FALSE", "TRUE", "TRUE", "FALSE", "T…
## $ Team <chr> "Marketing", "", "Finance", "Finance", "Client Servi…
# Replace the strange missing values "N/A", "na", and
# "missing" with `NA` for the variables, year, and score
emp_clean <- replace_with_na(data = employees, replace = list(Team = c("", "na", "n.a","NaN"),
Bonus.. = c("", "na", "n.a","NaN")))
# Test if `pacman_clean` still has these values in it?
miss_scan_count(emp_clean, search = list("", "na", "n.a", "NaN"))## # A tibble: 6 x 2
## Variable n
## <chr> <int>
## 1 First.Name 998
## 2 Gender 999
## 3 Salary 999
## 4 Bonus.. 997
## 5 Senior.Management 1000
## 6 Team 954
# Use `replace_with_na_at()` to replace with NA
rp1 <- replace_with_na_at(employees,
.vars = c('First.Name', 'Gender', 'Team'),
~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp1 )## First.Name Gender Salary Bonus.. Senior.Management Team
## 1 Douglas Male 97308 6.945 TRUE Marketing
## 2 Thomas Male 61933 NaN TRUE <NA>
## 3 Maria Female 130590 11.858 FALSE Finance
## 4 Jerry Male <NA> 9.34 TRUE Finance
## 5 Larry Male 101004 1.389 TRUE Client Services
## 6 Dennis n.a. 115163 10.125 FALSE Legal
# Use `replace_with_na_if()` to replace with NA the character values using `is.character`
rp2 <- replace_with_na_if(employees,
.predicate = is.character,
~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp2 )## First.Name Gender Salary Bonus.. Senior.Management Team
## 1 Douglas Male 97308 6.945 TRUE Marketing
## 2 Thomas Male 61933 <NA> TRUE <NA>
## 3 Maria Female 130590 11.858 FALSE Finance
## 4 Jerry Male <NA> 9.34 TRUE Finance
## 5 Larry Male 101004 1.389 TRUE Client Services
## 6 Dennis n.a. 115163 10.125 FALSE Legal
# Use `replace_with_na_all()` to replace with NA
rp3 <- replace_with_na_all(employees, condition = ~.x %in% c("", " ", "na", 'NaN', '?'))
head( rp3 )## # A tibble: 6 x 6
## First.Name Gender Salary Bonus.. Senior.Management Team
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Douglas Male 97308 6.945 TRUE Marketing
## 2 Thomas Male 61933 <NA> TRUE <NA>
## 3 Maria Female 130590 11.858 FALSE Finance
## 4 Jerry Male <NA> 9.34 TRUE Finance
## 5 Larry Male 101004 1.389 TRUE Client Services
## 6 Dennis n.a. 115163 10.125 FALSE Legal
Filling down missing values
Explicitly Missing: they are missing in the data and indicated with NA or something else.
Implicitly Missing: Not shown in the data, but implied (e.g. a missing level)
name <- c( 'jesse', 'jesse', 'jesse', 'jesse', 'andy', 'andy', 'andy', 'nic', 'nic',
'dan', 'dan', 'alex', 'alex', 'alex', 'alex')
time <- c( 'morning', 'afternoon', 'evening', 'late_night', 'morning', 'afternoon',
'late_night', 'afternoon', 'late_night', 'morning', 'evening', 'morning',
'afternoon', 'evening', 'late_night' )
value <- c(6678, 800060, 475528, 143533, 425115, 587468, 111000, 588532, 915533, 388148,
180912, 552670, 98355, 266055, 121056)
frogger <- data.frame( 'name' = name, 'time' = time, 'value' = value )glimpse( frogger )## Rows: 15
## Columns: 3
## $ name <chr> "jesse", "jesse", "jesse", "jesse", "andy", "andy", "andy", "nic…
## $ time <chr> "morning", "afternoon", "evening", "late_night", "morning", "aft…
## $ value <dbl> 6678, 800060, 475528, 143533, 425115, 587468, 111000, 588532, 91…
# Use `complete()` on the `time` and `name` variables to
# make implicit missing values explicit
frogger_tidy <- frogger %>% complete(time, name)
frogger_tidy## # A tibble: 20 x 3
## time name value
## <chr> <chr> <dbl>
## 1 afternoon alex 98355
## 2 afternoon andy 587468
## 3 afternoon dan NA
## 4 afternoon jesse 800060
## 5 afternoon nic 588532
## 6 evening alex 266055
## 7 evening andy NA
## 8 evening dan 180912
## 9 evening jesse 475528
## 10 evening nic NA
## 11 late_night alex 121056
## 12 late_night andy 111000
## 13 late_night dan NA
## 14 late_night jesse 143533
## 15 late_night nic 915533
## 16 morning alex 552670
## 17 morning andy 425115
## 18 morning dan 388148
## 19 morning jesse 6678
## 20 morning nic NA
# Use `fill()` to fill down the name variable in the frogger dataset
frogger %>% tidyr::fill(name)## name time value
## 1 jesse morning 6678
## 2 jesse afternoon 800060
## 3 jesse evening 475528
## 4 jesse late_night 143533
## 5 andy morning 425115
## 6 andy afternoon 587468
## 7 andy late_night 111000
## 8 nic afternoon 588532
## 9 nic late_night 915533
## 10 dan morning 388148
## 11 dan evening 180912
## 12 alex morning 552670
## 13 alex afternoon 98355
## 14 alex evening 266055
## 15 alex late_night 121056
frogger %>%
fill(name) %>%
complete(name,time)## # A tibble: 20 x 3
## name time value
## <chr> <chr> <dbl>
## 1 alex afternoon 98355
## 2 alex evening 266055
## 3 alex late_night 121056
## 4 alex morning 552670
## 5 andy afternoon 587468
## 6 andy evening NA
## 7 andy late_night 111000
## 8 andy morning 425115
## 9 dan afternoon NA
## 10 dan evening 180912
## 11 dan late_night NA
## 12 dan morning 388148
## 13 jesse afternoon 800060
## 14 jesse evening 475528
## 15 jesse late_night 143533
## 16 jesse morning 6678
## 17 nic afternoon 588532
## 18 nic evening NA
## 19 nic late_night 915533
## 20 nic morning NA
Missing data dependence
- MCAR: Missing Completely at Random.
- missingness has no association with any data you have observed or not observed
- Imputation is advisable
- deleting observations may reduce the sample size, limiting inference, but will not bias.
- MAR: Missing at Random
- missingness depends on data observed, but not data unobserved
- Should be imputing data
- deletion is not advisable and may lead to bias
- MNAR: Missing Not at Random
- missingness of the response is related to an unobserved value relevant to the assessment of interest.
- data will be biased by deletion and imputation
- inference can be limited, proceed with caution
glimpse( oceanbuoys )## Rows: 736
## Columns: 8
## $ year <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997,…
## $ latitude <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -110,…
## $ sea_temp_c <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, 28.…
## $ air_temp_c <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, 27.…
## $ humidity <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76.9,…
## $ wind_ew <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3.6,…
## $ wind_ns <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1.8,…
# Arrange by year
oceanbuoys %>% arrange(year) %>% vis_miss()# Arrange by latitude
oceanbuoys %>% arrange(latitude) %>% vis_miss()# Arrange by wind_ew (wind east west)
oceanbuoys %>% arrange(wind_ew) %>% vis_miss()gg_miss_var( oceanbuoys, facet = year)Testing missing relationships
Tools to explore missing data dependence
as_shadow() to explore missingness
- Coordinated names: shadow matrix inherits feature labels the ’_NA’
- Clear Values: binary missing or !missing
bind_shadow() or nabular() to bind the shadow mat with the data == nabular data (a mix of NA and tabular data). This format is useful to do things like calculate summary statistics based on the missingness of a feature
airquality %>%
bind_shadow() %>%
group_by( Ozone_NA ) %>%
summarise( mean = mean( Wind ) )## # A tibble: 2 x 2
## Ozone_NA mean
## * <fct> <dbl>
## 1 !NA 9.86
## 2 NA 10.3
Create Nabular Data
# Create shadow matrix data with `as_shadow()`
obs <- as_shadow( oceanbuoys )
head( obs )## # A tibble: 6 x 8
## year_NA latitude_NA longitude_NA sea_temp_c_NA air_temp_c_NA humidity_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA
## # … with 2 more variables: wind_ew_NA <fct>, wind_ns_NA <fct>
# Create nabular data by binding the shadow to the data with `bind_shadow()`
bob <- bind_shadow( oceanbuoys )
dim( bob )## [1] 736 16
# Bind only the variables with missing values by using bind_shadow(only_miss = TRUE)
bob_om <- bind_shadow( oceanbuoys, only_miss = TRUE)
dim( bob_om )## [1] 736 11
Use nabular data to calculate some summary statistics about other features:
# `bind_shadow()` and `group_by()` humidity missingness (`humidity_NA`)
oceanbuoys %>%
bind_shadow() %>%
group_by( humidity_NA ) %>%
summarize(wind_ew_mean = mean(wind_ew), # calculate mean of wind_ew
wind_ew_sd = sd(wind_ew)) # calculate standard deviation of wind_ew## # A tibble: 2 x 3
## humidity_NA wind_ew_mean wind_ew_sd
## * <fct> <dbl> <dbl>
## 1 !NA -3.78 1.90
## 2 NA -3.30 2.31
# Repeat this, but calculating summaries for wind north south (`wind_ns`).
oceanbuoys %>%
bind_shadow() %>%
group_by(humidity_NA) %>%
summarize(wind_ns_mean = mean(wind_ns),
wind_ns_sd = sd(wind_ns))## # A tibble: 2 x 3
## humidity_NA wind_ns_mean wind_ns_sd
## * <fct> <dbl> <dbl>
## 1 !NA 2.78 2.06
## 2 NA 1.66 2.23
Add information about to summarize the missingness of a dataset
# How many NAs are in a feature?
# Summarize wind_ew by the missingness of `air_temp_c_NA`
oceanbuoys %>%
bind_shadow() %>%
group_by(air_temp_c_NA) %>%
summarize(wind_ew_mean = mean(wind_ew),
wind_ew_sd = sd(wind_ew),
n_obs = n())## # A tibble: 2 x 4
## air_temp_c_NA wind_ew_mean wind_ew_sd n_obs
## * <fct> <dbl> <dbl> <int>
## 1 !NA -3.91 1.85 655
## 2 NA -2.17 2.14 81
# Summarize wind_ew by missingness of `air_temp_c_NA` and `humidity_NA`
oceanbuoys %>%
bind_shadow() %>%
group_by(air_temp_c_NA, humidity_NA) %>%
summarize(wind_ew_mean = mean(wind_ew),
wind_ew_sd = sd(wind_ew),
n_obs = n())## `summarise()` has grouped output by 'air_temp_c_NA'. You can override using the `.groups` argument.
## # A tibble: 4 x 5
## # Groups: air_temp_c_NA [2]
## air_temp_c_NA humidity_NA wind_ew_mean wind_ew_sd n_obs
## <fct> <fct> <dbl> <dbl> <int>
## 1 !NA !NA -4.01 1.74 565
## 2 !NA NA -3.24 2.31 90
## 3 NA !NA -2.06 2.08 78
## 4 NA NA -4.97 1.74 3
Visualizing missingness across one variable
Exploring conditional missings w/ggplot
- How to use nabular data to explore how values change according to other values going missing
ggplot2visualizations:- density plots
- box plots
- etc.
ggplot( airquality,
aes( x = Temp ) ) +
geom_density()Create nabular data:
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Temp,
color = Ozone_NA ) ) +
geom_density() The values of Temperature do not change much when data for Ozone are present or
NA
Here is a feceted versions:
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Temp ) ) +
geom_density() +
facet_wrap( ~Ozone_NA )Another look with facetted scatter plots. This gives an idea of how sparce NA data is compared to when the feature is present.
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Temp,
y = Wind ) ) +
geom_point() +
facet_wrap( ~Ozone_NA )Can make the same point, perhaps more obvious, with a box plot.
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Ozone_NA,
y = Temp ) ) +
geom_boxplot() This shows how close the medians of the two distributions are.
Visualizing missingness with color:
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Temp,
y = Wind,
color = Ozone_NA ) ) +
geom_point()Visualize the missingness of two features
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Temp,
color = Ozone_NA ) ) +
geom_density() +
facet_wrap( ~ Solar.R_NA )There doesn’t appear to much much difference in the distributions of Temperature when Solar.R info is in a given record. However, when Solar.R is missing, the temperatures are low.
Now to take a look at oceanbuoys
# First explore the missingness structure of `oceanbuoys` using `vis_miss()`
vmob <- vis_miss(oceanbuoys) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# Explore the distribution of `wind_ew` for the missingness
# of `air_temp_c_NA` using `geom_density()`
bsob <- bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew,
color = air_temp_c_NA)) +
geom_density()
# Explore the distribution of sea temperature for the
# missingness of humidity (humidity_NA) using `geom_density()`
bsob2 <- bind_shadow(oceanbuoys) %>%
ggplot(aes(x = sea_temp_c,
color = humidity_NA)) +
geom_density()
grid.arrange( vmob, bsob, bsob2, ncol = 1 )## Warning: Removed 3 rows containing non-finite values (stat_density).
# Explore the distribution of wind east west (wind_ew) for the missingness of air temperature
# using geom_density() and faceting by the missingness of air temperature (air_temp_c_NA).
ob1 <- oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = wind_ew)) +
geom_density() +
facet_wrap(~air_temp_c_NA)
# Build upon this visualization by coloring by the missingness of humidity (humidity_NA).
ob2 <- oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = wind_ew,
color = humidity_NA)) +
geom_density() +
facet_wrap(~air_temp_c_NA)
grid.arrange( ob1, ob2, ncol = 1 )# Explore the distribution of wind east west (`wind_ew`) for
# the missingness of air temperature using `geom_boxplot()`
ob1 <- oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = air_temp_c_NA,
y = wind_ew)) +
geom_boxplot()
# Build upon this visualization by faceting by the missingness of humidity (`humidity_NA`).
ob2 <- oceanbuoys %>%
bind_shadow() %>%
ggplot(aes(x = air_temp_c_NA,
y = wind_ew)) +
geom_boxplot() +
facet_wrap(~humidity_NA)
grid.arrange( ob1, ob2, ncol = 1 )Visualizing missingness across two variables
The problem with visualizing missing data in 2D (e.g. scatterplot) is that rows with missing values are removed. ggplot2 is very kind to give a warning when rows are dropped:
ggplot( airquality,
aes( x = Ozone,
y = Solar.R ) ) +
geom_point()## Warning: Removed 42 rows containing missing values (geom_point).
geom_miss_point() visualizes missing data by placing them in the margins of a figure
ggplot( airquality,
aes( x = Ozone,
y = Solar.R ) ) +
geom_miss_point()ggplot( airquality,
aes( x = Wind,
y = Ozone ) ) +
geom_miss_point() +
facet_wrap( ~ Month )Visualize missingness with another missing variable
airquality %>%
bind_shadow() %>%
ggplot( aes( x = Wind,
y = Ozone ) ) +
geom_miss_point() +
facet_wrap( ~ Solar.R_NA )# Explore the missingness in wind and air temperature, and
# display the missingness using `geom_miss_point()`
ob1 <- ggplot(oceanbuoys,
aes(x = wind_ew,
y = air_temp_c)) +
geom_miss_point()
# Explore the missingness in humidity and air temperature,
# and display the missingness using `geom_miss_point()`
ob2 <- ggplot(oceanbuoys,
aes(x = humidity,
y = air_temp_c)) +
geom_miss_point()
grid.arrange( ob1, ob2, ncol = 2 )# Explore the missingness in wind and air temperature, and display the
# missingness using `geom_miss_point()`. Facet by year to explore this further.
ob1 <- ggplot(oceanbuoys,
aes(x = wind_ew,
y = air_temp_c)) +
geom_miss_point() +
facet_wrap(~year)
# Explore the missingness in humidity and air temperature, and display the
# missingness using `geom_miss_point()` Facet by year to explore this further.
ob2 <- ggplot(oceanbuoys,
aes(x = humidity,
y = air_temp_c)) +
geom_miss_point() +
facet_wrap(~year)
grid.arrange( ob1, ob2, ncol = 1 )# Use geom_miss_point() and facet_wrap to explore how the missingness
# in wind_ew and air_temp_c is different for missingness of humidity
bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew,
y = air_temp_c)) +
geom_miss_point() +
facet_wrap(~humidity_NA)# Use geom_miss_point() and facet_grid to explore how the missingness in wind_ew and air_temp_c
# is different for missingness of humidity AND by year - by using `facet_grid(humidity_NA ~ year)`
bind_shadow(oceanbuoys) %>%
ggplot(aes(x = wind_ew,
y = air_temp_c)) +
geom_miss_point() +
facet_grid(humidity_NA~year)Connecting the dots (Imputation)
Filling in the blanks
Performing and tracking imputation
Using imputations to understand data structure. Visualizing and exploring imputed values
- Imputing data to explore missingness
- tracking missing values
- visualize imputed values against data
impute_below() imputes below to minimum vaue in the variable
impute_below( c( 5,6,7,NA,9,10 ) )## [1] 5.00000 6.00000 7.00000 4.40271 9.00000 10.00000
impute below to satisfy a conditional
summary( pedestrian )## hourly_counts date_time year month
## Min. : 0.0 Min. :2016-01-01 00:00:00 Min. :2016 October: 5540
## 1st Qu.: 72.0 1st Qu.:2016-04-08 04:00:00 1st Qu.:2016 January: 2976
## Median : 277.0 Median :2016-07-15 08:00:00 Median :2016 March : 2976
## Mean : 701.8 Mean :2016-07-09 04:46:33 Mean :2016 May : 2976
## 3rd Qu.: 878.0 3rd Qu.:2016-10-11 21:00:00 3rd Qu.:2016 July : 2976
## Max. :11273.0 Max. :2016-12-31 23:00:00 Max. :2016 August : 2976
## NA's :2548 (Other):17280
## month_day week_day hour sensor_id
## Min. : 1.00 Sunday :5396 Min. : 0.0 Min. : 2.00
## 1st Qu.: 8.00 Monday :5376 1st Qu.: 6.0 1st Qu.: 2.00
## Median :16.00 Tuesday :5328 Median :12.0 Median : 7.00
## Mean :15.75 Wednesday:5328 Mean :11.5 Mean :11.15
## 3rd Qu.:23.00 Thursday :5352 3rd Qu.:18.0 3rd Qu.:13.00
## Max. :31.00 Friday :5424 Max. :23.0 Max. :23.00
## Saturday :5496
## sensor_name
## Length:37700
## Class :character
## Mode :character
##
##
##
##
pedestrian_imp <- impute_below_if( pedestrian, is.numeric )
summary( pedestrian_imp )## hourly_counts date_time year month
## Min. :-1409.0 Min. :2016-01-01 00:00:00 Min. :2016 October: 5540
## 1st Qu.: 44.0 1st Qu.:2016-04-08 04:00:00 1st Qu.:2016 January: 2976
## Median : 243.0 Median :2016-07-15 08:00:00 Median :2016 March : 2976
## Mean : 578.1 Mean :2016-07-09 04:46:33 Mean :2016 May : 2976
## 3rd Qu.: 804.0 3rd Qu.:2016-10-11 21:00:00 3rd Qu.:2016 July : 2976
## Max. :11273.0 Max. :2016-12-31 23:00:00 Max. :2016 August : 2976
## (Other):17280
## month_day week_day hour sensor_id
## Min. : 1.00 Sunday :5396 Min. : 0.0 Min. : 2.00
## 1st Qu.: 8.00 Monday :5376 1st Qu.: 6.0 1st Qu.: 2.00
## Median :16.00 Tuesday :5328 Median :12.0 Median : 7.00
## Mean :15.75 Wednesday:5328 Mean :11.5 Mean :11.15
## 3rd Qu.:23.00 Thursday :5352 3rd Qu.:18.0 3rd Qu.:13.00
## Max. :31.00 Friday :5424 Max. :23.0 Max. :23.00
## Saturday :5496
## sensor_name
## Length:37700
## Class :character
## Mode :character
##
##
##
##
Specify features to be imputed
glimpse( oceanbuoys )## Rows: 736
## Columns: 8
## $ year <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997,…
## $ latitude <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -110,…
## $ sea_temp_c <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, 28.…
## $ air_temp_c <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, 27.…
## $ humidity <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76.9,…
## $ wind_ew <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3.6,…
## $ wind_ns <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1.8,…
miss_var_summary( oceanbuoys )## # A tibble: 8 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 humidity 93 12.6
## 2 air_temp_c 81 11.0
## 3 sea_temp_c 3 0.408
## 4 year 0 0
## 5 latitude 0 0
## 6 longitude 0 0
## 7 wind_ew 0 0
## 8 wind_ns 0 0
oceanbuoys_imp <- impute_below_at( oceanbuoys, vars( humidity, air_temp_c, sea_temp_c ) )
miss_var_summary( oceanbuoys_imp )## # A tibble: 8 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 year 0 0
## 2 latitude 0 0
## 3 longitude 0 0
## 4 sea_temp_c 0 0
## 5 air_temp_c 0 0
## 6 humidity 0 0
## 7 wind_ew 0 0
## 8 wind_ns 0 0
glimpse( dat_hw )## Rows: 100
## Columns: 2
## $ weight <dbl> NA, 91.20470, 81.57915, 76.84886, 111.01731, 90.15135, 63.14240…
## $ height <dbl> 2.3881462, 1.0014508, NA, NA, -0.2412422, 2.5207375, 1.4016896,…
dat_hw_imp <- impute_below_all( dat_hw )
ob1 <- ggplot(dat_hw_imp,
aes(x = weight,
y = height)) +
geom_point()
ob1Tracking missing values can be handles by using bind_shadows():
dat_hw_imp <- bind_shadow( dat_hw ) %>% impute_below_all()
head( dat_hw_imp )## # A tibble: 6 x 4
## weight height weight_NA height_NA
## <dbl> <dbl> <fct> <fct>
## 1 40.3 2.39 NA !NA
## 2 91.2 1.00 !NA !NA
## 3 81.6 -1.51 !NA NA
## 4 76.8 -1.65 !NA NA
## 5 111. -0.241 !NA !NA
## 6 90.2 2.52 !NA !NA
aq_imp <- airquality %>%
bind_shadow() %>%
impute_below_all() %>%
ggplot( aes( x = Ozone,
fill = Ozone_NA ) ) +
geom_histogram()
aq_imp## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Split the plot by missingness:
aq_imp <- airquality %>%
bind_shadow() %>%
impute_below_all() %>%
ggplot( aes( x = Ozone,
fill = Ozone_NA ) ) +
geom_histogram() +
facet_wrap( ~ Solar.R_NA )
aq_imp## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Visualize imputed value against data values using scatter plots:
aq_imp <- airquality %>%
bind_shadow() %>%
add_label_shadow() %>%
impute_below_all() %>%
ggplot( aes( x = Ozone,
y = Solar.R,
color = any_missing ) ) +
geom_point()
aq_imp This successfully recreates the figure rendered by a call to
geom_miss_point()
# Impute the oceanbuoys data below the range using `impute_below`.
ocean_imp <- impute_below_all(oceanbuoys)
# Visualize the new missing values
ggplot(ocean_imp,
aes(x = wind_ew, y = air_temp_c)) +
geom_point()# Impute and track data with `bind_shadow`, `impute_below_all`, and `add_label_shadow`
ocean_imp_track <- bind_shadow(oceanbuoys) %>% impute_below_all() %>% add_label_shadow()
# Look at the imputed values
ggplot(ocean_imp_track, aes(x = wind_ew, y = air_temp_c, color = any_missing ) ) +
geom_point()ocean_imp_track## # A tibble: 736 x 17
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## 7 1997 0 -110 28.0 27.0 76.5 -2 3.5
## 8 1997 0 -110 28.0 27.1 78.3 -3.70 4.5
## 9 1997 0 -110 28.0 27.2 78.6 -4.20 5
## 10 1997 0 -110 28.0 27.2 76.9 -3.60 3.5
## # … with 726 more rows, and 9 more variables: year_NA <fct>, latitude_NA <fct>,
## # longitude_NA <fct>, sea_temp_c_NA <fct>, air_temp_c_NA <fct>,
## # humidity_NA <fct>, wind_ew_NA <fct>, wind_ns_NA <fct>, any_missing <chr>
# Impute and track the missing values
ocean_imp_track <- bind_shadow(oceanbuoys) %>%
impute_below_all() %>%
add_label_shadow()
# Visualize the missingness in wind and air temperature,
# coloring missing air temp values with air_temp_c_NA
ggplot(ocean_imp_track,
aes(x = wind_ew, y = air_temp_c, color = air_temp_c_NA)) +
geom_point()# Visualize humidity and air temp, coloring any missing cases using the variable any_missing
ggplot(ocean_imp_track,
aes(x = humidity, y = air_temp_c, color = any_missing)) +
geom_point()# Explore the values of air_temp_c, visualizing the amount of missings with `air_temp_c_NA`.
p <- ggplot(ocean_imp_track, aes(x = air_temp_c, fill = air_temp_c_NA)) + geom_histogram()
# Expore the missings in humidity using humidity_NA
p2 <- ggplot(ocean_imp_track, aes(x = humidity, fill = humidity_NA)) + geom_histogram()
# Explore the missings in air_temp_c according to year, using `facet_wrap(~year)`.
p + facet_wrap(~year)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Explore the missings in humidity according to year, using `facet_wrap(~year)`.
p2 + facet_wrap(~year)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What makes good imputation?
To understand good imputations, let’s spend some time taking a look at bad imputations:
Imputation by the mean value of the data is particularly bad.
Imputing by the mean is bad because it artifucially increases the mean while decreasing the variance of the dataset as the following excersizes demonstrate:
aq_imp <- airquality %>%
bind_shadow(only_miss = TRUE) %>% #bind only features with missing values
add_label_shadow() %>%
impute_mean_all() %>%
ggplot( aes( x = Ozone_NA,
y = Ozone ) ) +
geom_boxplot()## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
aq_impThe median is lower for the ‘not missing’ group than the NA group.
aq_imp <- airquality %>%
bind_shadow(only_miss = TRUE) %>% #bind only features with missing values
add_label_shadow() %>%
impute_mean_all() %>%
ggplot( aes( x = Ozone,
y = Solar.R,
color = any_missing ) ) +
geom_point()## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
aq_impExploring imputations for many variables:
aq_imp <- airquality %>%
bind_shadow() %>% #bind only features with missing values
impute_mean_all()
aq_imp_long <- shadow_long( aq_imp,
Ozone,
Solar.R )
head( aq_imp_long )## # A tibble: 6 x 4
## variable value variable_NA value_NA
## <chr> <dbl> <chr> <chr>
## 1 Ozone 41 Ozone_NA !NA
## 2 Ozone 36 Ozone_NA !NA
## 3 Ozone 12 Ozone_NA !NA
## 4 Ozone 18 Ozone_NA !NA
## 5 Ozone 42.1 Ozone_NA NA
## 6 Ozone 28 Ozone_NA !NA
…and now to visualize:
ggplot( aq_imp_long,
aes( x = value,
fill = value_NA ) ) +
geom_histogram() +
facet_wrap( ~ variable )## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Impute the mean value and track the imputations
ocean_imp_mean <- bind_shadow(oceanbuoys) %>%
impute_mean_all() %>%
add_label_shadow()
# Explore the mean values in humidity in the imputed dataset
ggplot(ocean_imp_mean,
aes(x = humidity_NA, y = humidity)) +
geom_boxplot()# Explore the values in air temperature in the imputed dataset
ggplot(ocean_imp_mean,
aes(x = air_temp_c_NA, y = air_temp_c)) +
geom_boxplot()# Explore imputations in air temperature and humidity,
# coloring by the variable, any_missing
ggplot(ocean_imp_mean,
aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()# Explore imputations in air temperature and humidity,
# coloring by the variable, any_missing, and faceting by year
ggplot(ocean_imp_mean,
aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point() +
facet_wrap(~year)# Gather the imputed data
ocean_imp_mean_gather <- shadow_long(ocean_imp_mean,
humidity,
air_temp_c)
# Inspect the data
head( ocean_imp_mean_gather )## # A tibble: 6 x 4
## variable value variable_NA value_NA
## <chr> <chr> <chr> <chr>
## 1 air_temp_c 27.14999962 air_temp_c_NA !NA
## 2 air_temp_c 27.02000046 air_temp_c_NA !NA
## 3 air_temp_c 27 air_temp_c_NA !NA
## 4 air_temp_c 26.93000031 air_temp_c_NA !NA
## 5 air_temp_c 26.84000015 air_temp_c_NA !NA
## 6 air_temp_c 26.94000053 air_temp_c_NA !NA
# Explore the imputations in a histogram
ggplot( ocean_imp_mean_gather,
aes( x = as.numeric(value),
fill = value_NA ) ) +
geom_histogram() +
facet_wrap( ~variable )## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Performing imputations
Imputing using a linear regression model.
Try simputation
Using impute_lm from simputation is a powerfl way to impute values for a dataset. However, the model used for imputation needs the same regorous evaluation that statistical model approaches demand.
oceanbuoys_lmimp <- bind_shadow( oceanbuoys, only_miss = TRUE ) %>%
add_label_shadow() %>%
impute_lm( humidity ~ air_temp_c + wind_ew )
head( oceanbuoys_lmimp )## # A tibble: 6 x 12
## year latitude longitude sea_temp_c air_temp_c humidity wind_ew wind_ns
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1997 0 -110 27.6 27.1 79.6 -6.40 5.40
## 2 1997 0 -110 27.5 27.0 75.8 -5.30 5.30
## 3 1997 0 -110 27.6 27 76.5 -5.10 4.5
## 4 1997 0 -110 27.6 26.9 76.2 -4.90 2.5
## 5 1997 0 -110 27.6 26.8 76.4 -3.5 4.10
## 6 1997 0 -110 27.8 26.9 76.7 -4.40 1.60
## # … with 4 more variables: sea_temp_c_NA <fct>, air_temp_c_NA <fct>,
## # humidity_NA <fct>, any_missing <chr>
airquality_type <- airquality %>% mutate( Solar.R = as.double( Solar.R ),
Ozone = as.double( Ozone ) )
aq_imp_lm <- airquality_type %>%
bind_shadow() %>%
add_label_shadow() %>%
impute_lm( Solar.R ~ Wind + Temp + Month ) %>%
impute_lm( Ozone ~ Wind + Temp + Month )
head( aq_imp_lm )## # A tibble: 6 x 13
## Ozone Solar.R Wind Temp Month Day Ozone_NA Solar.R_NA Wind_NA Temp_NA
## <dbl> <dbl> <dbl> <int> <int> <int> <fct> <fct> <fct> <fct>
## 1 41 190 7.4 67 5 1 !NA !NA !NA !NA
## 2 36 118 8 72 5 2 !NA !NA !NA !NA
## 3 12 149 12.6 74 5 3 !NA !NA !NA !NA
## 4 18 313 11.5 62 5 4 !NA !NA !NA !NA
## 5 -9.04 138. 14.3 56 5 5 NA NA !NA !NA
## 6 28 178. 14.9 66 5 6 !NA NA !NA !NA
## # … with 3 more variables: Month_NA <fct>, Day_NA <fct>, any_missing <chr>
The bind_shadow() & add_labels_missings() methods are important for tracking which values were imputed. The imputed values for the last example are visualized below:
ggplot( aq_imp_lm,
aes( x = Solar.R,
y = Ozone,
color = any_missing ) ) +
geom_point() Build other linear model imputations and compare the results:
aq_imp_lm_small <- airquality_type %>%
bind_shadow() %>%
add_label_shadow() %>%
impute_lm( Solar.R ~ Wind + Temp ) %>%
impute_lm( Ozone ~ Wind + Temp )
aq_imp_lm_large <- airquality_type %>%
bind_shadow() %>%
add_label_shadow() %>%
impute_lm( Solar.R ~ Wind + Temp + Month + Day ) %>%
impute_lm( Ozone ~ Wind + Temp + Month + Day )Bind the models, so that we can build visualizations:
bound_models <- bind_rows( small = aq_imp_lm_small,
medium = aq_imp_lm,
large = aq_imp_lm_large,
.id = 'imp_model' )
head( bound_models )## # A tibble: 6 x 14
## imp_model Ozone Solar.R Wind Temp Month Day Ozone_NA Solar.R_NA Wind_NA
## <chr> <dbl> <dbl> <dbl> <int> <int> <int> <fct> <fct> <fct>
## 1 small 41 190 7.4 67 5 1 !NA !NA !NA
## 2 small 36 118 8 72 5 2 !NA !NA !NA
## 3 small 12 149 12.6 74 5 3 !NA !NA !NA
## 4 small 18 313 11.5 62 5 4 !NA !NA !NA
## 5 small -11.7 127. 14.3 56 5 5 NA NA !NA
## 6 small 28 160. 14.9 66 5 6 !NA NA !NA
## # … with 4 more variables: Temp_NA <fct>, Month_NA <fct>, Day_NA <fct>,
## # any_missing <chr>
bound_models_gather <- bound_models %>%
select( Ozone, Solar.R, any_missing, imp_model ) %>%
gather( key = 'variable', value = 'value', -any_missing, -imp_model )
head( bound_models_gather )## # A tibble: 6 x 4
## any_missing imp_model variable value
## <chr> <chr> <chr> <dbl>
## 1 Not Missing small Ozone 41
## 2 Not Missing small Ozone 36
## 3 Not Missing small Ozone 12
## 4 Not Missing small Ozone 18
## 5 Missing small Ozone -11.7
## 6 Missing small Ozone 28
ggplot( bound_models_gather,
aes( x = imp_model,
y = value,
color = imp_model ) ) +
geom_boxplot() +
facet_wrap( ~variable )# Impute humidity and air temperature using wind_ew and wind_ns, and track missing values
ocean_imp_lm_wind <- oceanbuoys %>%
bind_shadow() %>%
impute_lm(air_temp_c ~ wind_ew + wind_ns) %>%
impute_lm(humidity ~ wind_ew + wind_ns) %>%
add_label_shadow()
# Plot the imputed values for air_temp_c and humidity, colored by missingness
ggplot(ocean_imp_lm_wind,
aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point()# Bind the models together
bound_models <- bind_rows(mean = ocean_imp_mean,
lm_wind = ocean_imp_lm_wind,
.id = "imp_model")
# Inspect the values of air_temp and humidity as a scatter plot
ggplot(bound_models,
aes(x = air_temp_c,
y = humidity,
color = any_missing)) +
geom_point() +
facet_wrap(~imp_model)# Build a model adding year to the outcome
ocean_imp_lm_wind_year <- bind_shadow(oceanbuoys) %>%
impute_lm(air_temp_c ~ wind_ew + wind_ns + year) %>%
impute_lm(humidity ~ wind_ew + wind_ns + year) %>%
add_label_shadow()
# Bind the mean, lm_wind, and lm_wind_year models together
bound_models <- bind_rows(mean = ocean_imp_mean,
lm_wind = ocean_imp_lm_wind,
lm_wind_year = ocean_imp_lm_wind_year,
.id = "imp_model")
bound_models$imp_model_f <- factor( bound_models$imp_model, levels = c('mean','lm_wind','lm_wind_year'))
# Explore air_temp and humidity, coloring by any missings, and faceting by imputation model
ggplot(bound_models, aes(x = air_temp_c, y = humidity, color = any_missing)) +
geom_point() + facet_wrap(~imp_model_f)Evaluating imputations and models
Assessing inference from imputed data in a modelling context.
Compare the imputated data with a Complete case analysis (only uses rows with no missing values)
#Complete Case Analysis.
aq_cc <- airquality %>%
na.omit() %>%
bind_shadow() %>%
add_label_shadow()
#dim( aq_cc )
#Impute the data with a linear model
aq_imp_lm <- bind_shadow( airquality_type ) %>%
add_label_shadow() %>%
impute_lm( Ozone ~ Temp + Wind + Month + Day ) %>%
impute_lm( Solar.R ~ Temp + Wind + Month + Day )
#dim( aq_imp_lm )
#Bind the different datasets together
bound_models <- bind_rows( cc = aq_cc,
imp_lm = aq_imp_lm,
.id = 'imp_model' )
head( bound_models )## # A tibble: 6 x 14
## imp_model Ozone Solar.R Wind Temp Month Day Ozone_NA Solar.R_NA Wind_NA
## <chr> <dbl> <dbl> <dbl> <int> <int> <int> <fct> <fct> <fct>
## 1 cc 41 190 7.4 67 5 1 !NA !NA !NA
## 2 cc 36 118 8 72 5 2 !NA !NA !NA
## 3 cc 12 149 12.6 74 5 3 !NA !NA !NA
## 4 cc 18 313 11.5 62 5 4 !NA !NA !NA
## 5 cc 23 299 8.6 65 5 7 !NA !NA !NA
## 6 cc 19 99 13.8 59 5 8 !NA !NA !NA
## # … with 4 more variables: Temp_NA <fct>, Month_NA <fct>, Day_NA <fct>,
## # any_missing <chr>
Now that the data is formatted, fit a linear model to each of the datasets
model_summary <- bound_models %>%
group_by( imp_model ) %>%
nest() %>% #colapses the data such that each row represents a dataset
mutate( mod = map( data,
~lm( Temp ~ Ozone + Solar.R + Wind + Temp + Day + Month,
data = . ) ), #fit a linear model to each row
res = map( mod, residuals ), #get the residuals
pred = map( mod, predict ), #get a model prediction
tidy = map( mod, broom::tidy ) ) #get the coefficients too
model_summary## # A tibble: 2 x 6
## # Groups: imp_model [2]
## imp_model data mod res pred tidy
## <chr> <list> <list> <list> <list> <list>
## 1 cc <tibble [111 × 13]> <lm> <dbl [111]> <dbl [111]> <tibble [6 × 5]>
## 2 imp_lm <tibble [153 × 13]> <lm> <dbl [153]> <dbl [153]> <tibble [6 × 5]>
Explore the results from both approaches to fit a linear model to the data (with & w/out imputation)
model_summary %>%
select( imp_model,
tidy ) %>%
unnest(cols = c( tidy ) ) ## # A tibble: 12 x 6
## # Groups: imp_model [2]
## imp_model term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 cc (Intercept) 57.3 4.50 12.7 5.52e-23
## 2 cc Ozone 0.165 0.0239 6.92 3.66e-10
## 3 cc Solar.R 0.0108 0.00699 1.55 1.24e- 1
## 4 cc Wind -0.174 0.212 -0.821 4.13e- 1
## 5 cc Day -0.0892 0.0677 -1.32 1.91e- 1
## 6 cc Month 2.04 0.409 4.99 2.42e- 6
## 7 imp_lm (Intercept) 54.7 3.59 15.2 5.21e-32
## 8 imp_lm Ozone 0.196 0.0205 9.53 4.52e-17
## 9 imp_lm Solar.R 0.0102 0.00577 1.76 7.97e- 2
## 10 imp_lm Wind -0.00642 0.172 -0.0374 9.70e- 1
## 11 imp_lm Day -0.112 0.0538 -2.08 3.92e- 2
## 12 imp_lm Month 2.11 0.340 6.21 5.09e- 9
model_summary %>%
select( imp_model,
res ) %>%
unnest(cols = c( res ) ) %>%
ggplot( aes( x = res,
fill = imp_model ) ) +
geom_histogram( position = 'dodge' )## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Explore the predictions in the data
model_summary %>%
select( imp_model,
pred ) %>%
unnest(cols = c( pred ) ) %>%
ggplot( aes( x = pred,
fill = imp_model ) ) +
geom_histogram( position = 'dodge' )## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
and for the oceanbouys dataset
#Complete Case Analysis.
ocean_cc <- oceanbuoys %>%
na.omit() %>%
bind_shadow() %>%
add_label_shadow()
# Create an imputed dataset using a linear models
ocean_imp_lm_all <- bind_shadow(oceanbuoys) %>%
add_label_shadow() %>%
impute_lm(sea_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
impute_lm(air_temp_c ~ wind_ew + wind_ns + year + latitude + longitude) %>%
impute_lm(humidity ~ wind_ew + wind_ns + year + latitude + longitude)
# Bind the datasets
bound_models <- bind_rows(cc = ocean_cc,
imp_lm_wind = ocean_imp_lm_wind,
imp_lm_all = ocean_imp_lm_all,
.id = "imp_model")
# Look at the models
glimpse( bound_models )## Rows: 2,037
## Columns: 18
## $ imp_model <chr> "cc", "cc", "cc", "cc", "cc", "cc", "cc", "cc", "cc", "c…
## $ year <dbl> 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 1997, 19…
## $ latitude <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ longitude <dbl> -110, -110, -110, -110, -110, -110, -110, -110, -110, -1…
## $ sea_temp_c <dbl> 27.59, 27.55, 27.57, 27.62, 27.65, 27.83, 28.01, 28.04, …
## $ air_temp_c <dbl> 27.15, 27.02, 27.00, 26.93, 26.84, 26.94, 27.04, 27.11, …
## $ humidity <dbl> 79.6, 75.8, 76.5, 76.2, 76.4, 76.7, 76.5, 78.3, 78.6, 76…
## $ wind_ew <dbl> -6.4, -5.3, -5.1, -4.9, -3.5, -4.4, -2.0, -3.7, -4.2, -3…
## $ wind_ns <dbl> 5.4, 5.3, 4.5, 2.5, 4.1, 1.6, 3.5, 4.5, 5.0, 3.5, 2.9, 1…
## $ year_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ latitude_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ longitude_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ sea_temp_c_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ air_temp_c_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ humidity_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ wind_ew_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ wind_ns_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !…
## $ any_missing <chr> "Not Missing", "Not Missing", "Not Missing", "Not Missin…
# Create the model summary for each dataset
model_summary <- bound_models %>%
group_by(imp_model) %>%
nest() %>%
mutate(mod = map(data, ~lm(sea_temp_c ~ air_temp_c + humidity + year, data = .)),
res = map(mod, residuals),
pred = map(mod, predict),
tidy = map(mod, broom::tidy ))
# Explore the coefficients in the model
model_summary %>%
select(imp_model,tidy) %>%
unnest(cols = c( tidy ))## # A tibble: 12 x 6
## # Groups: imp_model [3]
## imp_model term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 cc (Intercept) -735. 45.9 -16.0 8.19e- 48
## 2 cc air_temp_c 0.864 0.0231 37.4 2.64e-154
## 3 cc humidity 0.0341 0.00390 8.74 2.69e- 17
## 4 cc year 0.369 0.0232 15.9 3.46e- 47
## 5 imp_lm_wind (Intercept) -1742. 56.1 -31.0 1.83e-135
## 6 imp_lm_wind air_temp_c 0.365 0.0279 13.1 2.73e- 35
## 7 imp_lm_wind humidity 0.0225 0.00690 3.26 1.17e- 3
## 8 imp_lm_wind year 0.880 0.0283 31.1 6.79e-136
## 9 imp_lm_all (Intercept) -697. 51.8 -13.5 5.04e- 37
## 10 imp_lm_all air_temp_c 0.890 0.0255 35.0 2.90e-158
## 11 imp_lm_all humidity 0.0127 0.00463 2.75 6.03e- 3
## 12 imp_lm_all year 0.351 0.0262 13.4 1.12e- 36
The imp_lm_all model gives the highest estimate for air_temp_c
Final Lesson
Some Other Datasets to play with
ozoneNA_url <- 'https://raw.githubusercontent.com/njtierney/user2018-missing-data-tutorial/master/ozoneNA.csv'
ecological_url <- 'https://raw.githubusercontent.com/njtierney/user2018-missing-data-tutorial/master/ecological.csv'