#library(hflights)
#summary(hflights)
#sum(is.na(hflights)==TRUE)
#table(complete.cases(hflights))
#prop.table(table(complete.cases(hflights))) * 100
#sort(sapply(hflights, function(x) sum(is.na(x))))

# Two quick visual overviews of airquality from the visdat package
library(visdat)

# Whole-data-frame overview
vis_dat(x = airquality)

# Overview restricted to missingness
vis_miss(x = airquality)

# Plain scatter plot of Ozone against Solar.R.
# geom_point() drops the 42 rows that have a missing value (see warning below).
library(ggplot2)
ggplot(data = airquality, mapping = aes(Solar.R, Ozone)) +
  geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

# geom_point() vs geom_miss_point(), part 1: the geom_point() version,
# repeated here so it can be compared with the geom_miss_point() plot
ggplot(data = airquality, mapping = aes(Solar.R, Ozone)) +
  geom_point()
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_point()`).

library(naniar)

# Same Ozone ~ Solar.R scatter, drawn with naniar's geom_miss_point()
# in place of geom_point()
ggplot(data = airquality, mapping = aes(Solar.R, Ozone)) +
  geom_miss_point() +
  facet_wrap(~Month) + # one panel per month
  theme_dark() # swap the default theme for the dark one

# gg_miss_var(): another way to visualise missingness, one entry per variable
gg_miss_var(x = airquality) +
  theme_bw()

# Faceting is requested through gg_miss_var()'s own `facet` argument,
# not by adding facet_wrap() afterwards
gg_miss_var(x = airquality, facet = Month) +
  labs(y = "Look at all the missing ones")

# Shadow matrix: one <column>_NA factor per original column, flagging every
# cell as NA or !NA
as_shadow(data = airquality)
## # A tibble: 153 × 6
##    Ozone_NA Solar.R_NA Wind_NA Temp_NA Month_NA Day_NA
##    <fct>    <fct>      <fct>   <fct>   <fct>    <fct> 
##  1 !NA      !NA        !NA     !NA     !NA      !NA   
##  2 !NA      !NA        !NA     !NA     !NA      !NA   
##  3 !NA      !NA        !NA     !NA     !NA      !NA   
##  4 !NA      !NA        !NA     !NA     !NA      !NA   
##  5 NA       NA         !NA     !NA     !NA      !NA   
##  6 !NA      NA         !NA     !NA     !NA      !NA   
##  7 !NA      !NA        !NA     !NA     !NA      !NA   
##  8 !NA      !NA        !NA     !NA     !NA      !NA   
##  9 !NA      !NA        !NA     !NA     !NA      !NA   
## 10 NA       !NA        !NA     !NA     !NA      !NA   
## # ℹ 143 more rows
# bind_shadow() and nabular() each return the data frame with its shadow
# (<column>_NA) columns attached; keep both results for comparison
aq_shadow <- bind_shadow(data = airquality)
aq_nab <- nabular(data = airquality)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Inspect both shadow-bound data frames, then check whether they are equal
aq_shadow %>% glimpse()
## Rows: 153
## Columns: 12
## $ Ozone      <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R    <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind       <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp       <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA   <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# Same inspection for the nabular() result
aq_nab %>% glimpse()
## Rows: 153
## Columns: 12
## $ Ozone      <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1…
## $ Solar.R    <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,…
## $ Wind       <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9…
## $ Temp       <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,…
## $ Month      <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Ozone_NA   <fct> !NA, !NA, !NA, !NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !NA, !…
## $ Solar.R_NA <fct> !NA, !NA, !NA, !NA, NA, NA, !NA, !NA, !NA, !NA, NA, !NA, !N…
## $ Wind_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Temp_NA    <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA   <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Day_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# TRUE here means bind_shadow() and nabular() produced matching objects
all.equal(target = aq_shadow, current = aq_nab)
## [1] TRUE
# Bind shadow in practice: summarise Solar.R separately for the rows where
# Ozone is present (!NA) and where it is missing (NA).
# summarise(across()) replaces the superseded summarise_at(); `.names = "{.fn}"`
# keeps the result columns named mean / sd / var / min / max.
airquality %>%
  bind_shadow() %>%
  group_by(Ozone_NA) %>%
  summarise(
    across(
      Solar.R,
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE),
        var = ~ var(.x, na.rm = TRUE),
        min = ~ min(.x, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## # A tibble: 2 × 6
##   Ozone_NA  mean    sd   var   min   max
##   <fct>    <dbl> <dbl> <dbl> <int> <int>
## 1 !NA       185.  91.2 8309.     7   334
## 2 NA        190.  87.7 7690.    31   332
# Temperature density, one curve per Ozone missingness level (NA / !NA)
ggplot(data = aq_shadow, mapping = aes(Temp, colour = Ozone_NA)) +
  geom_density()

# Air temperature and humidity based on the missingness of each.
# First: air temperature histogram, filled by whether humidity is missing.
# 81 rows are removed by stat_bin() (see the warning below).
oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = air_temp_c,
             fill = humidity_NA)) +
  # bins is set explicitly (30 is the count stat_bin() would pick anyway) so
  # the binning is deliberate and the "Pick better value" message goes away
  geom_histogram(bins = 30)
## Warning: Removed 81 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Humidity histogram, filled by whether air temperature is missing.
# 93 rows are removed by stat_bin() (see the warning below).
oceanbuoys %>%
  bind_shadow() %>%
  ggplot(aes(x = humidity,
             fill = air_temp_c_NA)) +
  # bins is set explicitly (30 is the count stat_bin() would pick anyway) so
  # the binning is deliberate and the "Pick better value" message goes away
  geom_histogram(bins = 30)
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Visualising imputations
library(simputation)
## 
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median
library(dplyr)

# Run impute_lm() for Ozone with Temp and Wind as predictors, then plot
# Ozone against Temp
airquality %>%
  impute_lm(formula = Ozone ~ Temp + Wind) %>%
  ggplot(mapping = aes(Temp, Ozone)) +
  geom_point()

# Same plot, but starting from the shadow-bound data so the points whose
# Ozone was originally missing can be coloured via Ozone_NA
aq_shadow %>%
  # NOTE(review): presumably impute_lm() needs a plain data.frame rather than
  # the object bind_shadow() returns -- confirm
  as.data.frame() %>%
  impute_lm(formula = Ozone ~ Temp + Wind) %>%
  ggplot(mapping = aes(Temp, Ozone, colour = Ozone_NA)) +
  geom_point()

# Distinct rows in the whole data frame: 153
airquality %>% dplyr::n_distinct()
## [1] 153
# Distinct values in the Ozone column: 68
airquality[["Ozone"]] %>% dplyr::n_distinct()
## [1] 68
# Missing values across the whole data frame: 44
airquality %>% n_miss()
## [1] 44
# Missing values in Ozone alone: 37
airquality[["Ozone"]] %>% n_miss()
## [1] 37
# Non-missing values across the whole data frame: 874
airquality %>% n_complete()
## [1] 874
# Non-missing values in Ozone alone: 116
airquality[["Ozone"]] %>% n_complete()
## [1] 116
# Share of cases (rows) with a missing value, as a proportion: 0.2745098
airquality %>% prop_miss_case()
## [1] 0.2745098
# The same share expressed as a percentage: 27.45098
airquality %>% pct_miss_case()
## [1] 27.45098
# miss_case_summary(): one row per case with its missing count and percentage
airquality %>% miss_case_summary()
## # A tibble: 153 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     5      2     33.3
##  2    27      2     33.3
##  3     6      1     16.7
##  4    10      1     16.7
##  5    11      1     16.7
##  6    25      1     16.7
##  7    26      1     16.7
##  8    32      1     16.7
##  9    33      1     16.7
## 10    34      1     16.7
## # ℹ 143 more rows
# miss_case_table(): how many cases have 0, 1, 2, ... missing values
airquality %>% miss_case_table()
## # A tibble: 3 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0     111     72.5 
## 2              1      40     26.1 
## 3              2       2      1.31
# Share of variables with a missing value: proportion, then percentage
airquality %>% prop_miss_var()
## [1] 0.3333333
airquality %>% pct_miss_var()
## [1] 33.33333
# miss_var_summary(): one row per variable with its missing count and percentage
airquality %>% miss_var_summary()
## # A tibble: 6 × 3
##   variable n_miss pct_miss
##   <chr>     <int>    <num>
## 1 Ozone        37    24.2 
## 2 Solar.R       7     4.58
## 3 Wind          0     0   
## 4 Temp          0     0   
## 5 Month         0     0   
## 6 Day           0     0
# miss_var_table(): how many variables share each missing count
airquality %>% miss_var_table()
## # A tibble: 3 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0      4     66.7
## 2             7      1     16.7
## 3            37      1     16.7
# miss_var_run(): lengths of the alternating complete / missing runs in the
# chosen variable (hourly_counts)
pedestrian %>% miss_var_run(var = hourly_counts)
## # A tibble: 35 × 2
##    run_length is_na   
##         <int> <chr>   
##  1       6628 complete
##  2          1 missing 
##  3       5250 complete
##  4        624 missing 
##  5       3652 complete
##  6          1 missing 
##  7       1290 complete
##  8        744 missing 
##  9       7420 complete
## 10          1 missing 
## # ℹ 25 more rows
# miss_var_span(): missing / complete counts of hourly_counts within
# consecutive spans of 100 rows
pedestrian %>% miss_var_span(var = hourly_counts, span_every = 100)
## # A tibble: 377 × 6
##    span_counter n_miss n_complete prop_miss prop_complete n_in_span
##           <int>  <int>      <int>     <dbl>         <dbl>     <int>
##  1            1      0        100         0             1       100
##  2            2      0        100         0             1       100
##  3            3      0        100         0             1       100
##  4            4      0        100         0             1       100
##  5            5      0        100         0             1       100
##  6            6      0        100         0             1       100
##  7            7      0        100         0             1       100
##  8            8      0        100         0             1       100
##  9            9      0        100         0             1       100
## 10           10      0        100         0             1       100
## # ℹ 367 more rows
# group_by with naniar -- start with the ungrouped per-variable summary
miss_var_summary(pedestrian)
## # A tibble: 9 × 3
##   variable      n_miss pct_miss
##   <chr>          <int>    <num>
## 1 hourly_counts   2548     6.76
## 2 date_time          0     0   
## 3 year               0     0   
## 4 month              0     0   
## 5 month_day          0     0   
## 6 week_day           0     0   
## 7 hour               0     0   
## 8 sensor_id          0     0   
## 9 sensor_name        0     0
# Grouped version: summarise missingness within each month, then keep only
# the hourly_counts rows
group_by(pedestrian, month) %>%
  miss_var_summary() %>%
  filter(variable == "hourly_counts")
## # A tibble: 12 × 4
## # Groups:   month [12]
##    month     variable      n_miss pct_miss
##    <ord>     <chr>          <int>    <num>
##  1 January   hourly_counts      0     0   
##  2 February  hourly_counts      0     0   
##  3 March     hourly_counts      0     0   
##  4 April     hourly_counts    552    19.2 
##  5 May       hourly_counts     72     2.42
##  6 June      hourly_counts      0     0   
##  7 July      hourly_counts      0     0   
##  8 August    hourly_counts    408    13.7 
##  9 September hourly_counts      0     0   
## 10 October   hourly_counts    412     7.44
## 11 November  hourly_counts    888    30.8 
## 12 December  hourly_counts    216     7.26
# add_prop_miss() appends a prop_miss_all column; show the first rows
head(add_prop_miss(airquality))
##   Ozone Solar.R Wind Temp Month Day prop_miss_all
## 1    41     190  7.4   67     5   1     0.0000000
## 2    36     118  8.0   72     5   2     0.0000000
## 3    12     149 12.6   74     5   3     0.0000000
## 4    18     313 11.5   62     5   4     0.0000000
## 5    NA      NA 14.3   56     5   5     0.3333333
## 6    28      NA 14.9   66     5   6     0.1666667
library(rpart)

#airquality %>%
  #add_prop_miss() %>%
  #rpart(prop_miss_all ~ ., data = .) %>%
  #prp(type = 4, extra = 101, prefix = "Prop. Miss = ")

#knitr::spin("Week 2 Coding Practice Part 1.R")