Step 1: Examining Data

Summary of dataset

library(hflights)
summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek        DepTime    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
##                                                                  NA's   :2905  
##     ArrTime     UniqueCarrier        FlightNum      TailNum         
##  Min.   :   1   Length:227496      Min.   :   1   Length:227496     
##  1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
##  Median :1617   Mode  :character   Median :1696   Mode  :character  
##  Mean   :1578                      Mean   :1962                     
##  3rd Qu.:1953                      3rd Qu.:2755                     
##  Max.   :2400                      Max.   :7290                     
##  NA's   :3066                                                       
##  ActualElapsedTime    AirTime         ArrDelay          DepDelay      
##  Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000  
##  1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000  
##  Median :128.0     Median :107.0   Median :  0.000   Median :  0.000  
##  Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445  
##  3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000  
##  Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000  
##  NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905     
##     Origin              Dest              Distance          TaxiIn       
##  Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000  
##  Mode  :character   Mode  :character   Median : 809.0   Median :  5.000  
##                                        Mean   : 787.8   Mean   :  6.099  
##                                        3rd Qu.:1042.0   3rd Qu.:  7.000  
##                                        Max.   :3904.0   Max.   :165.000  
##                                                         NA's   :3066     
##     TaxiOut         Cancelled       CancellationCode      Diverted       
##  Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000  
##  1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
##  Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000  
##  Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853  
##  3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
##  Max.   :163.00   Max.   :1.00000                      Max.   :1.000000  
##  NA's   :2947

Count of missing values in dataset

# Count every missing cell in the data frame. is.na() already returns a
# logical matrix, so it can be summed directly; comparing it to TRUE first
# is redundant.
sum_na <- sum(is.na(hflights))
cat("The total number of missing values in this dataset is: ", sum_na)
## The total number of missing values in this dataset is:  25755

Table for the number of rows that contain missing values

The count in the FALSE column is the number of rows that have at least one missing value. The count in the TRUE column is the number of rows that have no missing values.

table(complete.cases(hflights))
## 
##  FALSE   TRUE 
##   3622 223874

Percentage of complete and incomplete cases

# Share of incomplete (FALSE) and complete (TRUE) rows, as percentages.
100 * prop.table(table(complete.cases(hflights)))
## 
##     FALSE      TRUE 
##  1.592116 98.407884

Step 2: Following

Using vis_dat

library(visdat)
vis_dat(hflights, warn_large_data = FALSE)

Using vis_miss

vis_miss(hflights, warn_large_data = FALSE)

The variables with missing values are:

  • DepTime
  • ArrTime
  • ActualElapsedTime
  • AirTime
  • ArrDelay
  • DepDelay
  • TaxiIn
  • TaxiOut

Exploring missingness relationships

For this case, we focus only on the departure and arrival time variables

library(ggplot2)
# Scatter plot of departure time against arrival time. ggplot2 drops rows
# with a missing value and reports how many were removed.
ggplot(data = hflights, mapping = aes(DepTime, ArrTime)) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).

Adding geom_miss_point()

library(naniar)
# Same axes as the previous plot, drawn with naniar's geom_miss_point()
# in place of geom_point().
ggplot(data = hflights, mapping = aes(DepTime, ArrTime)) +
  geom_miss_point()

Arranging for months

# Departure vs. arrival time with geom_miss_point(), one panel per Month.
ggplot(data = hflights, mapping = aes(DepTime, ArrTime)) +
  geom_miss_point() +
  facet_wrap(~Month)

Adding themes to the plots

# Month-faceted geom_miss_point() plot with the dark ggplot2 theme applied.
ggplot(data = hflights, mapping = aes(DepTime, ArrTime)) +
  geom_miss_point() +
  facet_wrap(~Month) +
  theme_dark()

Visualising missings in variables
gg_miss_var(hflights)

Adding arguments for themes:

gg_miss_var(hflights) + theme_bw()

Adding arguments for labels:

gg_miss_var(hflights) + labs(y= "Look at all those missing values")

Adding facets:

gg_miss_var(hflights, facet = Month)

Tidy Missing Data: The Shadow Matrix

as_shadow(hflights)
## # A tibble: 227,496 × 21
##    Year_NA Month_NA DayofMonth_NA DayOfWeek_NA DepTime_NA ArrTime_NA
##    <fct>   <fct>    <fct>         <fct>        <fct>      <fct>     
##  1 !NA     !NA      !NA           !NA          !NA        !NA       
##  2 !NA     !NA      !NA           !NA          !NA        !NA       
##  3 !NA     !NA      !NA           !NA          !NA        !NA       
##  4 !NA     !NA      !NA           !NA          !NA        !NA       
##  5 !NA     !NA      !NA           !NA          !NA        !NA       
##  6 !NA     !NA      !NA           !NA          !NA        !NA       
##  7 !NA     !NA      !NA           !NA          !NA        !NA       
##  8 !NA     !NA      !NA           !NA          !NA        !NA       
##  9 !NA     !NA      !NA           !NA          !NA        !NA       
## 10 !NA     !NA      !NA           !NA          !NA        !NA       
## # ℹ 227,486 more rows
## # ℹ 15 more variables: UniqueCarrier_NA <fct>, FlightNum_NA <fct>,
## #   TailNum_NA <fct>, ActualElapsedTime_NA <fct>, AirTime_NA <fct>,
## #   ArrDelay_NA <fct>, DepDelay_NA <fct>, Origin_NA <fct>, Dest_NA <fct>,
## #   Distance_NA <fct>, TaxiIn_NA <fct>, TaxiOut_NA <fct>, Cancelled_NA <fct>,
## #   CancellationCode_NA <fct>, Diverted_NA <fct>
# Build two copies of hflights that carry the original columns plus one
# <name>_NA shadow column per variable (42 columns in the glimpse() output
# below). bind_shadow() and nabular() give the same result here, as the
# all.equal() check further down confirms.
hf_shadow <- bind_shadow(hflights)
hf_nab <- nabular(hflights)

library (dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(hf_shadow)
## Rows: 227,496
## Columns: 42
## $ Year                 <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek            <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime              <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime              <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier        <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum            <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum              <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime    <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime              <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay             <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay             <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin               <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest                 <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance             <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn               <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut              <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode     <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA              <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA             <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA        <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA            <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA              <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA            <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA  <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
glimpse(hf_nab)
## Rows: 227,496
## Columns: 42
## $ Year                 <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek            <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime              <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime              <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier        <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum            <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum              <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime    <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime              <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay             <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay             <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin               <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest                 <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance             <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn               <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut              <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode     <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA              <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA             <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA        <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA     <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA            <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA              <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA            <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA           <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA         <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA  <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA          <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
all.equal(hf_shadow, hf_nab)
## [1] TRUE
# Compare summary statistics of ArrTime between rows where DepTime is
# observed (!NA) and rows where it is missing (NA), grouping on the
# DepTime_NA shadow column added by bind_shadow().
# In the recorded output, the NA group has no non-missing ArrTime values
# (mean is NaN, min/max are Inf/-Inf); that is what triggers the warnings.
# NOTE(review): summarise_at() is superseded in current dplyr; consider
# summarise(across(...)) when this document is next re-run.
hflights %>%
  bind_shadow() %>%
  group_by(DepTime_NA) %>%
  summarise_at(.vars = "ArrTime",
               .funs = c("mean", "sd", "var", "min", "max"),
               na.rm = TRUE)  # na.rm is passed on to each summary function
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min = .Primitive("min")(ArrTime, na.rm = TRUE)`.
## ℹ In group 2: `DepTime_NA = NA`.
## Caused by warning:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## # A tibble: 2 × 6
##   DepTime_NA  mean    sd     var   min   max
##   <fct>      <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 !NA        1578.  472. 223163.     1  2400
## 2 NA          NaN    NA      NA    Inf  -Inf
# Density of Month, drawn separately for rows with and without a missing
# DepTime (via the DepTime_NA shadow column).
ggplot(data = hf_shadow, mapping = aes(x = Month, colour = DepTime_NA)) +
  geom_density()

# Histogram of ArrTime, filled by whether ActualElapsedTime is missing.
ggplot(bind_shadow(hflights),
       aes(x = ArrTime, fill = ActualElapsedTime_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3066 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram of ActualElapsedTime, filled by whether ArrTime is missing.
ggplot(bind_shadow(hflights),
       aes(x = ActualElapsedTime, fill = ArrTime_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3622 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Combining variables of class <shade> and <factor> was deprecated in ggplot2
## 3.4.0.
## ℹ Please ensure your variables are compatible before plotting (location:
##   `compute_panel()`)
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Visualising imputed values

# Set the CRAN mirror used by install.packages().
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install simputation only when it is not already available, so the package
# is not downloaded again every time this document is run.
if (!requireNamespace("simputation", quietly = TRUE)) {
  install.packages("simputation")
}
## 
## The downloaded binary packages are in
##  /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
library(simputation)
## 
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median
library(dplyr)

# Impute ActualElapsedTime with a linear model on DepTime and ArrTime, then
# plot the result against DepTime.
ggplot(impute_lm(hflights, ActualElapsedTime ~ DepTime + ArrTime),
       aes(x = DepTime, y = ActualElapsedTime)) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Same imputation, run on the shadow-augmented data (converted to a plain
# data frame first) so the points can be coloured by whether
# ActualElapsedTime was originally missing.
ggplot(impute_lm(as.data.frame(hf_shadow),
                 ActualElapsedTime ~ DepTime + ArrTime),
       aes(x = DepTime,
           y = ActualElapsedTime,
           colour = ActualElapsedTime_NA)) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).

Numerical Missing Values

n_distinct(hflights)
## [1] 227496
n_distinct(hflights$DepTime)
## [1] 1208
n_miss(hflights)
## [1] 25755
n_miss(hflights$DepTime)
## [1] 2905
n_complete(hflights)
## [1] 4751661
n_complete(hflights$DepTime)
## [1] 224591
prop_miss_case(hflights)
## [1] 0.01592116
pct_miss_case(hflights)
## [1] 1.592116
miss_case_summary(hflights)
## # A tibble: 227,496 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1   195      8     38.1
##  2   211      8     38.1
##  3   324      8     38.1
##  4   336      8     38.1
##  5   348      8     38.1
##  6   416      8     38.1
##  7   425      8     38.1
##  8   535      8     38.1
##  9   804      8     38.1
## 10   952      8     38.1
## # ℹ 227,486 more rows
miss_case_table(hflights)
## # A tibble: 5 × 3
##   n_miss_in_case n_cases pct_cases
##            <int>   <int>     <dbl>
## 1              0  223874   98.4   
## 2              3     556    0.244 
## 3              5     119    0.0523
## 4              6      42    0.0185
## 5              8    2905    1.28
prop_miss_var(hflights)
## [1] 0.3809524
pct_miss_var(hflights)
## [1] 38.09524
miss_var_summary(hflights)
## # A tibble: 21 × 3
##    variable          n_miss pct_miss
##    <chr>              <int>    <num>
##  1 ActualElapsedTime   3622     1.59
##  2 AirTime             3622     1.59
##  3 ArrDelay            3622     1.59
##  4 ArrTime             3066     1.35
##  5 TaxiIn              3066     1.35
##  6 TaxiOut             2947     1.30
##  7 DepTime             2905     1.28
##  8 DepDelay            2905     1.28
##  9 Year                   0     0   
## 10 Month                  0     0   
## # ℹ 11 more rows
miss_var_table(hflights)
## # A tibble: 5 × 3
##   n_miss_in_var n_vars pct_vars
##           <int>  <int>    <dbl>
## 1             0     13    61.9 
## 2          2905      2     9.52
## 3          2947      1     4.76
## 4          3066      2     9.52
## 5          3622      3    14.3
  • Could not perform miss_var_run() and miss_var_span().

Using group_by with naniar

hflights %>% miss_var_summary()
## # A tibble: 21 × 3
##    variable          n_miss pct_miss
##    <chr>              <int>    <num>
##  1 ActualElapsedTime   3622     1.59
##  2 AirTime             3622     1.59
##  3 ArrDelay            3622     1.59
##  4 ArrTime             3066     1.35
##  5 TaxiIn              3066     1.35
##  6 TaxiOut             2947     1.30
##  7 DepTime             2905     1.28
##  8 DepDelay            2905     1.28
##  9 Year                   0     0   
## 10 Month                  0     0   
## # ℹ 11 more rows
# Per-month missingness summary, restricted to the ActualElapsedTime rows.
filter(
  miss_var_summary(group_by(hflights, Month)),
  variable == "ActualElapsedTime"
)
## # A tibble: 12 × 4
## # Groups:   Month [12]
##    Month variable          n_miss pct_miss
##    <int> <chr>              <int>    <num>
##  1     1 ActualElapsedTime    245    1.30 
##  2     2 ActualElapsedTime   1153    6.73 
##  3     3 ActualElapsedTime    203    1.04 
##  4     4 ActualElapsedTime    327    1.76 
##  5     5 ActualElapsedTime    342    1.78 
##  6     6 ActualElapsedTime    240    1.22 
##  7     7 ActualElapsedTime    236    1.15 
##  8     8 ActualElapsedTime    249    1.23 
##  9     9 ActualElapsedTime    151    0.836
## 10    10 ActualElapsedTime    148    0.792
## 11    11 ActualElapsedTime     88    0.488
## 12    12 ActualElapsedTime    240    1.26

Modeling missingness

# Show the first rows after appending the per-row proportion of missing
# values (the prop_miss_all column).
head(add_prop_miss(hflights))
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011     1          1         6    1400    1500            AA       428
## 5425 2011     1          2         7    1401    1501            AA       428
## 5426 2011     1          3         1    1352    1502            AA       428
## 5427 2011     1          4         2    1403    1513            AA       428
## 5428 2011     1          5         3    1405    1507            AA       428
## 5429 2011     1          6         4    1359    1503            AA       428
##      TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424  N576AA                60      40      -10        0    IAH  DFW      224
## 5425  N557AA                60      45       -9        1    IAH  DFW      224
## 5426  N541AA                70      48       -8       -8    IAH  DFW      224
## 5427  N403AA                70      39        3        3    IAH  DFW      224
## 5428  N492AA                62      44       -3        5    IAH  DFW      224
## 5429  N262AA                64      45       -7       -1    IAH  DFW      224
##      TaxiIn TaxiOut Cancelled CancellationCode Diverted prop_miss_all
## 5424      7      13         0                         0             0
## 5425      6       9         0                         0             0
## 5426      5      17         0                         0             0
## 5427      9      22         0                         0             0
## 5428      9       9         0                         0             0
## 5429      6      13         0                         0             0
# Install rpart only when it is not already available, instead of
# re-downloading it on every run.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
## 
## The downloaded binary packages are in
##  /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
# Install rpart.plot only when it is not already available, instead of
# re-downloading it on every run.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
## 
## The downloaded binary packages are in
##  /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
library(rpart)
library(rpart.plot)
library(dplyr) 
library(naniar)
library(hflights)
# Model the per-row proportion of missing values with a regression tree and
# plot it. add_prop_miss() appends the prop_miss_all column used as the
# response. model = TRUE keeps the model frame in the fitted object so that
# prp() can retrieve the data; without it prp() warned that it could not
# determine roundint and is.binary for the variables.
hflights %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = ., model = TRUE) %>%
  prp(type = 4, extra = 101, prefix = "prop. miss = ")