Evan Klein / D590 / Fall 2025
library(hflights)
# Column-wise summary of the hflights data frame: quartiles, mean and NA
# counts for numeric columns; length and class for character columns.
summary(hflights)
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000 Min. : 1
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.:1021
## Median :2011 Median : 7.000 Median :16.00 Median :4.000 Median :1416
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948 Mean :1396
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:1801
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000 Max. :2400
## NA's :2905
## ArrTime UniqueCarrier FlightNum TailNum
## Min. : 1 Length:227496 Min. : 1 Length:227496
## 1st Qu.:1215 Class :character 1st Qu.: 855 Class :character
## Median :1617 Mode :character Median :1696 Mode :character
## Mean :1578 Mean :1962
## 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :7290
## NA's :3066
## ActualElapsedTime AirTime ArrDelay DepDelay
## Min. : 34.0 Min. : 11.0 Min. :-70.000 Min. :-33.000
## 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000 1st Qu.: -3.000
## Median :128.0 Median :107.0 Median : 0.000 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094 Mean : 9.445
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000 3rd Qu.: 9.000
## Max. :575.0 Max. :549.0 Max. :978.000 Max. :981.000
## NA's :3622 NA's :3622 NA's :3622 NA's :2905
## Origin Dest Distance TaxiIn
## Length:227496 Length:227496 Min. : 79.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 376.0 1st Qu.: 4.000
## Mode :character Mode :character Median : 809.0 Median : 5.000
## Mean : 787.8 Mean : 6.099
## 3rd Qu.:1042.0 3rd Qu.: 7.000
## Max. :3904.0 Max. :165.000
## NA's :3066
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 1.00 Min. :0.00000 Length:227496 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000
## Median : 14.00 Median :0.00000 Mode :character Median :0.000000
## Mean : 15.09 Mean :0.01307 Mean :0.002853
## 3rd Qu.: 18.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :163.00 Max. :1.00000 Max. :1.000000
## NA's :2947
# Total number of missing cells across the whole data frame. is.na() already
# returns a logical matrix, so it can be summed directly.
sum(is.na(hflights))
## [1] 25755
# Count rows containing at least one NA (FALSE) versus fully observed rows
# (TRUE).
table(complete.cases(hflights))
##
## FALSE TRUE
## 3622 223874
# The same incomplete/complete split, expressed as a percentage of all rows.
100 * prop.table(table(complete.cases(hflights)))
##
## FALSE TRUE
## 1.592116 98.407884
# Missing-value count for every column, sorted from fewest to most.
# vapply() pins the per-column result to a single integer, so the return type
# cannot silently change the way it can with sapply().
sort(vapply(hflights, function(col) sum(is.na(col)), integer(1)))
## Year Month DayofMonth DayOfWeek
## 0 0 0 0
## UniqueCarrier FlightNum TailNum Origin
## 0 0 0 0
## Dest Distance Cancelled CancellationCode
## 0 0 0 0
## Diverted DepTime DepDelay TaxiOut
## 0 2905 2905 2947
## ArrTime TaxiIn ActualElapsedTime AirTime
## 3066 3066 3622 3622
## ArrDelay
## 3622
library(visdat)
# visdat overview plots of the data frame: vis_dat() and vis_miss().
# warn_large_data = FALSE is passed to both calls; presumably this bypasses
# visdat's guard for large inputs (227,496 rows here) -- confirm in the docs.
vis_dat(hflights, warn_large_data = FALSE)
vis_miss(hflights, warn_large_data = FALSE)
It looks like we have a small amount of missing data in a handful of columns. Additionally, the missingness does not seem to be random: there seem to be blocks in our dataset where data is missing across multiple columns at once. The columns with missing data are:
# Per-column missing-value counts (type-stable: one integer per column),
# followed by the names of the columns that have at least one NA.
vals <- vapply(hflights, function(col) sum(is.na(col)), integer(1))
names(vals)[vals > 0]
## [1] "DepTime" "ArrTime" "ActualElapsedTime"
## [4] "AirTime" "ArrDelay" "DepDelay"
## [7] "TaxiIn" "TaxiOut"
library(naniar)
library(ggplot2)
# Scatter of DepTime against ArrTime, one panel per Month, drawn with naniar's
# geom_miss_point() instead of geom_point() -- presumably so that rows with a
# missing coordinate still appear in the plot; confirm against the naniar docs.
ggplot(hflights, aes(x = DepTime, y = ArrTime)) +
geom_miss_point() +
facet_wrap(~Month) +
theme_dark()
# naniar missingness-per-variable plots: first for the full data set, then
# faceted by Month.
gg_miss_var(hflights)
gg_miss_var(hflights, facet = Month)
# Shadow matrix: one <column>_NA factor per original column (21 in total),
# whose printed values here are all "!NA".
as_shadow(hflights)
## # A tibble: 227,496 × 21
## Year_NA Month_NA DayofMonth_NA DayOfWeek_NA DepTime_NA ArrTime_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA !NA
## 10 !NA !NA !NA !NA !NA !NA
## # ℹ 227,486 more rows
## # ℹ 15 more variables: UniqueCarrier_NA <fct>, FlightNum_NA <fct>,
## # TailNum_NA <fct>, ActualElapsedTime_NA <fct>, AirTime_NA <fct>,
## # ArrDelay_NA <fct>, DepDelay_NA <fct>, Origin_NA <fct>, Dest_NA <fct>,
## # Distance_NA <fct>, TaxiIn_NA <fct>, TaxiOut_NA <fct>, Cancelled_NA <fct>,
## # CancellationCode_NA <fct>, Diverted_NA <fct>
# Build the data-plus-shadow table two ways (bind_shadow() and nabular()) so
# the two results can be inspected and compared.
hflights_shadow <- bind_shadow(hflights)
hflights_nab <- nabular(hflights)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Inspect the bind_shadow() result: the 21 original columns followed by their
# 21 <column>_NA shadow columns.
glimpse(hflights_shadow)
## Rows: 227,496
## Columns: 42
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# Inspect the nabular() result; it has the same 42-column layout.
glimpse(hflights_nab)
## Rows: 227,496
## Columns: 42
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# Check that bind_shadow() and nabular() produced equivalent objects.
all.equal(hflights_shadow, hflights_nab)
## [1] TRUE
# Summary statistics of Month, split by whether ArrDelay is missing, to see
# whether flights without a recorded arrival delay cluster in particular
# months. summarise(across()) replaces the superseded summarise_at(); each
# function sets na.rm itself instead of relying on arguments forwarded
# through `...`, and .names = "{.fn}" keeps the output columns named
# mean / sd / var / min / max.
hflights %>%
  bind_shadow() %>%
  group_by(ArrDelay_NA) %>%
  summarise(
    across(
      Month,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        var = function(x) var(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## # A tibble: 2 × 6
## ArrDelay_NA mean sd var min max
## <fct> <dbl> <dbl> <dbl> <int> <int>
## 1 !NA 6.54 3.41 11.7 1 12
## 2 NA 4.96 3.32 11.0 1 12
# Density of Distance, with one curve for flights whose ArrDelay is missing
# and one for flights where it is recorded.
ggplot(hflights_shadow, aes(x = Distance, colour = ArrDelay_NA)) +
geom_density()
# Histogram of Distance, filled by whether DepDelay is missing.
# The bin count is stated explicitly (30, the value stat_bin() was already
# falling back to) so the plot is unchanged but no longer emits the
# "Pick better value with `binwidth`" message.
hflights %>%
  bind_shadow() %>%
  ggplot(aes(x = Distance, fill = DepDelay_NA)) +
  geom_histogram(bins = 30)
library(simputation)
##
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
library(dplyr)
# Impute missing DepDelay values from a linear model on Month and plot the
# result. Year is left out of the formula: every row in this data set is
# 2011, so it is collinear with the intercept and would only make the fit
# rank-deficient without changing any imputed value.
hflights %>%
  impute_lm(DepDelay ~ Month) %>%
  ggplot(aes(x = Month, y = DepDelay)) +
  geom_point()
# Same imputation on the shadow-augmented data, so the points can be coloured
# by whether DepDelay was missing before imputation (DepDelay_NA).
# NOTE(review): as.data.frame() presumably strips the shadow/tibble classes so
# that impute_lm() accepts the input -- confirm whether it is still needed.
hflights_shadow %>%
as.data.frame() %>%
impute_lm(DepDelay ~ Month) %>%
ggplot(aes(x = Month, y = DepDelay, colour = DepDelay_NA)) +
geom_point()
# naniar summaries
# Number of distinct rows in the data frame, then distinct values of Year.
dplyr::n_distinct(hflights)
## [1] 227496
dplyr::n_distinct(hflights$Year)
## [1] 1
# Total number of missing cells in the whole table.
n_miss(hflights)
## [1] 25755
# Number of non-missing cells, for the whole table and for Year alone.
n_complete(hflights)
## [1] 4751661
n_complete(hflights$Year)
## [1] 227496
# Proportion of rows that contain at least one missing value.
prop_miss_case(hflights)
## [1] 0.01592116
# Per-row missingness: number and percentage of missing values in each case.
# As printed, the rows with the most missing values come first.
miss_case_summary(hflights)
## # A tibble: 227,496 × 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 195 8 38.1
## 2 211 8 38.1
## 3 324 8 38.1
## 4 336 8 38.1
## 5 348 8 38.1
## 6 416 8 38.1
## 7 425 8 38.1
## 8 535 8 38.1
## 9 804 8 38.1
## 10 952 8 38.1
## # ℹ 227,486 more rows
# Tabulate rows by how many missing values they contain.
miss_case_table(hflights)
## # A tibble: 5 × 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 223874 98.4
## 2 3 556 0.244
## 3 5 119 0.0523
## 4 6 42 0.0185
## 5 8 2905 1.28
# Tabulate variables by how many missing values they contain.
miss_var_table(hflights)
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 13 61.9
## 2 2905 2 9.52
## 3 2947 1 4.76
## 4 3066 2 9.52
## 5 3622 3 14.3
# Runs of consecutive complete / missing values in Month. Month has no NAs,
# so the result is a single complete run covering every row.
miss_var_run(hflights, Month)
## # A tibble: 1 × 2
## run_length is_na
## <int> <chr>
## 1 227496 complete
# Missingness of DepDelay within consecutive spans of 10,000 rows.
miss_var_span(hflights, DepDelay, span_every = 10000)
## # A tibble: 23 × 6
## span_counter n_miss n_complete prop_miss prop_complete n_in_span
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 133 9867 0.0133 0.987 10000
## 2 2 82 9918 0.0082 0.992 10000
## 3 3 525 9475 0.0525 0.948 10000
## 4 4 567 9433 0.0567 0.943 10000
## 5 5 83 9917 0.0083 0.992 10000
## 6 6 87 9913 0.0087 0.991 10000
## 7 7 116 9884 0.0116 0.988 10000
## 8 8 131 9869 0.0131 0.987 10000
## 9 9 166 9834 0.0166 0.983 10000
## 10 10 96 9904 0.0096 0.990 10000
## # ℹ 13 more rows
# Per-variable missing count and percentage. As printed, the variables with
# the most missing values come first.
hflights %>% miss_var_summary()
## # A tibble: 21 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 ActualElapsedTime 3622 1.59
## 2 AirTime 3622 1.59
## 3 ArrDelay 3622 1.59
## 4 ArrTime 3066 1.35
## 5 TaxiIn 3066 1.35
## 6 TaxiOut 2947 1.30
## 7 DepTime 2905 1.28
## 8 DepDelay 2905 1.28
## 9 Year 0 0
## 10 Month 0 0
## # ℹ 11 more rows
# The same per-variable missingness summary, computed separately for each
# Month.
hflights %>%
group_by(Month) %>%
miss_var_summary()
## # A tibble: 240 × 4
## # Groups: Month [12]
## Month variable n_miss pct_miss
## <int> <chr> <int> <num>
## 1 1 ActualElapsedTime 245 1.30
## 2 1 AirTime 245 1.30
## 3 1 ArrDelay 245 1.30
## 4 1 ArrTime 219 1.16
## 5 1 TaxiIn 219 1.16
## 6 1 TaxiOut 206 1.09
## 7 1 DepTime 201 1.06
## 8 1 DepDelay 201 1.06
## 9 1 Year 0 0
## 10 1 DayofMonth 0 0
## # ℹ 230 more rows
# Append the prop_miss_all column (presumably the per-row proportion of
# missing values -- confirm in the naniar docs) and show the first rows.
hflights %>%
add_prop_miss() %>%
head()
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011 1 1 6 1400 1500 AA 428
## 5425 2011 1 2 7 1401 1501 AA 428
## 5426 2011 1 3 1 1352 1502 AA 428
## 5427 2011 1 4 2 1403 1513 AA 428
## 5428 2011 1 5 3 1405 1507 AA 428
## 5429 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424 N576AA 60 40 -10 0 IAH DFW 224
## 5425 N557AA 60 45 -9 1 IAH DFW 224
## 5426 N541AA 70 48 -8 -8 IAH DFW 224
## 5427 N403AA 70 39 3 3 IAH DFW 224
## 5428 N492AA 62 44 -3 5 IAH DFW 224
## 5429 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted prop_miss_all
## 5424 7 13 0 0 0
## 5425 6 9 0 0 0
## 5426 5 17 0 0 0
## 5427 9 22 0 0 0
## 5428 9 9 0 0 0
## 5429 6 13 0 0 0
library(rpart)
library(rpart.plot)
# Fit a regression tree that predicts each row's proportion of missing
# values (prop_miss_all) from all other columns, then plot it.
# The model is fitted with `data = .` inside a pipe, so prp() cannot retrieve
# the training data to decide whether split values should be rounded to
# integers. roundint = FALSE states that choice explicitly, which keeps the
# plot as it was and avoids the "Cannot retrieve the data used to build the
# model" warning.
hflights %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = .) %>%
  prp(type = 4, extra = 101, prefix = "Prop. Miss = ", roundint = FALSE)