# Attach the hflights data set and print summary statistics for every column.
library(hflights)
summary(object = hflights)
## Year Month DayofMonth DayOfWeek DepTime
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000 Min. : 1
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000 1st Qu.:1021
## Median :2011 Median : 7.000 Median :16.00 Median :4.000 Median :1416
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948 Mean :1396
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000 3rd Qu.:1801
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000 Max. :2400
## NA's :2905
## ArrTime UniqueCarrier FlightNum TailNum
## Min. : 1 Length:227496 Min. : 1 Length:227496
## 1st Qu.:1215 Class :character 1st Qu.: 855 Class :character
## Median :1617 Mode :character Median :1696 Mode :character
## Mean :1578 Mean :1962
## 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :7290
## NA's :3066
## ActualElapsedTime AirTime ArrDelay DepDelay
## Min. : 34.0 Min. : 11.0 Min. :-70.000 Min. :-33.000
## 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000 1st Qu.: -3.000
## Median :128.0 Median :107.0 Median : 0.000 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094 Mean : 9.445
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000 3rd Qu.: 9.000
## Max. :575.0 Max. :549.0 Max. :978.000 Max. :981.000
## NA's :3622 NA's :3622 NA's :3622 NA's :2905
## Origin Dest Distance TaxiIn
## Length:227496 Length:227496 Min. : 79.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 376.0 1st Qu.: 4.000
## Mode :character Mode :character Median : 809.0 Median : 5.000
## Mean : 787.8 Mean : 6.099
## 3rd Qu.:1042.0 3rd Qu.: 7.000
## Max. :3904.0 Max. :165.000
## NA's :3066
## TaxiOut Cancelled CancellationCode Diverted
## Min. : 1.00 Min. :0.00000 Length:227496 Min. :0.000000
## 1st Qu.: 10.00 1st Qu.:0.00000 Class :character 1st Qu.:0.000000
## Median : 14.00 Median :0.00000 Mode :character Median :0.000000
## Mean : 15.09 Mean :0.01307 Mean :0.002853
## 3rd Qu.: 18.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :163.00 Max. :1.00000 Max. :1.000000
## NA's :2947
# Count every missing cell in the data set. is.na() already returns a logical
# matrix, so it can be summed directly without comparing it against TRUE.
sum_na <- sum(is.na(hflights))
# sep = "" avoids a doubled space after the colon; "\n" terminates the line.
cat("The total number of missing values in this dataset is: ", sum_na, "\n", sep = "")
## The total number of missing values in this dataset is: 25755
The FALSE column gives the number of rows that have at least one missing value; the TRUE column gives the number of rows that have no missing values.
# Tabulate complete (TRUE) versus incomplete (FALSE) rows once and reuse the
# table, instead of scanning every row with complete.cases() twice.
complete_tab <- table(complete.cases(hflights))
complete_tab
##
## FALSE TRUE
## 3622 223874
# The same table expressed as a percentage of all rows.
prop.table(complete_tab) * 100
##
## FALSE TRUE
## 1.592116 98.407884
library(visdat)
# Visual overview of the whole data frame, then of its missing values only.
# warn_large_data = FALSE lets visdat draw the plots despite the data size.
vis_dat(x = hflights, warn_large_data = FALSE)
vis_miss(x = hflights, warn_large_data = FALSE)
The variables with missing values are: DepTime, ArrTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay, TaxiIn and TaxiOut.
For this case, we focus only on the departure and arrival time variables.
library(ggplot2)
# Scatter plot of arrival time against departure time. geom_point() drops the
# rows in which either value is missing (see the warning printed below).
ggplot(data = hflights, mapping = aes(x = DepTime, y = ArrTime)) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).
Adding geom_miss_point()
library(naniar)
# Same scatter plot, but drawn with naniar's geom_miss_point() so that rows
# with a missing DepTime or ArrTime are displayed instead of being dropped.
ggplot(data = hflights, mapping = aes(x = DepTime, y = ArrTime)) +
  geom_miss_point()
Faceting the plot by month
# Missingness scatter plot of DepTime against ArrTime, one panel per Month.
ggplot(data = hflights, mapping = aes(x = DepTime, y = ArrTime)) +
  geom_miss_point() +
  facet_wrap(vars(Month))
Adding themes to the plots
# The faceted missingness scatter plot again, drawn with ggplot2's dark theme.
ggplot(data = hflights, mapping = aes(x = DepTime, y = ArrTime)) +
  geom_miss_point() +
  facet_wrap(vars(Month)) +
  theme_dark()
# Plot the number of missing values in each variable.
gg_miss_var(x = hflights)
Adding arguments for themes:
# gg_miss_var() returns a ggplot object, so a ggplot2 theme can be added to it.
gg_miss_var(x = hflights) +
  theme_bw()
Adding arguments for labels:
# Override the y-axis label of the missing-values-per-variable plot.
gg_miss_var(x = hflights) +
  labs(y = "Look at all those missing values")
Adding facets:
# Missing values per variable, with one panel for each Month.
gg_miss_var(x = hflights, facet = Month)
# Shadow matrix: one `<name>_NA` factor per column, holding NA or !NA for
# every cell (printed below).
as_shadow(data = hflights)
## # A tibble: 227,496 × 21
## Year_NA Month_NA DayofMonth_NA DayOfWeek_NA DepTime_NA ArrTime_NA
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 !NA !NA !NA !NA !NA !NA
## 2 !NA !NA !NA !NA !NA !NA
## 3 !NA !NA !NA !NA !NA !NA
## 4 !NA !NA !NA !NA !NA !NA
## 5 !NA !NA !NA !NA !NA !NA
## 6 !NA !NA !NA !NA !NA !NA
## 7 !NA !NA !NA !NA !NA !NA
## 8 !NA !NA !NA !NA !NA !NA
## 9 !NA !NA !NA !NA !NA !NA
## 10 !NA !NA !NA !NA !NA !NA
## # ℹ 227,486 more rows
## # ℹ 15 more variables: UniqueCarrier_NA <fct>, FlightNum_NA <fct>,
## # TailNum_NA <fct>, ActualElapsedTime_NA <fct>, AirTime_NA <fct>,
## # ArrDelay_NA <fct>, DepDelay_NA <fct>, Origin_NA <fct>, Dest_NA <fct>,
## # Distance_NA <fct>, TaxiIn_NA <fct>, TaxiOut_NA <fct>, Cancelled_NA <fct>,
## # CancellationCode_NA <fct>, Diverted_NA <fct>
# Two ways of attaching the shadow (`_NA`) columns to the original data; the
# all.equal() call further down confirms that both give the same result.
hf_shadow <- bind_shadow(data = hflights)
hf_nab <- nabular(data = hflights)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact column-by-column view: 21 original columns followed by 21 `_NA` columns.
glimpse(x = hf_shadow)
## Rows: 227,496
## Columns: 42
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# Same compact view for the nabular() result, for comparison with hf_shadow.
glimpse(x = hf_nab)
## Rows: 227,496
## Columns: 42
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428,…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA",…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 6…
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 4…
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -…
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -…
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", …
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", …
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224,…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13,…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Year_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Month_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayofMonth_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DayOfWeek_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ UniqueCarrier_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ FlightNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TailNum_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ActualElapsedTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ AirTime_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ ArrDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ DepDelay_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Origin_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Dest_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Distance_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiIn_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ TaxiOut_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Cancelled_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ CancellationCode_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
## $ Diverted_NA <fct> !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA, !NA,…
# Check that bind_shadow() and nabular() produced equal objects.
all.equal(target = hf_shadow, current = hf_nab)
## [1] TRUE
# Summarise ArrTime separately for rows where DepTime is observed (!NA) and
# where it is missing (NA). summarise(across()) replaces the superseded
# summarise_at(); .names = "{.fn}" keeps the output columns named
# mean / sd / var / min / max exactly as before.
# The NA group has no observed ArrTime values (see the output below), so
# min() and max() still warn there and return Inf / -Inf.
hflights %>%
  bind_shadow() %>%
  group_by(DepTime_NA) %>%
  summarise(
    across(
      ArrTime,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        var = function(x) var(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min = .Primitive("min")(ArrTime, na.rm = TRUE)`.
## ℹ In group 2: `DepTime_NA = NA`.
## Caused by warning:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
## # A tibble: 2 × 6
## DepTime_NA mean sd var min max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 !NA 1578. 472. 223163. 1 2400
## 2 NA NaN NA NA Inf -Inf
# Density of Month, with separate curves for rows whose DepTime is missing
# and rows whose DepTime is observed.
ggplot(data = hf_shadow, mapping = aes(x = Month, colour = DepTime_NA)) +
  geom_density()
# Histogram of ArrTime, filled by whether ActualElapsedTime is missing.
# bins = 30 is the value stat_bin() falls back to; stating it explicitly keeps
# the plot unchanged while silencing the "Pick better value" message.
hflights %>%
  bind_shadow() %>%
  ggplot(aes(x = ArrTime, fill = ActualElapsedTime_NA)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3066 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of ActualElapsedTime, filled by whether ArrTime is missing.
# bins = 30 is the value stat_bin() falls back to; stating it explicitly keeps
# the plot unchanged while silencing the "Pick better value" message.
hflights %>%
  bind_shadow() %>%
  ggplot(aes(x = ActualElapsedTime, fill = ArrTime_NA)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3622 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Combining variables of class <shade> and <factor> was deprecated in ggplot2
## 3.4.0.
## ℹ Please ensure your variables are compatible before plotting (location:
## `compute_panel()`)
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Pin a CRAN mirror so that package installation also works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install simputation only when it is not already available, instead of
# downloading and reinstalling it every time the script runs.
if (!requireNamespace("simputation", quietly = TRUE)) {
  install.packages("simputation")
}
##
## The downloaded binary packages are in
## /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
library(simputation)
##
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
library(dplyr)
# Impute ActualElapsedTime with a linear model on DepTime and ArrTime, then
# plot the result against DepTime.
hflights %>%
  impute_lm(ActualElapsedTime ~ DepTime + ArrTime) %>%
  ggplot(mapping = aes(x = DepTime, y = ActualElapsedTime)) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same imputation, run on the shadow-augmented data so that each point can be
# coloured by whether ActualElapsedTime was originally missing.
hf_shadow %>%
  as.data.frame() %>%
  impute_lm(ActualElapsedTime ~ DepTime + ArrTime) %>%
  ggplot(
    mapping = aes(
      x = DepTime,
      y = ActualElapsedTime,
      colour = ActualElapsedTime_NA
    )
  ) +
  geom_point()
## Warning: Removed 3066 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Distinct rows in the data set, then distinct values of DepTime.
hflights %>% n_distinct()
## [1] 227496
n_distinct(hflights[["DepTime"]])
## [1] 1208
# Missing cells in the whole data set, then in DepTime alone.
hflights %>% n_miss()
## [1] 25755
n_miss(hflights[["DepTime"]])
## [1] 2905
# Non-missing cells in the whole data set, then in DepTime alone.
hflights %>% n_complete()
## [1] 4751661
n_complete(hflights[["DepTime"]])
## [1] 224591
# Share of rows with at least one missing value, as a proportion and a percentage.
hflights %>% prop_miss_case()
## [1] 0.01592116
hflights %>% pct_miss_case()
# Per-row count and percentage of missing values.
hflights %>% miss_case_summary()
## # A tibble: 227,496 × 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 195 8 38.1
## 2 211 8 38.1
## 3 324 8 38.1
## 4 336 8 38.1
## 5 348 8 38.1
## 6 416 8 38.1
## 7 425 8 38.1
## 8 535 8 38.1
## 9 804 8 38.1
## 10 952 8 38.1
## # ℹ 227,486 more rows
# How many rows have 0, 3, 5, ... missing values.
hflights %>% miss_case_table()
## # A tibble: 5 × 3
## n_miss_in_case n_cases pct_cases
## <int> <int> <dbl>
## 1 0 223874 98.4
## 2 3 556 0.244
## 3 5 119 0.0523
## 4 6 42 0.0185
## 5 8 2905 1.28
# Share of variables that contain any missing value, as a proportion ...
hflights %>% prop_miss_var()
## [1] 0.3809524
# ... and as a percentage.
hflights %>% pct_miss_var()
## [1] 38.09524
# Per-variable count and percentage of missing values.
hflights %>% miss_var_summary()
## # A tibble: 21 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 ActualElapsedTime 3622 1.59
## 2 AirTime 3622 1.59
## 3 ArrDelay 3622 1.59
## 4 ArrTime 3066 1.35
## 5 TaxiIn 3066 1.35
## 6 TaxiOut 2947 1.30
## 7 DepTime 2905 1.28
## 8 DepDelay 2905 1.28
## 9 Year 0 0
## 10 Month 0 0
## # ℹ 11 more rows
# How many variables share each missing-value count.
hflights %>% miss_var_table()
## # A tibble: 5 × 3
## n_miss_in_var n_vars pct_vars
## <int> <int> <dbl>
## 1 0 13 61.9
## 2 2905 2 9.52
## 3 2947 1 4.76
## 4 3066 2 9.52
## 5 3622 3 14.3
# Per-variable missingness summary, written as a direct call instead of a pipe.
miss_var_summary(hflights)
## # A tibble: 21 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 ActualElapsedTime 3622 1.59
## 2 AirTime 3622 1.59
## 3 ArrDelay 3622 1.59
## 4 ArrTime 3066 1.35
## 5 TaxiIn 3066 1.35
## 6 TaxiOut 2947 1.30
## 7 DepTime 2905 1.28
## 8 DepDelay 2905 1.28
## 9 Year 0 0
## 10 Month 0 0
## # ℹ 11 more rows
# Per-month missingness of ActualElapsedTime: summarise every variable within
# each Month, then keep only the ActualElapsedTime rows. The result stays
# grouped by Month, as the printed output shows.
monthly_flights <- group_by(hflights, Month)
monthly_flights %>%
  miss_var_summary() %>%
  filter(variable == "ActualElapsedTime")
## # A tibble: 12 × 4
## # Groups: Month [12]
## Month variable n_miss pct_miss
## <int> <chr> <int> <num>
## 1 1 ActualElapsedTime 245 1.30
## 2 2 ActualElapsedTime 1153 6.73
## 3 3 ActualElapsedTime 203 1.04
## 4 4 ActualElapsedTime 327 1.76
## 5 5 ActualElapsedTime 342 1.78
## 6 6 ActualElapsedTime 240 1.22
## 7 7 ActualElapsedTime 236 1.15
## 8 8 ActualElapsedTime 249 1.23
## 9 9 ActualElapsedTime 151 0.836
## 10 10 ActualElapsedTime 148 0.792
## 11 11 ActualElapsedTime 88 0.488
## 12 12 ActualElapsedTime 240 1.26
# Append the per-row proportion of missing values (column prop_miss_all) and
# show the first six rows.
head(add_prop_miss(hflights))
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 5424 2011 1 1 6 1400 1500 AA 428
## 5425 2011 1 2 7 1401 1501 AA 428
## 5426 2011 1 3 1 1352 1502 AA 428
## 5427 2011 1 4 2 1403 1513 AA 428
## 5428 2011 1 5 3 1405 1507 AA 428
## 5429 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 5424 N576AA 60 40 -10 0 IAH DFW 224
## 5425 N557AA 60 45 -9 1 IAH DFW 224
## 5426 N541AA 70 48 -8 -8 IAH DFW 224
## 5427 N403AA 70 39 3 3 IAH DFW 224
## 5428 N492AA 62 44 -3 5 IAH DFW 224
## 5429 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted prop_miss_all
## 5424 7 13 0 0 0
## 5425 6 9 0 0 0
## 5426 5 17 0 0 0
## 5427 9 22 0 0 0
## 5428 9 9 0 0 0
## 5429 6 13 0 0 0
# Install rpart only when it is not already available, instead of downloading
# and reinstalling it every time the script runs.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
##
## The downloaded binary packages are in
## /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
# Install rpart.plot only when it is not already available, instead of
# downloading and reinstalling it every time the script runs.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
##
## The downloaded binary packages are in
## /var/folders/dz/5w4vtyc93lz766gz_bpfqt1m0000gn/T//RtmpiCg9E4/downloaded_packages
library(rpart)
library(rpart.plot)
library(dplyr)
library(naniar)
library(hflights)
# Fit a regression tree that predicts each row's proportion of missing values
# (prop_miss_all) from all other columns, then plot the tree.
# roundint = FALSE: the model is fitted on piped data (`data = .`), so prp()
# cannot retrieve the training data and would warn; this is the remedy named
# in the warning text itself.
hflights %>%
  add_prop_miss() %>%
  rpart(prop_miss_all ~ ., data = .) %>%
  prp(type = 4, extra = 101, prefix = "prop. miss = ", roundint = FALSE)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call prp with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.