Example 3: Predicting Delayed Flights
library(tidyverse)
library(caret)
library(e1071)
# load and preprocess dataset
delays.df <- mlba::FlightDelays %>%
mutate(
# change numerical variables to categorical
DAY_WEEK = factor(DAY_WEEK),
ORIGIN = factor(ORIGIN),
DEST = factor(DEST),
CARRIER = factor(CARRIER),
Flight.Status = factor(Flight.Status),
# create hourly bins for departure time
CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100))
) %>%
select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Flight.Status)
# create training and holdout sets
set.seed(1)
idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE)
train.df <- delays.df[idx, ]
holdout.df <- delays.df[-idx, ]
# run naive bayes
delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df)
delays.nb
#>
#> Naive Bayes Classifier for Discrete Predictors
#>
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#>
#> A-priori probabilities:
#> Y
#> delayed ontime
#> 0.195 0.805
#>
#> Conditional probabilities:
#> DAY_WEEK
#> Y 1 2 3 4 5 6 7
#> delayed 0.2101 0.1440 0.1284 0.1128 0.1790 0.0506 0.1751
#> ontime 0.1241 0.1429 0.1532 0.1682 0.1805 0.1288 0.1024
#>
#> CRS_DEP_TIME
#> Y 6 7 8 9 10 11 12 13 14 15
#> delayed 0.0350 0.0545 0.0545 0.0156 0.0272 0.0117 0.0623 0.0311 0.0350 0.2179
#> ontime 0.0573 0.0526 0.0705 0.0564 0.0536 0.0348 0.0677 0.0742 0.0677 0.1316
#> CRS_DEP_TIME
#> Y 16 17 18 19 20 21
#> delayed 0.0856 0.1751 0.0311 0.0661 0.0195 0.0778
#> ontime 0.0846 0.0996 0.0301 0.0432 0.0197 0.0564
#>
#> ORIGIN
#> Y BWI DCA IAD
#> delayed 0.105 0.486 0.409
#> ontime 0.063 0.641 0.296
#>
#> DEST
#> Y EWR JFK LGA
#> delayed 0.374 0.210 0.416
#> ontime 0.287 0.180 0.534
#>
#> CARRIER
#> Y CO DH DL MQ OH RU UA US
#> delayed 0.05447 0.35019 0.10506 0.17899 0.00778 0.22568 0.01167 0.06615
#> ontime 0.03853 0.23214 0.18327 0.12782 0.01786 0.18233 0.01692 0.20113
# use prop.table() with margin = 1 to convert a count table to a proportions table,
# where each row sums up to 1 (use margin = 2 for column sums)
prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1)
#>
#> EWR JFK LGA
#> delayed 0.374 0.210 0.416
#> ontime 0.287 0.180 0.534
## predict probabilities
pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw")
## predict class membership
pred.class <- predict(delays.nb, newdata=holdout.df)
df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob)
df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 &
holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",]
#> actual predicted delayed ontime
#> 71 ontime ontime 0.0665 0.934
# training
confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction delayed ontime
#> delayed 46 47
#> ontime 211 1017
#>
#> Accuracy : 0.805
#> 95% CI : (0.782, 0.826)
#> No Information Rate : 0.805
#> P-Value [Acc > NIR] : 0.544
#>
#> Kappa : 0.178
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.1790
#> Specificity : 0.9558
#> Pos Pred Value : 0.4946
#> Neg Pred Value : 0.8282
#> Prevalence : 0.1945
#> Detection Rate : 0.0348
#> Detection Prevalence : 0.0704
#> Balanced Accuracy : 0.5674
#>
#> 'Positive' Class : delayed
#>
# holdout
confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction delayed ontime
#> delayed 21 35
#> ontime 150 674
#>
#> Accuracy : 0.79
#> 95% CI : (0.761, 0.816)
#> No Information Rate : 0.806
#> P-Value [Acc > NIR] : 0.891
#>
#> Kappa : 0.099
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.1228
#> Specificity : 0.9506
#> Pos Pred Value : 0.3750
#> Neg Pred Value : 0.8180
#> Prevalence : 0.1943
#> Detection Rate : 0.0239
#> Detection Prevalence : 0.0636
#> Balanced Accuracy : 0.5367
#>
#> 'Positive' Class : delayed
#>
library(gains)
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2)
nactual <-sum(actual)
ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
labs(x="# Cases", y="Cumulative")

ggsave(file=file.path(getwd(), "figures", "chapter_08", "Flights-NB-gain.pdf"),
width=3, height=3,
last_plot() + theme_bw())