if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999, digits = 3)

Solution: Naive Bayes

Example 3: Predicting Delayed Flights

library(tidyverse)
library(caret)
library(e1071)
# load and preprocess dataset
delays.df <- mlba::FlightDelays %>%
  mutate(
    # change numerical variables to categorical
    DAY_WEEK = factor(DAY_WEEK),
    ORIGIN = factor(ORIGIN),
    DEST = factor(DEST),
    CARRIER = factor(CARRIER),
    Flight.Status = factor(Flight.Status),
    # create hourly bins for departure time
    CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100))
  ) %>%
  select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Flight.Status)

# create training and holdout sets
set.seed(1)
idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE)
train.df <- delays.df[idx, ]
holdout.df <- delays.df[-idx, ]

# run naive bayes
delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df)
delays.nb
#> 
#> Naive Bayes Classifier for Discrete Predictors
#> 
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#> 
#> A-priori probabilities:
#> Y
#> delayed  ontime 
#>   0.195   0.805 
#> 
#> Conditional probabilities:
#>          DAY_WEEK
#> Y              1      2      3      4      5      6      7
#>   delayed 0.2101 0.1440 0.1284 0.1128 0.1790 0.0506 0.1751
#>   ontime  0.1241 0.1429 0.1532 0.1682 0.1805 0.1288 0.1024
#> 
#>          CRS_DEP_TIME
#> Y              6      7      8      9     10     11     12     13     14     15
#>   delayed 0.0350 0.0545 0.0545 0.0156 0.0272 0.0117 0.0623 0.0311 0.0350 0.2179
#>   ontime  0.0573 0.0526 0.0705 0.0564 0.0536 0.0348 0.0677 0.0742 0.0677 0.1316
#>          CRS_DEP_TIME
#> Y             16     17     18     19     20     21
#>   delayed 0.0856 0.1751 0.0311 0.0661 0.0195 0.0778
#>   ontime  0.0846 0.0996 0.0301 0.0432 0.0197 0.0564
#> 
#>          ORIGIN
#> Y           BWI   DCA   IAD
#>   delayed 0.105 0.486 0.409
#>   ontime  0.063 0.641 0.296
#> 
#>          DEST
#> Y           EWR   JFK   LGA
#>   delayed 0.374 0.210 0.416
#>   ontime  0.287 0.180 0.534
#> 
#>          CARRIER
#> Y              CO      DH      DL      MQ      OH      RU      UA      US
#>   delayed 0.05447 0.35019 0.10506 0.17899 0.00778 0.22568 0.01167 0.06615
#>   ontime  0.03853 0.23214 0.18327 0.12782 0.01786 0.18233 0.01692 0.20113
# use prop.table() with margin = 1 to convert a count table to a proportions table,
# where each row sums up to 1 (use margin = 2 for column sums)
prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1)
#>          
#>             EWR   JFK   LGA
#>   delayed 0.374 0.210 0.416
#>   ontime  0.287 0.180 0.534
## predict probabilities
pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw")
## predict class membership
pred.class <- predict(delays.nb, newdata=holdout.df)

df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob)

df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 &
     holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",]
#>    actual predicted delayed ontime
#> 71 ontime    ontime  0.0665  0.934
# training
confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction delayed ontime
#>    delayed      46     47
#>    ontime      211   1017
#>                                              
#>                Accuracy : 0.805              
#>                  95% CI : (0.782, 0.826)     
#>     No Information Rate : 0.805              
#>     P-Value [Acc > NIR] : 0.544              
#>                                              
#>                   Kappa : 0.178              
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.1790             
#>             Specificity : 0.9558             
#>          Pos Pred Value : 0.4946             
#>          Neg Pred Value : 0.8282             
#>              Prevalence : 0.1945             
#>          Detection Rate : 0.0348             
#>    Detection Prevalence : 0.0704             
#>       Balanced Accuracy : 0.5674             
#>                                              
#>        'Positive' Class : delayed            
#> 
# holdout
confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction delayed ontime
#>    delayed      21     35
#>    ontime      150    674
#>                                              
#>                Accuracy : 0.79               
#>                  95% CI : (0.761, 0.816)     
#>     No Information Rate : 0.806              
#>     P-Value [Acc > NIR] : 0.891              
#>                                              
#>                   Kappa : 0.099              
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.1228             
#>             Specificity : 0.9506             
#>          Pos Pred Value : 0.3750             
#>          Neg Pred Value : 0.8180             
#>              Prevalence : 0.1943             
#>          Detection Rate : 0.0239             
#>    Detection Prevalence : 0.0636             
#>       Balanced Accuracy : 0.5367             
#>                                              
#>        'Positive' Class : delayed            
#> 
library(gains)
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2)

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
  labs(x="# Cases", y="Cumulative")

ggsave(file=file.path(getwd(), "figures", "chapter_08", "Flights-NB-gain.pdf"),
       width=3, height=3,
       last_plot() + theme_bw())

Working with Continuous Predictors

(p_delayed = dnorm(213, mean=211.36215, sd=15.31))
#> [1] 0.0259
  (p_ontime = dnorm(213, mean=211.99436, sd=12.79))
#> [1] 0.0311
  p_ontime * 0.805
#> [1] 0.025
  p_delayed * 0.195
#> [1] 0.00505