if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999)

The Logistic Regression Model

library(ggplot2)
  library(gridExtra)

  p <- seq(0.005, 0.995, 0.01)
  df <- data.frame(
    p = p,
    odds = p / (1 - p),
    logit = log(p / (1 - p))
  )

  g1 <- ggplot(df, aes(x=p, y=odds)) +
    geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(0, 100)) +
    labs(x='Probability of success', y='Odds', title='(a)') +
    geom_hline(yintercept = 0) +
    theme_bw() +
    theme(axis.line = element_line(colour = "black"),
          axis.line.x = element_blank(),
          panel.border = element_blank())
  g2 <- ggplot(df, aes(x=p, y=logit)) +
    geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(-4, 4)) +
    labs(x='Probability of success', y='Logit', title='(b)' ) +
    geom_hline(yintercept = 0) +
    theme_bw() +
    theme(axis.line = element_line(colour = "black"),
          axis.line.x = element_blank(),
          panel.border = element_blank())

  grid.arrange(g1, g2, ncol=2)

  g1 <- ggplotGrob(g1)
  g2 <- ggplotGrob(g2)
  maxWidth = grid::unit.pmax(g1$widths[2:5], g2$widths[2:5])
  g1$widths[2:5] <- as.list(maxWidth)
  g2$widths[2:5] <- as.list(maxWidth)
  g <- arrangeGrob(g1, g2)
  ggsave(file=file.path(getwd(), "figures", "chapter_10", "odds_logit.pdf"), g,
         width=6, height=6)

Example: Acceptance of Personal Loan

Model with a Single Predictor

bank.df <- mlba::UniversalBank
g <- ggplot(bank.df, aes(x=Income, y=Personal.Loan)) +
  geom_jitter(width=0, height=0.01, alpha=0.1) +
  geom_function(fun=function(x){ return (1 / (1 + exp(6.04892 - 0.036*x)))}) +
  xlim(0, 250) +
  labs(x='Income (in $000s)') +
  theme_bw()
g

ggsave(file=file.path(getwd(), "figures", "chapter_10", "fitted_logistic.pdf"), g,
         width=5, height=3)

Estimating the Logistic Model from Data: Computing Parameter Estimates

Estimated Model

library(caret)
library(tidyverse)

# load and preprocess data
bank.df <- mlba::UniversalBank %>%
  select(-c(ID, ZIP.Code)) %>% # Drop ID and zip code columns.
  mutate(
    Education = factor(Education, levels=c(1:3),
                       labels=c("Undergrad", "Graduate", "Advanced/Professional")),
    Personal.Loan = factor(Personal.Loan, levels=c(0, 1),
                           labels=c("No", "Yes"))
  )

# partition data
set.seed(2)
idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
train.df <- bank.df[idx, ]
holdout.df <- bank.df[-idx, ]

# build model
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
logit.reg <- caret::train(Personal.Loan ~ ., data=train.df, trControl=trControl,
                      # fit logistic regression with a generalized linear model
                      method="glm", family="binomial")
logit.reg
#> Generalized Linear Model 
#> 
#> 3000 samples
#>   11 predictor
#>    2 classes: 'No', 'Yes' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold) 
#> Summary of sample sizes: 2399, 2401, 2400, 2401, 2399 
#> Resampling results:
#> 
#>   Accuracy   Kappa   
#>   0.9566677  0.718506
summary(logit.reg$finalModel)
#> 
#> Call:
#> NULL
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -3.0684  -0.1835  -0.0708  -0.0208   3.8578  
#> 
#> Coefficients:
#>                                    Estimate Std. Error z value
#> (Intercept)                      -9.8328891  2.4283189  -4.049
#> Age                              -0.1230491  0.0924749  -1.331
#> Experience                        0.1222957  0.0917230   1.333
#> Income                            0.0594009  0.0039386  15.082
#> Family                            0.6263822  0.1028635   6.089
#> CCAvg                             0.1324588  0.0568480   2.330
#> EducationGraduate                 3.9156275  0.3529431  11.094
#> `EducationAdvanced/Professional`  4.1670416  0.3502743  11.897
#> Mortgage                          0.0010050  0.0007649   1.314
#> Securities.Account               -0.7335570  0.3760782  -1.951
#> CD.Account                        4.2298260  0.4473287   9.456
#> Online                           -1.0063789  0.2205216  -4.564
#> CreditCard                       -1.1629874  0.2846606  -4.086
#>                                              Pr(>|z|)    
#> (Intercept)                             0.00005138032 ***
#> Age                                            0.1833    
#> Experience                                     0.1824    
#> Income                           < 0.0000000000000002 ***
#> Family                                  0.00000000113 ***
#> CCAvg                                          0.0198 *  
#> EducationGraduate                < 0.0000000000000002 ***
#> `EducationAdvanced/Professional` < 0.0000000000000002 ***
#> Mortgage                                       0.1889    
#> Securities.Account                             0.0511 .  
#> CD.Account                       < 0.0000000000000002 ***
#> Online                                  0.00000502768 ***
#> CreditCard                              0.00004397779 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 1897.22  on 2999  degrees of freedom
#> Residual deviance:  686.66  on 2987  degrees of freedom
#> AIC: 712.66
#> 
#> Number of Fisher Scoring iterations: 8

Evaluating Classification Performance

Interpreting Results in Terms of Odds (for a Profiling Goal)

# use predict() with type = "response" to compute predicted probabilities.
logit.reg.pred <- predict(logit.reg, holdout.df[, -8], type = "prob")

# display four different cases
interestingCases = c(1, 12, 32, 1333)
data.frame(
  actual = holdout.df$Personal.Loan[interestingCases],
  p0 = logit.reg.pred[interestingCases, 1],
  p1 = logit.reg.pred[interestingCases, 2],
  predicted = ifelse(logit.reg.pred[interestingCases, 2] > 0.5, 1, 0)
)
#>   actual        p0           p1 predicted
#> 1     No 0.9996265 0.0003734802         0
#> 2    Yes 0.2333702 0.7666297573         1
#> 3    Yes 0.2982080 0.7017919717         1
#> 4     No 0.3145616 0.6854383741         1
library(gains)
actual <- ifelse(holdout.df$Personal.Loan == "Yes", 1, 0)
gain <- gains(actual, logit.reg.pred[,2], groups=length(actual) - 2)

# plot gains chart
nactual <-sum(actual)
g1 <- ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
  labs(x="# Cases", y="Cumulative") 

# plot decile-wise lift chart
gain10 <- gains(actual, logit.reg.pred[,2], groups=10)
g2 <- ggplot(mapping=aes(x=gain10$depth, y=gain10$lift / 100)) +
  geom_col(fill="steelblue") +
  geom_text(aes(label=round(gain10$lift / 100, 1)), vjust=-0.2, size=3) +
  ylim(0, 8) + labs(x="Percentile", y="Lift")
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.45, 0.55))
ggsave(file=file.path(getwd(), "figures", "chapter_10", "gains-lift.pdf"), g,
       width=6, height=2.5)

Logistic Regression for Multi-Class Classification

Ordinal Classes

# simulate simple data
Y = rep(c("a", "b", "c"), 100)
x = rep(c(1, 2, 3), 100) +  rnorm(300, 0, 1)

# ordinal logistic regression
Y = factor(Y, ordered = T)
MASS::polr(Y ~ x)
#> Call:
#> MASS::polr(formula = Y ~ x)
#> 
#> Coefficients:
#>        x 
#> 1.190752 
#> 
#> Intercepts:
#>      a|b      b|c 
#> 1.510435 3.433951 
#> 
#> Residual Deviance: 534.2951 
#> AIC: 540.2951
# nominal logistic regression
Y = factor(Y, ordered = F)
nnet::multinom(Y ~ x)
#> # weights:  9 (4 variable)
#> initial  value 329.583687 
#> iter  10 value 267.566226
#> final  value 267.566156 
#> converged
#> Call:
#> nnet::multinom(formula = Y ~ x)
#> 
#> Coefficients:
#>   (Intercept)         x
#> b   -1.550826 0.9391046
#> c   -3.632134 1.7678581
#> 
#> Residual Deviance: 535.1323 
#> AIC: 543.1323

Example of Complete Analysis: Predicting Delayed Flights

# code for generating top-left bar chart
# for other plots replace the aggregating variable in the group_by and ggplot commands
# adjust the x-label
library(tidyverse)

delays.df <- mlba::FlightDelays
averageDelay <- delays.df %>%
  group_by(DAY_WEEK) %>%
  summarize(mean=mean(Flight.Status == 'delayed'))

ggplot(averageDelay, aes(x=DAY_WEEK, y=mean)) +
  geom_col(fill='steelblue') +
  labs(x='Day of Week', y='Average delay')

averageDelayChart <- function(data, groupBy, xlabel) {
  averageDelay <- data %>%
    mutate(groupBy = factor(groupBy)) %>%
    group_by_at(groupBy) %>%
    summarize(mean=mean(Flight.Status == 'delayed'))

  return (ggplot(averageDelay, aes_string(x=groupBy, y='mean')) +
    geom_col(fill='steelblue') +
    theme_bw() +
    labs(x=xlabel, y='Average delay'))
}
binned.df <- data.frame(
  Flight.Status = delays.df$Flight.Status,
  CRS_DEP_TIME = round(delays.df$CRS_DEP_TIME / 100)
)
# need to change Weather to factor
delays.df$Weather <- factor(delays.df$Weather)
g1 <- averageDelayChart(delays.df, 'DAY_WEEK', 'Day of week')
g2 <- averageDelayChart(delays.df, 'DEST', 'Destination')
g3 <- averageDelayChart(binned.df, 'CRS_DEP_TIME', 'Departure time (planned)')
g4 <- averageDelayChart(delays.df, 'CARRIER', 'Carrier')
g5 <- averageDelayChart(delays.df, 'ORIGIN', 'Origin')
g6 <- averageDelayChart(delays.df, 'Weather', 'Weather')

grid.arrange(g1, g2, g3, g4, g5, g6, ncol=2, nrow=3)

g <- arrangeGrob(g1, g2, g3, g4, g5, g6, ncol=2, nrow=3)
ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysBarCharts_new.pdf"),
       g, width=6, height=6, units="in")
# code for generating top-right bar chart
# for other plots, replace aggregating variable by setting argument by =  in
# aggregate().
# in function barplot(), set the x-label (argument xlab =) and y-label
# (argument names.arg =)
# according to the variable of choice.
delays.df = mlba::FlightDelays
barplot(aggregate(delays.df$Flight.Status == "delayed", by = list(delays.df$DAY_WEEK),
                  mean, rm.na = T)[,2], xlab = "Day of Week", ylab = "Average Delay",
                  names.arg = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))

# create matrix for plot
agg <- delays.df %>%
  mutate(isDelay=1 * (delays.df$Flight.Status == "delayed")) %>%
  group_by(DAY_WEEK, CARRIER, ORIGIN) %>%
  summarize(mean=mean(isDelay))

# plot with ggplot
# use facet_grid() with arguments scales = "free" and space = "free" to skip
# missing values.
ggplot(agg, aes(y=CARRIER, x=DAY_WEEK, fill=mean)) +
  geom_tile() +
  facet_grid(ORIGIN ~ ., scales="free", space="free") +
  scale_fill_gradient(low="white", high="steelblue")

ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysHeatmap.pdf"),
         last_plot() + theme_bw(), width=6, height=4, units="in")

Model-Fitting and Estimation

df <- mlba::FlightDelays %>%
  mutate(
    # transform variables and create bins
    DAY_WEEK = factor(DAY_WEEK, levels=c(1:7),
                      labels=c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
    CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)),
    Flight.Status = factor(Flight.Status, levels=c("ontime", "delayed")),
    # convert other variables to factors
    across(c(ORIGIN,DEST, CARRIER, DAY_WEEK), factor)
  ) %>%
  # select predictors and outcome variables
  select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Weather, Flight.Status)

# create training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(df$Flight.Status, p=0.6, list=FALSE)
train.df <- df[idx, ]
holdout.df <- df[-idx, ]

# run logistic model, and show coefficients and odds
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Flight.Status ~ ., data=train.df,
                      method="glm",  family="binomial", trControl=trControl)
round(data.frame(summary(model$finalModel)$coefficients,
                 odds = exp(coef(model$finalModel))), 5)
#>                Estimate Std..Error  z.value Pr...z..           odds
#> (Intercept)    -0.21384    0.69680 -0.30689  0.75892        0.80747
#> DAY_WEEKTue    -0.52429    0.27952 -1.87569  0.06070        0.59197
#> DAY_WEEKWed    -0.69392    0.27798 -2.49626  0.01255        0.49961
#> DAY_WEEKThu    -0.60191    0.27030 -2.22686  0.02596        0.54776
#> DAY_WEEKFri    -0.26015    0.25150 -1.03440  0.30095        0.77094
#> DAY_WEEKSat    -1.11520    0.33755 -3.30386  0.00095        0.32785
#> DAY_WEEKSun    -0.10445    0.28210 -0.37026  0.71119        0.90082
#> CRS_DEP_TIME7  -0.43838    0.54479 -0.80467  0.42101        0.64508
#> CRS_DEP_TIME8  -0.24910    0.48284 -0.51590  0.60593        0.77950
#> CRS_DEP_TIME9  -0.63715    0.58699 -1.08545  0.27772        0.52880
#> CRS_DEP_TIME10 -0.15908    0.51903 -0.30649  0.75923        0.85293
#> CRS_DEP_TIME11 -1.22648    0.81530 -1.50434  0.13249        0.29332
#> CRS_DEP_TIME12 -0.35617    0.49911 -0.71361  0.47547        0.70035
#> CRS_DEP_TIME13 -0.66094    0.53055 -1.24576  0.21285        0.51637
#> CRS_DEP_TIME14 -0.20777    0.51273 -0.40523  0.68531        0.81239
#> CRS_DEP_TIME15  0.82860    0.39011  2.12400  0.03367        2.29011
#> CRS_DEP_TIME16  0.25165    0.43368  0.58025  0.56174        1.28614
#> CRS_DEP_TIME17  0.65296    0.39735  1.64326  0.10033        1.92121
#> CRS_DEP_TIME18  0.12476    0.55253  0.22580  0.82136        1.13288
#> CRS_DEP_TIME19  1.13361    0.44727  2.53450  0.01126        3.10686
#> CRS_DEP_TIME20  1.17696    0.53655  2.19358  0.02827        3.24451
#> CRS_DEP_TIME21  0.72162    0.42745  1.68820  0.09137        2.05777
#> ORIGINDCA      -0.61459    0.43080 -1.42664  0.15368        0.54086
#> ORIGINIAD      -0.47502    0.42356 -1.12150  0.26208        0.62187
#> DESTJFK        -0.20290    0.34695 -0.58482  0.55867        0.81636
#> DESTLGA         0.14842    0.34988  0.42422  0.67141        1.16001
#> CARRIERDH      -0.35048    0.63475 -0.55216  0.58084        0.70435
#> CARRIERDL      -1.12127    0.53402 -2.09968  0.03576        0.32587
#> CARRIERMQ       0.01041    0.51025  0.02039  0.98373        1.01046
#> CARRIEROH      -1.69844    0.92132 -1.84349  0.06526        0.18297
#> CARRIERRU      -0.65571    0.45153 -1.45219  0.14645        0.51908
#> CARRIERUA      -0.48542    0.99757 -0.48660  0.62654        0.61544
#> CARRIERUS      -1.43210    0.54216 -2.64148  0.00825        0.23881
#> Weather        17.65528  520.28452  0.03393  0.97293 46514573.53318

Model Performance

library(gains)
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
pred <- predict(model, holdout.df, type="prob")[, 2]
gain <- gains(actual, pred, groups=length(pred))

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
  labs(x="# Cases", y="Cumulative")

confusionMatrix(predict(model, holdout.df), holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction ontime delayed
#>    ontime     704     155
#>    delayed      5      16
#>                                              
#>                Accuracy : 0.8182             
#>                  95% CI : (0.7911, 0.8431)   
#>     No Information Rate : 0.8057             
#>     P-Value [Acc > NIR] : 0.186              
#>                                              
#>                   Kappa : 0.1297             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.99295            
#>             Specificity : 0.09357            
#>          Pos Pred Value : 0.81956            
#>          Neg Pred Value : 0.76190            
#>              Prevalence : 0.80568            
#>          Detection Rate : 0.80000            
#>    Detection Prevalence : 0.97614            
#>       Balanced Accuracy : 0.54326            
#>                                              
#>        'Positive' Class : ontime             
#> 
ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRperformance_Flights.pdf"),
       last_plot() + theme_bw(),
       width=3, height=3)

Variable Selection

# fewer predictors
reduced.df <- mlba::FlightDelays %>%
  mutate(
    HOUR = round(CRS_DEP_TIME / 100),
    SUN_MON = DAY_WEEK %in% c(1, 7),
    CARRIER_CO_MQ = CARRIER %in% c("CO", "MQ"),
    CARRIER_DL_US = CARRIER %in% c("DL", "US"),
    CRS_DEP_TIME15 = HOUR %in% c(15),
    EVENING = HOUR %in% c(19, 20, 21),
    Flight.Status = factor(Flight.Status, levels=c("ontime", "delayed"))
  ) %>%
  select(Weather, SUN_MON, CARRIER_CO_MQ, CARRIER_DL_US, CRS_DEP_TIME15, EVENING, Flight.Status)
# create training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(reduced.df$Flight.Status, p=0.6, list=FALSE)
train.df <- reduced.df[idx, ]
holdout.df <- reduced.df[-idx, ]

# run logistic model, and show coefficients and odds
model <- caret::train(Flight.Status ~ ., data=train.df,
                      method="glm", family="binomial", trControl=trControl)
summary(model$finalModel)
#> 
#> Call:
#> NULL
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.2131  -0.6530  -0.5564  -0.3821   2.3036  
#> 
#> Coefficients:
#>                    Estimate Std. Error z value             Pr(>|z|)    
#> (Intercept)         -1.7872     0.1292 -13.831 < 0.0000000000000002 ***
#> Weather             17.8081   519.4979   0.034              0.97265    
#> SUN_MONTRUE          0.5186     0.1625   3.190              0.00142 ** 
#> CARRIER_CO_MQTRUE    0.3501     0.1837   1.906              0.05667 .  
#> CARRIER_DL_USTRUE   -0.7931     0.1857  -4.272          0.000019414 ***
#> CRS_DEP_TIME15TRUE   0.7798     0.1968   3.963          0.000073959 ***
#> EVENINGTRUE          1.0021     0.1923   5.210          0.000000189 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 1301.9  on 1320  degrees of freedom
#> Residual deviance: 1161.0  on 1314  degrees of freedom
#> AIC: 1175
#> 
#> Number of Fisher Scoring iterations: 15
# evaluate
pred <- predict(model, holdout.df)
confusionMatrix(pred, holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction ontime delayed
#>    ontime     706     155
#>    delayed      3      16
#>                                              
#>                Accuracy : 0.8205             
#>                  95% CI : (0.7935, 0.8453)   
#>     No Information Rate : 0.8057             
#>     P-Value [Acc > NIR] : 0.1431             
#>                                              
#>                   Kappa : 0.1348             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.99577            
#>             Specificity : 0.09357            
#>          Pos Pred Value : 0.81998            
#>          Neg Pred Value : 0.84211            
#>              Prevalence : 0.80568            
#>          Detection Rate : 0.80227            
#>    Detection Prevalence : 0.97841            
#>       Balanced Accuracy : 0.54467            
#>                                              
#>        'Positive' Class : ontime             
#> 
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
pred <- predict(model, holdout.df, type="prob")[, 2]
redGain <- gains(actual, pred, groups=length(pred))
# recreate training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(df$Flight.Status, p=0.6, list=FALSE)
train.df <- df[idx, ]
holdout.df <- df[-idx, ]

# run logistic model, and show coefficients and odds
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(-1, -3, by=-0.1), alpha=1)
model <- caret::train(Flight.Status ~ ., data=train.df,
                      method="glmnet",  family="binomial",
                      tuneGrid=tuneGrid, trControl=trControl)
coefs <- coef(model$finalModel, s=model$bestTune$lambda)
coefs[which(as.matrix(coefs)!=0),]
#>    (Intercept)    DAY_WEEKTue    DAY_WEEKWed    DAY_WEEKThu    DAY_WEEKSat 
#>    -1.47757041    -0.14636848    -0.32774992    -0.25943593    -0.74102779 
#>    DAY_WEEKSun  CRS_DEP_TIME7  CRS_DEP_TIME8  CRS_DEP_TIME9 CRS_DEP_TIME11 
#>     0.05774036    -0.06598006    -0.02352006    -0.39623280    -0.64284994 
#> CRS_DEP_TIME12 CRS_DEP_TIME13 CRS_DEP_TIME14 CRS_DEP_TIME15 CRS_DEP_TIME16 
#>    -0.10596502    -0.27250425    -0.01618465     0.74211925     0.10447674 
#> CRS_DEP_TIME17 CRS_DEP_TIME19 CRS_DEP_TIME20 CRS_DEP_TIME21      ORIGINDCA 
#>     0.68013894     1.08565534     0.88910176     0.61380555    -0.02786617 
#>        DESTJFK      CARRIERDH      CARRIERDL      CARRIERMQ      CARRIEROH 
#>    -0.04497810     0.09240657    -0.48586126     0.34601726    -0.46270331 
#>      CARRIERUS        Weather 
#>    -0.74293183     4.34312680
confusionMatrix(predict(model, holdout.df), holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction ontime delayed
#>    ontime     709     159
#>    delayed      0      12
#>                                              
#>                Accuracy : 0.8193             
#>                  95% CI : (0.7923, 0.8442)   
#>     No Information Rate : 0.8057             
#>     P-Value [Acc > NIR] : 0.1637             
#>                                              
#>                   Kappa : 0.1084             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 1.00000            
#>             Specificity : 0.07018            
#>          Pos Pred Value : 0.81682            
#>          Neg Pred Value : 1.00000            
#>              Prevalence : 0.80568            
#>          Detection Rate : 0.80568            
#>    Detection Prevalence : 0.98636            
#>       Balanced Accuracy : 0.53509            
#>                                              
#>        'Positive' Class : ontime             
#> 
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
pred <- predict(model, holdout.df, type="prob")[, 2]
lassoGain <- gains(actual, pred, groups=length(pred))

nactual <-sum(actual)
ggplot() +
  geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color='grey') +
  geom_line(aes(x=redGain$cume.obs, y=redGain$cume.pct.of.total * nactual)) +
  geom_line(aes(x=lassoGain$cume.obs, y=lassoGain$cume.pct.of.total * nactual), color='red') +
  geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
  labs(x="# Cases", y="Cumulative")

ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysCoefsReduced.pdf"),
       last_plot() + theme_bw(), width=3, height=3)