The Logistic Regression Model
library(ggplot2)
library(gridExtra)
p <- seq(0.005, 0.995, 0.01)
df <- data.frame(
p = p,
odds = p / (1 - p),
logit = log(p / (1 - p))
)
g1 <- ggplot(df, aes(x=p, y=odds)) +
geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(0, 100)) +
labs(x='Probability of success', y='Odds', title='(a)') +
geom_hline(yintercept = 0) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
axis.line.x = element_blank(),
panel.border = element_blank())
g2 <- ggplot(df, aes(x=p, y=logit)) +
geom_line() + coord_cartesian(xlim=c(0, 1), ylim=c(-4, 4)) +
labs(x='Probability of success', y='Logit', title='(b)' ) +
geom_hline(yintercept = 0) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
axis.line.x = element_blank(),
panel.border = element_blank())
grid.arrange(g1, g2, ncol=2)

g1 <- ggplotGrob(g1)
g2 <- ggplotGrob(g2)
maxWidth = grid::unit.pmax(g1$widths[2:5], g2$widths[2:5])
g1$widths[2:5] <- as.list(maxWidth)
g2$widths[2:5] <- as.list(maxWidth)
g <- arrangeGrob(g1, g2)
ggsave(file=file.path(getwd(), "figures", "chapter_10", "odds_logit.pdf"), g,
width=6, height=6)
Example of Complete Analysis: Predicting Delayed Flights
# code for generating top-left bar chart
# for other plots replace the aggregating variable in the group_by and ggplot commands
# adjust the x-label
library(tidyverse)
delays.df <- mlba::FlightDelays
averageDelay <- delays.df %>%
group_by(DAY_WEEK) %>%
summarize(mean=mean(Flight.Status == 'delayed'))
ggplot(averageDelay, aes(x=DAY_WEEK, y=mean)) +
geom_col(fill='steelblue') +
labs(x='Day of Week', y='Average delay')

averageDelayChart <- function(data, groupBy, xlabel) {
averageDelay <- data %>%
mutate(groupBy = factor(groupBy)) %>%
group_by_at(groupBy) %>%
summarize(mean=mean(Flight.Status == 'delayed'))
return (ggplot(averageDelay, aes_string(x=groupBy, y='mean')) +
geom_col(fill='steelblue') +
theme_bw() +
labs(x=xlabel, y='Average delay'))
}
binned.df <- data.frame(
Flight.Status = delays.df$Flight.Status,
CRS_DEP_TIME = round(delays.df$CRS_DEP_TIME / 100)
)
# need to change Weather to factor
delays.df$Weather <- factor(delays.df$Weather)
g1 <- averageDelayChart(delays.df, 'DAY_WEEK', 'Day of week')
g2 <- averageDelayChart(delays.df, 'DEST', 'Destination')
g3 <- averageDelayChart(binned.df, 'CRS_DEP_TIME', 'Departure time (planned)')
g4 <- averageDelayChart(delays.df, 'CARRIER', 'Carrier')
g5 <- averageDelayChart(delays.df, 'ORIGIN', 'Origin')
g6 <- averageDelayChart(delays.df, 'Weather', 'Weather')
grid.arrange(g1, g2, g3, g4, g5, g6, ncol=2, nrow=3)

g <- arrangeGrob(g1, g2, g3, g4, g5, g6, ncol=2, nrow=3)
ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysBarCharts_new.pdf"),
g, width=6, height=6, units="in")
# code for generating top-right bar chart
# for other plots, replace aggregating variable by setting argument by = in
# aggregate().
# in function barplot(), set the x-label (argument xlab =) and y-label
# (argument names.arg =)
# according to the variable of choice.
delays.df = mlba::FlightDelays
barplot(aggregate(delays.df$Flight.Status == "delayed", by = list(delays.df$DAY_WEEK),
mean, rm.na = T)[,2], xlab = "Day of Week", ylab = "Average Delay",
names.arg = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))

# create matrix for plot
agg <- delays.df %>%
mutate(isDelay=1 * (delays.df$Flight.Status == "delayed")) %>%
group_by(DAY_WEEK, CARRIER, ORIGIN) %>%
summarize(mean=mean(isDelay))
# plot with ggplot
# use facet_grid() with arguments scales = "free" and space = "free" to skip
# missing values.
ggplot(agg, aes(y=CARRIER, x=DAY_WEEK, fill=mean)) +
geom_tile() +
facet_grid(ORIGIN ~ ., scales="free", space="free") +
scale_fill_gradient(low="white", high="steelblue")

ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysHeatmap.pdf"),
last_plot() + theme_bw(), width=6, height=4, units="in")
Model-Fitting and Estimation
df <- mlba::FlightDelays %>%
mutate(
# transform variables and create bins
DAY_WEEK = factor(DAY_WEEK, levels=c(1:7),
labels=c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)),
Flight.Status = factor(Flight.Status, levels=c("ontime", "delayed")),
# convert other variables to factors
across(c(ORIGIN,DEST, CARRIER, DAY_WEEK), factor)
) %>%
# select predictors and outcome variables
select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Weather, Flight.Status)
# create training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(df$Flight.Status, p=0.6, list=FALSE)
train.df <- df[idx, ]
holdout.df <- df[-idx, ]
# run logistic model, and show coefficients and odds
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
model <- caret::train(Flight.Status ~ ., data=train.df,
method="glm", family="binomial", trControl=trControl)
round(data.frame(summary(model$finalModel)$coefficients,
odds = exp(coef(model$finalModel))), 5)
#> Estimate Std..Error z.value Pr...z.. odds
#> (Intercept) -0.21384 0.69680 -0.30689 0.75892 0.80747
#> DAY_WEEKTue -0.52429 0.27952 -1.87569 0.06070 0.59197
#> DAY_WEEKWed -0.69392 0.27798 -2.49626 0.01255 0.49961
#> DAY_WEEKThu -0.60191 0.27030 -2.22686 0.02596 0.54776
#> DAY_WEEKFri -0.26015 0.25150 -1.03440 0.30095 0.77094
#> DAY_WEEKSat -1.11520 0.33755 -3.30386 0.00095 0.32785
#> DAY_WEEKSun -0.10445 0.28210 -0.37026 0.71119 0.90082
#> CRS_DEP_TIME7 -0.43838 0.54479 -0.80467 0.42101 0.64508
#> CRS_DEP_TIME8 -0.24910 0.48284 -0.51590 0.60593 0.77950
#> CRS_DEP_TIME9 -0.63715 0.58699 -1.08545 0.27772 0.52880
#> CRS_DEP_TIME10 -0.15908 0.51903 -0.30649 0.75923 0.85293
#> CRS_DEP_TIME11 -1.22648 0.81530 -1.50434 0.13249 0.29332
#> CRS_DEP_TIME12 -0.35617 0.49911 -0.71361 0.47547 0.70035
#> CRS_DEP_TIME13 -0.66094 0.53055 -1.24576 0.21285 0.51637
#> CRS_DEP_TIME14 -0.20777 0.51273 -0.40523 0.68531 0.81239
#> CRS_DEP_TIME15 0.82860 0.39011 2.12400 0.03367 2.29011
#> CRS_DEP_TIME16 0.25165 0.43368 0.58025 0.56174 1.28614
#> CRS_DEP_TIME17 0.65296 0.39735 1.64326 0.10033 1.92121
#> CRS_DEP_TIME18 0.12476 0.55253 0.22580 0.82136 1.13288
#> CRS_DEP_TIME19 1.13361 0.44727 2.53450 0.01126 3.10686
#> CRS_DEP_TIME20 1.17696 0.53655 2.19358 0.02827 3.24451
#> CRS_DEP_TIME21 0.72162 0.42745 1.68820 0.09137 2.05777
#> ORIGINDCA -0.61459 0.43080 -1.42664 0.15368 0.54086
#> ORIGINIAD -0.47502 0.42356 -1.12150 0.26208 0.62187
#> DESTJFK -0.20290 0.34695 -0.58482 0.55867 0.81636
#> DESTLGA 0.14842 0.34988 0.42422 0.67141 1.16001
#> CARRIERDH -0.35048 0.63475 -0.55216 0.58084 0.70435
#> CARRIERDL -1.12127 0.53402 -2.09968 0.03576 0.32587
#> CARRIERMQ 0.01041 0.51025 0.02039 0.98373 1.01046
#> CARRIEROH -1.69844 0.92132 -1.84349 0.06526 0.18297
#> CARRIERRU -0.65571 0.45153 -1.45219 0.14645 0.51908
#> CARRIERUA -0.48542 0.99757 -0.48660 0.62654 0.61544
#> CARRIERUS -1.43210 0.54216 -2.64148 0.00825 0.23881
#> Weather 17.65528 520.28452 0.03393 0.97293 46514573.53318
Variable Selection
# fewer predictors
reduced.df <- mlba::FlightDelays %>%
mutate(
HOUR = round(CRS_DEP_TIME / 100),
SUN_MON = DAY_WEEK %in% c(1, 7),
CARRIER_CO_MQ = CARRIER %in% c("CO", "MQ"),
CARRIER_DL_US = CARRIER %in% c("DL", "US"),
CRS_DEP_TIME15 = HOUR %in% c(15),
EVENING = HOUR %in% c(19, 20, 21),
Flight.Status = factor(Flight.Status, levels=c("ontime", "delayed"))
) %>%
select(Weather, SUN_MON, CARRIER_CO_MQ, CARRIER_DL_US, CRS_DEP_TIME15, EVENING, Flight.Status)
# create training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(reduced.df$Flight.Status, p=0.6, list=FALSE)
train.df <- reduced.df[idx, ]
holdout.df <- reduced.df[-idx, ]
# run logistic model, and show coefficients and odds
model <- caret::train(Flight.Status ~ ., data=train.df,
method="glm", family="binomial", trControl=trControl)
summary(model$finalModel)
#>
#> Call:
#> NULL
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.2131 -0.6530 -0.5564 -0.3821 2.3036
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.7872 0.1292 -13.831 < 0.0000000000000002 ***
#> Weather 17.8081 519.4979 0.034 0.97265
#> SUN_MONTRUE 0.5186 0.1625 3.190 0.00142 **
#> CARRIER_CO_MQTRUE 0.3501 0.1837 1.906 0.05667 .
#> CARRIER_DL_USTRUE -0.7931 0.1857 -4.272 0.000019414 ***
#> CRS_DEP_TIME15TRUE 0.7798 0.1968 3.963 0.000073959 ***
#> EVENINGTRUE 1.0021 0.1923 5.210 0.000000189 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 1301.9 on 1320 degrees of freedom
#> Residual deviance: 1161.0 on 1314 degrees of freedom
#> AIC: 1175
#>
#> Number of Fisher Scoring iterations: 15
# evaluate
pred <- predict(model, holdout.df)
confusionMatrix(pred, holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction ontime delayed
#> ontime 706 155
#> delayed 3 16
#>
#> Accuracy : 0.8205
#> 95% CI : (0.7935, 0.8453)
#> No Information Rate : 0.8057
#> P-Value [Acc > NIR] : 0.1431
#>
#> Kappa : 0.1348
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.99577
#> Specificity : 0.09357
#> Pos Pred Value : 0.81998
#> Neg Pred Value : 0.84211
#> Prevalence : 0.80568
#> Detection Rate : 0.80227
#> Detection Prevalence : 0.97841
#> Balanced Accuracy : 0.54467
#>
#> 'Positive' Class : ontime
#>
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
pred <- predict(model, holdout.df, type="prob")[, 2]
redGain <- gains(actual, pred, groups=length(pred))
# recreate training and holdout sets
set.seed(1)
idx <- caret::createDataPartition(df$Flight.Status, p=0.6, list=FALSE)
train.df <- df[idx, ]
holdout.df <- df[-idx, ]
# run logistic model, and show coefficients and odds
trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
tuneGrid <- expand.grid(lambda=10^seq(-1, -3, by=-0.1), alpha=1)
model <- caret::train(Flight.Status ~ ., data=train.df,
method="glmnet", family="binomial",
tuneGrid=tuneGrid, trControl=trControl)
coefs <- coef(model$finalModel, s=model$bestTune$lambda)
coefs[which(as.matrix(coefs)!=0),]
#> (Intercept) DAY_WEEKTue DAY_WEEKWed DAY_WEEKThu DAY_WEEKSat
#> -1.47757041 -0.14636848 -0.32774992 -0.25943593 -0.74102779
#> DAY_WEEKSun CRS_DEP_TIME7 CRS_DEP_TIME8 CRS_DEP_TIME9 CRS_DEP_TIME11
#> 0.05774036 -0.06598006 -0.02352006 -0.39623280 -0.64284994
#> CRS_DEP_TIME12 CRS_DEP_TIME13 CRS_DEP_TIME14 CRS_DEP_TIME15 CRS_DEP_TIME16
#> -0.10596502 -0.27250425 -0.01618465 0.74211925 0.10447674
#> CRS_DEP_TIME17 CRS_DEP_TIME19 CRS_DEP_TIME20 CRS_DEP_TIME21 ORIGINDCA
#> 0.68013894 1.08565534 0.88910176 0.61380555 -0.02786617
#> DESTJFK CARRIERDH CARRIERDL CARRIERMQ CARRIEROH
#> -0.04497810 0.09240657 -0.48586126 0.34601726 -0.46270331
#> CARRIERUS Weather
#> -0.74293183 4.34312680
confusionMatrix(predict(model, holdout.df), holdout.df$Flight.Status)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction ontime delayed
#> ontime 709 159
#> delayed 0 12
#>
#> Accuracy : 0.8193
#> 95% CI : (0.7923, 0.8442)
#> No Information Rate : 0.8057
#> P-Value [Acc > NIR] : 0.1637
#>
#> Kappa : 0.1084
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 1.00000
#> Specificity : 0.07018
#> Pos Pred Value : 0.81682
#> Neg Pred Value : 1.00000
#> Prevalence : 0.80568
#> Detection Rate : 0.80568
#> Detection Prevalence : 0.98636
#> Balanced Accuracy : 0.53509
#>
#> 'Positive' Class : ontime
#>
actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0)
pred <- predict(model, holdout.df, type="prob")[, 2]
lassoGain <- gains(actual, pred, groups=length(pred))
nactual <-sum(actual)
ggplot() +
geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual), color='grey') +
geom_line(aes(x=redGain$cume.obs, y=redGain$cume.pct.of.total * nactual)) +
geom_line(aes(x=lassoGain$cume.obs, y=lassoGain$cume.pct.of.total * nactual), color='red') +
geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") +
labs(x="# Cases", y="Cumulative")

ggsave(file=file.path(getwd(), "figures", "chapter_10", "LRFlightDelaysCoefsReduced.pdf"),
last_plot() + theme_bw(), width=3, height=3)