install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("rsample")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("vip")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("modeldata")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(dplyr) # for data wrangling
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for awesome plotting
library(rsample) # for data splitting
# Model interpretability packages
library(vip) # variable importance
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Job attrition data ----
library(modeldata)
data("attrition", package = "modeldata")
# access data
dim(attrition)
## [1] 1470 31
# Convert ordered factor columns into factor columns only!
df <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)
# Create training (70%) and test (30%) sets for the
# rsample::attrition data.
set.seed(123) # for reproducibility
churn_split <- initial_split(df, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test <- testing(churn_split)
# 5.3 Simple logistic regression ----
model1 <- glm(Attrition ~ MonthlyIncome,
family = "binomial", data = churn_train)
model2 <- glm(Attrition ~ OverTime, family = "binomial",
data = churn_train)
#
par(mfrow = c(2, 2))
plot(model1)

par(mfrow = c(1,1))
plot(churn_train$MonthlyIncome, model1$fitted.values)

ggplot(aes(x = MonthlyIncome, y = ifelse(Attrition == "Yes", 1, 0)), data = churn_train) +
geom_point() +
geom_smooth(method = loess, formula = model1$fitted.values ~ x, color = "gray") +
ylab("Probability of Attrition") +
ggtitle("Predicted Probabilites of Model 1")

# ggplot(aes(x = OverTime, y = ifelse(Attrition == "Yes", 1, 0)), data = churn_train) +
# geom_point() +
# geom_smooth(method = loess, formula = model1$fitted.values ~ x, color = "gray") +
# ylab("Probability of Attrition") +
# ggtitle("Predicted Probabilites of Model 1")
#
exp(coef(model1))
## (Intercept) MonthlyIncome
## 0.4122647 0.9998614
exp(coef(model2))
## (Intercept) OverTimeYes
## 0.1189759 3.6323389
confint(model1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.1932606571 -5.761048e-01
## MonthlyIncome -0.0001948723 -8.803311e-05
confint(model2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.3695727 -1.902409
## OverTimeYes 0.9463761 1.635373
# codings from the textbook
churn_train2 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train2 <- broom::augment(model2, churn_train2) %>%
mutate(.fitted = exp(.fitted))
p1 <- ggplot(churn_train2, aes(MonthlyIncome, prob)) +
geom_point(alpha = 0.15) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
ggtitle("Predicted probabilities for model1") +
xlab("Monthly Income") +
ylab("Probability of Attrition")
p2 <- ggplot(churn_train2, aes(OverTime, .fitted, color = OverTime)) +
geom_boxplot(show.legend = FALSE) +
geom_rug(sides = "b", position = "jitter", alpha = 0.2, show.legend = FALSE) +
ggtitle("Predicted probabilities for model2") +
xlab("Over Time") +
scale_y_continuous("Probability of Attrition", limits = c(0, 1))
gridExtra::grid.arrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'

# 5.4 Multiple logistic regression ----
model3 <- glm(
Attrition ~ MonthlyIncome + OverTime,
family = "binomial",
data = churn_train
)
tidy(model3)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.33 0.177 -7.54 4.74e-14
## 2 MonthlyIncome -0.000147 0.0000280 -5.27 1.38e- 7
## 3 OverTimeYes 1.35 0.180 7.50 6.59e-14
# Fig 5.3
churn_train3 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train3 <- broom::augment(model3, churn_train3) %>% mutate(.fitted = exp(.fitted))
ggplot(churn_train3, aes(MonthlyIncome, prob, color = OverTime)) +
geom_point(alpha = .15) +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
ggtitle("Predicted probabilities for model3") +
xlab("Monthly Income") +
ylab("Probability of Attrition")
## `geom_smooth()` using formula = 'y ~ x'

churn_train3$prob
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [630] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [667] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [704] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [741] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [778] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [815] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [852] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [889] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [926] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [963] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1000] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
library(ISLR2)
data(Default)
fit1 <- glm(default ~ balance + income, data = Default, family = "binomial")
summary(fit1)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
glm(formula = default ~ balance + income, family = "binomial",
data = Default)
##
## Call: glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Coefficients:
## (Intercept) balance income
## -1.154e+01 5.647e-03 2.081e-05
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1579 AIC: 1585