install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("rsample")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("vip")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("modeldata")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(dplyr)  # for data wrangling
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) # for awesome plotting
library(rsample) # for data splitting
# Model interpretability packages
library(vip)       # variable importance
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Job attrition data ----
library(modeldata)
data("attrition", package = "modeldata")
# access data
dim(attrition)
## [1] 1470   31
# Convert ordered factor columns into factor columns only!
df <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)

# Create training (70%) and test (30%) sets for the 
# rsample::attrition data.
set.seed(123)  # for reproducibility
churn_split <- initial_split(df, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test  <- testing(churn_split) 



# 5.3 Simple logistic regression ----
model1 <- glm(Attrition ~ MonthlyIncome, 
              family = "binomial", data = churn_train)

model2 <- glm(Attrition ~ OverTime, family = "binomial", 
              data = churn_train)
#
par(mfrow = c(2, 2))
plot(model1)

par(mfrow = c(1,1))

plot(churn_train$MonthlyIncome, model1$fitted.values)

ggplot(aes(x = MonthlyIncome, y = ifelse(Attrition == "Yes", 1, 0)), data = churn_train) +
  geom_point() +
  geom_smooth(method = loess, formula = model1$fitted.values ~ x, color = "gray") +
  ylab("Probability of Attrition") + 
  ggtitle("Predicted Probabilites of Model 1")

# ggplot(aes(x = OverTime, y = ifelse(Attrition == "Yes", 1, 0)), data = churn_train) +
#           geom_point() +
#           geom_smooth(method = loess, formula = model1$fitted.values ~ x, color = "gray") +
#           ylab("Probability of Attrition") + 
#           ggtitle("Predicted Probabilites of Model 1")

#
exp(coef(model1))
##   (Intercept) MonthlyIncome 
##     0.4122647     0.9998614
exp(coef(model2))
## (Intercept) OverTimeYes 
##   0.1189759   3.6323389
confint(model1)
## Waiting for profiling to be done...
##                       2.5 %        97.5 %
## (Intercept)   -1.1932606571 -5.761048e-01
## MonthlyIncome -0.0001948723 -8.803311e-05
confint(model2)
## Waiting for profiling to be done...
##                  2.5 %    97.5 %
## (Intercept) -2.3695727 -1.902409
## OverTimeYes  0.9463761  1.635373
# codings from the textbook
churn_train2 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train2 <- broom::augment(model2, churn_train2) %>% 
  mutate(.fitted = exp(.fitted))

p1 <- ggplot(churn_train2, aes(MonthlyIncome, prob)) +
  geom_point(alpha = 0.15) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  ggtitle("Predicted probabilities for model1") +
  xlab("Monthly Income") +
  ylab("Probability of Attrition")

p2 <- ggplot(churn_train2, aes(OverTime, .fitted, color = OverTime)) +
  geom_boxplot(show.legend = FALSE) +
  geom_rug(sides = "b", position = "jitter", alpha = 0.2, show.legend = FALSE) +
  ggtitle("Predicted probabilities for model2") +
  xlab("Over Time") +
  scale_y_continuous("Probability of Attrition", limits = c(0, 1))

gridExtra::grid.arrange(p1, p2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'

# 5.4 Multiple logistic regression ----
model3 <- glm(
  Attrition ~ MonthlyIncome + OverTime,
  family = "binomial", 
  data = churn_train
)

tidy(model3)
## # A tibble: 3 × 5
##   term           estimate std.error statistic  p.value
##   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   -1.33     0.177         -7.54 4.74e-14
## 2 MonthlyIncome -0.000147 0.0000280     -5.27 1.38e- 7
## 3 OverTimeYes    1.35     0.180          7.50 6.59e-14
# Fig 5.3
churn_train3 <- churn_train %>% mutate(prob = ifelse(Attrition == "Yes", 1, 0))
churn_train3 <- broom::augment(model3, churn_train3) %>% mutate(.fitted = exp(.fitted))

ggplot(churn_train3, aes(MonthlyIncome, prob, color = OverTime)) +
  geom_point(alpha = .15) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  ggtitle("Predicted probabilities for model3") +
  xlab("Monthly Income") +
  ylab("Probability of Attrition")
## `geom_smooth()` using formula = 'y ~ x'

churn_train3$prob
##    [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##   [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##   [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [630] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [667] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [704] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [741] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [778] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [815] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [852] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [889] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [926] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [963] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1000] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
library(ISLR2)
data(Default)
fit1 <- glm(default ~ balance + income, data = Default, family = "binomial")
summary(fit1)
## 
## Call:
## glm(formula = default ~ balance + income, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
glm(formula = default ~ balance + income, family = "binomial", 
      data = Default)
## 
## Call:  glm(formula = default ~ balance + income, family = "binomial", 
##     data = Default)
## 
## Coefficients:
## (Intercept)      balance       income  
##  -1.154e+01    5.647e-03    2.081e-05  
## 
## Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
## Null Deviance:       2921 
## Residual Deviance: 1579  AIC: 1585