library(modeldata)
## 
## Attaching package: 'modeldata'
## The following object is masked from 'package:datasets':
## 
##     penguins
library(rsample)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:rsample':
## 
##     calibration
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data("attrition")
set.seed(123)
churn_split <- initial_split(attrition, prop = 0.7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test  <- testing(churn_split)
# Model 1: Attrition based on Monthly Income
model1 <- glm(Attrition ~ MonthlyIncome, 
              data = churn_train, family = "binomial")

# Model 2: Attrition based on Monthly Income + Overtime
model2 <- glm(Attrition ~ MonthlyIncome + OverTime, 
              data = churn_train, family = "binomial")

# Model 3: Attrition based on ALL variables
model3 <- glm(Attrition ~ ., 
              data = churn_train, family = "binomial")

# Print summaries for comparison
summary(model1)
## 
## Call:
## glm(formula = Attrition ~ MonthlyIncome, family = "binomial", 
##     data = churn_train)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.861e-01  1.572e-01  -5.636 1.74e-08 ***
## MonthlyIncome -1.386e-04  2.719e-05  -5.098 3.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 905.68  on 1027  degrees of freedom
## Residual deviance: 870.83  on 1026  degrees of freedom
## AIC: 874.83
## 
## Number of Fisher Scoring iterations: 5
summary(model2)
## 
## Call:
## glm(formula = Attrition ~ MonthlyIncome + OverTime, family = "binomial", 
##     data = churn_train)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.333e+00  1.769e-01  -7.539 4.74e-14 ***
## MonthlyIncome -1.474e-04  2.799e-05  -5.268 1.38e-07 ***
## OverTimeYes    1.349e+00  1.800e-01   7.496 6.59e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 905.68  on 1027  degrees of freedom
## Residual deviance: 815.00  on 1025  degrees of freedom
## AIC: 821
## 
## Number of Fisher Scoring iterations: 5
summary(model3)
## 
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = churn_train)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.906e+01  1.427e+03  -0.013 0.989344    
## Age                             -2.359e-02  1.742e-02  -1.354 0.175683    
## BusinessTravelTravel_Frequently  2.568e+00  5.586e-01   4.597 4.29e-06 ***
## BusinessTravelTravel_Rarely      1.575e+00  5.163e-01   3.050 0.002291 ** 
## DailyRate                       -5.920e-05  2.760e-04  -0.215 0.830152    
## DepartmentResearch_Development   1.514e+01  1.427e+03   0.011 0.991538    
## DepartmentSales                  1.187e-01  1.611e+03   0.000 0.999941    
## DistanceFromHome                 5.276e-02  1.332e-02   3.961 7.47e-05 ***
## Education.L                     -5.112e-02  4.889e-01  -0.105 0.916721    
## Education.Q                     -3.151e-01  4.161e-01  -0.757 0.448894    
## Education.C                     -7.785e-02  3.203e-01  -0.243 0.807958    
## Education^4                      5.476e-02  2.320e-01   0.236 0.813397    
## EducationFieldLife_Sciences     -7.256e-01  8.902e-01  -0.815 0.414983    
## EducationFieldMarketing         -2.145e-01  9.592e-01  -0.224 0.823076    
## EducationFieldMedical           -8.363e-01  9.044e-01  -0.925 0.355149    
## EducationFieldOther             -7.631e-01  1.003e+00  -0.761 0.446674    
## EducationFieldTechnical_Degree   2.103e-01  9.222e-01   0.228 0.819564    
## EnvironmentSatisfaction.L       -1.033e+00  2.292e-01  -4.506 6.61e-06 ***
## EnvironmentSatisfaction.Q        3.409e-01  2.266e-01   1.504 0.132453    
## EnvironmentSatisfaction.C       -3.215e-01  2.300e-01  -1.398 0.162261    
## GenderMale                       6.367e-01  2.404e-01   2.649 0.008078 ** 
## HourlyRate                       2.626e-03  5.574e-03   0.471 0.637529    
## JobInvolvement.L                -1.306e+00  4.126e-01  -3.166 0.001548 ** 
## JobInvolvement.Q                -1.445e-01  3.272e-01  -0.442 0.658791    
## JobInvolvement.C                -2.743e-01  2.193e-01  -1.251 0.211077    
## JobLevel                        -1.687e-02  4.106e-01  -0.041 0.967225    
## JobRoleHuman_Resources           1.699e+01  1.427e+03   0.012 0.990502    
## JobRoleLaboratory_Technician     1.740e+00  6.190e-01   2.810 0.004950 ** 
## JobRoleManager                   2.430e-01  1.110e+00   0.219 0.826648    
## JobRoleManufacturing_Director    7.649e-01  6.323e-01   1.210 0.226377    
## JobRoleResearch_Director        -1.568e+00  1.361e+00  -1.152 0.249441    
## JobRoleResearch_Scientist        5.864e-01  6.446e-01   0.910 0.363004    
## JobRoleSales_Executive           1.610e+01  7.475e+02   0.022 0.982810    
## JobRoleSales_Representative      1.707e+01  7.475e+02   0.023 0.981779    
## JobSatisfaction.L               -1.003e+00  2.315e-01  -4.330 1.49e-05 ***
## JobSatisfaction.Q                1.128e-01  2.288e-01   0.493 0.622017    
## JobSatisfaction.C               -4.714e-01  2.337e-01  -2.017 0.043710 *  
## MaritalStatusMarried             1.383e-01  3.233e-01   0.428 0.668805    
## MaritalStatusSingle              1.516e+00  4.245e-01   3.571 0.000356 ***
## MonthlyIncome                    5.920e-05  1.046e-04   0.566 0.571504    
## MonthlyRate                     -3.168e-06  1.575e-05  -0.201 0.840613    
## NumCompaniesWorked               1.508e-01  5.023e-02   3.002 0.002680 ** 
## OverTimeYes                      2.086e+00  2.498e-01   8.352  < 2e-16 ***
## PercentSalaryHike               -2.437e-02  4.882e-02  -0.499 0.617706    
## PerformanceRating.L              9.753e-03  3.559e-01   0.027 0.978141    
## RelationshipSatisfaction.L      -7.115e-01  2.440e-01  -2.916 0.003547 ** 
## RelationshipSatisfaction.Q       2.048e-01  2.321e-01   0.882 0.377613    
## RelationshipSatisfaction.C      -1.236e-01  2.256e-01  -0.548 0.583870    
## StockOptionLevel                 6.136e-03  1.908e-01   0.032 0.974345    
## TotalWorkingYears               -9.310e-02  3.801e-02  -2.449 0.014313 *  
## TrainingTimesLastYear           -1.842e-01  9.061e-02  -2.033 0.042030 *  
## WorkLifeBalance.L               -6.501e-01  3.709e-01  -1.753 0.079648 .  
## WorkLifeBalance.Q                5.031e-01  3.016e-01   1.668 0.095285 .  
## WorkLifeBalance.C                3.148e-01  2.189e-01   1.438 0.150516    
## YearsAtCompany                   8.674e-02  5.629e-02   1.541 0.123330    
## YearsInCurrentRole              -1.951e-01  5.985e-02  -3.260 0.001114 ** 
## YearsSinceLastPromotion          1.937e-01  5.512e-02   3.514 0.000441 ***
## YearsWithCurrManager            -1.096e-01  6.181e-02  -1.773 0.076256 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 905.68  on 1027  degrees of freedom
## Residual deviance: 566.48  on  970  degrees of freedom
## AIC: 682.48
## 
## Number of Fisher Scoring iterations: 16
# Predict on test data
test_probs <- predict(model3, newdata = churn_test, type = "response")

# Convert probabilities to factors ('No' or 'Yes')
test_preds <- factor(ifelse(test_probs > 0.5, "Yes", "No"), 
                     levels = levels(churn_test$Attrition))

# Create and Print the Confusion Matrix
conf_matrix <- confusionMatrix(data = test_preds, 
                               reference = churn_test$Attrition)

print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  353  38
##        Yes  17  34
##                                           
##                Accuracy : 0.8756          
##                  95% CI : (0.8411, 0.9049)
##     No Information Rate : 0.8371          
##     P-Value [Acc > NIR] : 0.014514        
##                                           
##                   Kappa : 0.483           
##                                           
##  Mcnemar's Test P-Value : 0.007001        
##                                           
##             Sensitivity : 0.9541          
##             Specificity : 0.4722          
##          Pos Pred Value : 0.9028          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.8371          
##          Detection Rate : 0.7986          
##    Detection Prevalence : 0.8846          
##       Balanced Accuracy : 0.7131          
##                                           
##        'Positive' Class : No              
##