library(modeldata)
##
## Attaching package: 'modeldata'
## The following object is masked from 'package:datasets':
##
## penguins
library(rsample)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:rsample':
##
## calibration
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data("attrition")
set.seed(123)
churn_split <- initial_split(attrition, prop = 0.7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test <- testing(churn_split)
# Model 1: Attrition based on Monthly Income
model1 <- glm(Attrition ~ MonthlyIncome,
data = churn_train, family = "binomial")
# Model 2: Attrition based on Monthly Income + Overtime
model2 <- glm(Attrition ~ MonthlyIncome + OverTime,
data = churn_train, family = "binomial")
# Model 3: Attrition based on ALL variables
model3 <- glm(Attrition ~ .,
data = churn_train, family = "binomial")
# Print summaries for comparison
summary(model1)
##
## Call:
## glm(formula = Attrition ~ MonthlyIncome, family = "binomial",
## data = churn_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.861e-01 1.572e-01 -5.636 1.74e-08 ***
## MonthlyIncome -1.386e-04 2.719e-05 -5.098 3.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 905.68 on 1027 degrees of freedom
## Residual deviance: 870.83 on 1026 degrees of freedom
## AIC: 874.83
##
## Number of Fisher Scoring iterations: 5
summary(model2)
##
## Call:
## glm(formula = Attrition ~ MonthlyIncome + OverTime, family = "binomial",
## data = churn_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.333e+00 1.769e-01 -7.539 4.74e-14 ***
## MonthlyIncome -1.474e-04 2.799e-05 -5.268 1.38e-07 ***
## OverTimeYes 1.349e+00 1.800e-01 7.496 6.59e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 905.68 on 1027 degrees of freedom
## Residual deviance: 815.00 on 1025 degrees of freedom
## AIC: 821
##
## Number of Fisher Scoring iterations: 5
summary(model3)
##
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = churn_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.906e+01 1.427e+03 -0.013 0.989344
## Age -2.359e-02 1.742e-02 -1.354 0.175683
## BusinessTravelTravel_Frequently 2.568e+00 5.586e-01 4.597 4.29e-06 ***
## BusinessTravelTravel_Rarely 1.575e+00 5.163e-01 3.050 0.002291 **
## DailyRate -5.920e-05 2.760e-04 -0.215 0.830152
## DepartmentResearch_Development 1.514e+01 1.427e+03 0.011 0.991538
## DepartmentSales 1.187e-01 1.611e+03 0.000 0.999941
## DistanceFromHome 5.276e-02 1.332e-02 3.961 7.47e-05 ***
## Education.L -5.112e-02 4.889e-01 -0.105 0.916721
## Education.Q -3.151e-01 4.161e-01 -0.757 0.448894
## Education.C -7.785e-02 3.203e-01 -0.243 0.807958
## Education^4 5.476e-02 2.320e-01 0.236 0.813397
## EducationFieldLife_Sciences -7.256e-01 8.902e-01 -0.815 0.414983
## EducationFieldMarketing -2.145e-01 9.592e-01 -0.224 0.823076
## EducationFieldMedical -8.363e-01 9.044e-01 -0.925 0.355149
## EducationFieldOther -7.631e-01 1.003e+00 -0.761 0.446674
## EducationFieldTechnical_Degree 2.103e-01 9.222e-01 0.228 0.819564
## EnvironmentSatisfaction.L -1.033e+00 2.292e-01 -4.506 6.61e-06 ***
## EnvironmentSatisfaction.Q 3.409e-01 2.266e-01 1.504 0.132453
## EnvironmentSatisfaction.C -3.215e-01 2.300e-01 -1.398 0.162261
## GenderMale 6.367e-01 2.404e-01 2.649 0.008078 **
## HourlyRate 2.626e-03 5.574e-03 0.471 0.637529
## JobInvolvement.L -1.306e+00 4.126e-01 -3.166 0.001548 **
## JobInvolvement.Q -1.445e-01 3.272e-01 -0.442 0.658791
## JobInvolvement.C -2.743e-01 2.193e-01 -1.251 0.211077
## JobLevel -1.687e-02 4.106e-01 -0.041 0.967225
## JobRoleHuman_Resources 1.699e+01 1.427e+03 0.012 0.990502
## JobRoleLaboratory_Technician 1.740e+00 6.190e-01 2.810 0.004950 **
## JobRoleManager 2.430e-01 1.110e+00 0.219 0.826648
## JobRoleManufacturing_Director 7.649e-01 6.323e-01 1.210 0.226377
## JobRoleResearch_Director -1.568e+00 1.361e+00 -1.152 0.249441
## JobRoleResearch_Scientist 5.864e-01 6.446e-01 0.910 0.363004
## JobRoleSales_Executive 1.610e+01 7.475e+02 0.022 0.982810
## JobRoleSales_Representative 1.707e+01 7.475e+02 0.023 0.981779
## JobSatisfaction.L -1.003e+00 2.315e-01 -4.330 1.49e-05 ***
## JobSatisfaction.Q 1.128e-01 2.288e-01 0.493 0.622017
## JobSatisfaction.C -4.714e-01 2.337e-01 -2.017 0.043710 *
## MaritalStatusMarried 1.383e-01 3.233e-01 0.428 0.668805
## MaritalStatusSingle 1.516e+00 4.245e-01 3.571 0.000356 ***
## MonthlyIncome 5.920e-05 1.046e-04 0.566 0.571504
## MonthlyRate -3.168e-06 1.575e-05 -0.201 0.840613
## NumCompaniesWorked 1.508e-01 5.023e-02 3.002 0.002680 **
## OverTimeYes 2.086e+00 2.498e-01 8.352 < 2e-16 ***
## PercentSalaryHike -2.437e-02 4.882e-02 -0.499 0.617706
## PerformanceRating.L 9.753e-03 3.559e-01 0.027 0.978141
## RelationshipSatisfaction.L -7.115e-01 2.440e-01 -2.916 0.003547 **
## RelationshipSatisfaction.Q 2.048e-01 2.321e-01 0.882 0.377613
## RelationshipSatisfaction.C -1.236e-01 2.256e-01 -0.548 0.583870
## StockOptionLevel 6.136e-03 1.908e-01 0.032 0.974345
## TotalWorkingYears -9.310e-02 3.801e-02 -2.449 0.014313 *
## TrainingTimesLastYear -1.842e-01 9.061e-02 -2.033 0.042030 *
## WorkLifeBalance.L -6.501e-01 3.709e-01 -1.753 0.079648 .
## WorkLifeBalance.Q 5.031e-01 3.016e-01 1.668 0.095285 .
## WorkLifeBalance.C 3.148e-01 2.189e-01 1.438 0.150516
## YearsAtCompany 8.674e-02 5.629e-02 1.541 0.123330
## YearsInCurrentRole -1.951e-01 5.985e-02 -3.260 0.001114 **
## YearsSinceLastPromotion 1.937e-01 5.512e-02 3.514 0.000441 ***
## YearsWithCurrManager -1.096e-01 6.181e-02 -1.773 0.076256 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 905.68 on 1027 degrees of freedom
## Residual deviance: 566.48 on 970 degrees of freedom
## AIC: 682.48
##
## Number of Fisher Scoring iterations: 16
# Predict on test data
test_probs <- predict(model3, newdata = churn_test, type = "response")
# Convert probabilities to factors ('No' or 'Yes')
test_preds <- factor(ifelse(test_probs > 0.5, "Yes", "No"),
levels = levels(churn_test$Attrition))
# Create and Print the Confusion Matrix
conf_matrix <- confusionMatrix(data = test_preds,
reference = churn_test$Attrition)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 353 38
## Yes 17 34
##
## Accuracy : 0.8756
## 95% CI : (0.8411, 0.9049)
## No Information Rate : 0.8371
## P-Value [Acc > NIR] : 0.014514
##
## Kappa : 0.483
##
## Mcnemar's Test P-Value : 0.007001
##
## Sensitivity : 0.9541
## Specificity : 0.4722
## Pos Pred Value : 0.9028
## Neg Pred Value : 0.6667
## Prevalence : 0.8371
## Detection Rate : 0.7986
## Detection Prevalence : 0.8846
## Balanced Accuracy : 0.7131
##
## 'Positive' Class : No
##