Import the data file
emp_raw <- read.csv("HR_Employee_Attrition_Data.csv")
emp <- emp_raw
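Note that str() below shows the character columns as factors because this analysis was run on R 3.3.x. On R 4.0 or later, read.csv() no longer converts strings to factors by default, so you may need the equivalent call below (an assumption about your R version, not part of the original run):
emp_raw <- read.csv("HR_Employee_Attrition_Data.csv", stringsAsFactors = TRUE)  # only needed on R >= 4.0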
Explore the structure of the data
str(emp)
## 'data.frame': 2940 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
summary(emp)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :2466 Non-Travel : 300 Min. : 102.0
## 1st Qu.:30.00 Yes: 474 Travel_Frequently: 554 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :2086 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 126 Min. : 1.000 Min. :1.000
## Research & Development:1922 1st Qu.: 2.000 1st Qu.:2.000
## Sales : 892 Median : 7.000 Median :3.000
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 54 Min. :1 Min. : 1.0
## Life Sciences :1212 1st Qu.:1 1st Qu.: 735.8
## Marketing : 318 Median :1 Median :1470.5
## Medical : 928 Mean :1 Mean :1470.5
## Other : 164 3rd Qu.:1 3rd Qu.:2205.2
## Technical Degree: 264 Max. :1 Max. :2940.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## Min. :1.000 Female:1176 Min. : 30.00 Min. :1.00
## 1st Qu.:2.000 Male :1764 1st Qu.: 48.00 1st Qu.:2.00
## Median :3.000 Median : 66.00 Median :3.00
## Mean :2.722 Mean : 65.89 Mean :2.73
## 3rd Qu.:4.000 3rd Qu.: 84.00 3rd Qu.:3.00
## Max. :4.000 Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction
## Min. :1.000 Sales Executive :652 Min. :1.000
## 1st Qu.:1.000 Research Scientist :584 1st Qu.:2.000
## Median :2.000 Laboratory Technician :518 Median :3.000
## Mean :2.064 Manufacturing Director :290 Mean :2.729
## 3rd Qu.:3.000 Healthcare Representative:262 3rd Qu.:4.000
## Max. :5.000 Manager :204 Max. :4.000
## (Other) :430
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked
## Divorced: 654 Min. : 1009 Min. : 2094 Min. :0.000
## Married :1346 1st Qu.: 2911 1st Qu.: 8045 1st Qu.:1.000
## Single : 940 Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8380 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## Over18 OverTime PercentSalaryHike PerformanceRating
## Y:2940 No :2108 Min. :11.00 Min. :3.000
## Yes: 832 1st Qu.:12.00 1st Qu.:3.000
## Median :14.00 Median :3.000
## Mean :15.21 Mean :3.154
## 3rd Qu.:18.00 3rd Qu.:3.000
## Max. :25.00 Max. :4.000
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.712 Mean :80 Mean :0.7939 Mean :11.28
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.799 Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
Drop the columns with no variability.
Also drop EmployeeNumber, as it is just an identifier.
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
nearZeroVar(emp)
## [1] 9 22 27
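The indices 9, 22 and 27 correspond to EmployeeCount, Over18 and StandardHours in the str() output above. To see the names directly rather than the indices (output not shown):
names(emp)[nearZeroVar(emp)]  # column names flagged as (near) zero variance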
emp$Over18 <- NULL
emp$EmployeeCount <- NULL
emp$StandardHours <- NULL
emp$EmployeeNumber <- NULL
Several variables are stored as integers but are actually categorical (ordinal ratings and levels), so convert them to factors.
emp$Education <- as.factor(emp$Education)
emp$EnvironmentSatisfaction <- as.factor(emp$EnvironmentSatisfaction)
emp$JobInvolvement <- as.factor(emp$JobInvolvement)
emp$JobLevel <- as.factor(emp$JobLevel)
emp$JobSatisfaction <- as.factor(emp$JobSatisfaction)
emp$PerformanceRating <- as.factor(emp$PerformanceRating)
emp$RelationshipSatisfaction <- as.factor(emp$RelationshipSatisfaction)
emp$StockOptionLevel <- as.factor(emp$StockOptionLevel)
emp$TrainingTimesLastYear <- as.factor(emp$TrainingTimesLastYear)
emp$WorkLifeBalance <- as.factor(emp$WorkLifeBalance)
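The same conversions can be written more compactly by looping over a vector of column names; a sketch equivalent to the assignments above (ord_cols is just an illustrative helper vector):
# illustrative helper: columns holding ordinal ratings/levels to treat as factors
ord_cols <- c("Education", "EnvironmentSatisfaction", "JobInvolvement", "JobLevel",
              "JobSatisfaction", "PerformanceRating", "RelationshipSatisfaction",
              "StockOptionLevel", "TrainingTimesLastYear", "WorkLifeBalance")
emp[ord_cols] <- lapply(emp[ord_cols], as.factor)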
Create Training and Testing Sets.
set.seed(777)
library(caTools)
## Warning: package 'caTools' was built under R version 3.3.3
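# sample.split() keeps the Attrition class ratio the same in both subsets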
split = sample.split(emp$Attrition, SplitRatio = 0.75)
# Create training and testing sets
train = subset(emp, split == TRUE)
test = subset(emp, split == FALSE)
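As a quick sanity check, the attrition rate should be roughly the same in both subsets (output not shown):
prop.table(table(train$Attrition))  # class proportions in the training set
prop.table(table(test$Attrition))   # class proportions in the test set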
Build the CART Model
# Load CART packages
library(rpart)
## Warning: package 'rpart' was built under R version 3.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
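# Fit a classification tree: Attrition as the response, all remaining variables as predictors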
model = rpart(Attrition ~ ., data=train, method="class")
#Plot the model
prp(model)
#Predict on the test data
prediction <- predict(model, newdata=test, type="class")
Baseline Accuracy vs CART Accuracy
# Baseline Accuracy
table(test$Attrition)
##
## No Yes
## 616 118
616/nrow(test)
## [1] 0.8392371
#Confusion matrix
table(test$Attrition, prediction)
## prediction
## No Yes
## No 603 13
## Yes 78 40
#CART model accuracy
(603+40)/(nrow(test))
## [1] 0.8760218
Baseline Accuracy - if we simply predict "No" attrition for every observation, we get an accuracy of about 84%. Model Accuracy - the CART model achieves 87.6%, an improvement of roughly 4 percentage points over the baseline.
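Because the classes are imbalanced, accuracy alone can hide poor detection of the minority class, so it is also worth checking sensitivity and specificity. A minimal sketch using caret's confusionMatrix() (caret is already loaded above), treating "Yes" as the positive class; output not shown:
confusionMatrix(prediction, test$Attrition, positive = "Yes")  # accuracy, sensitivity, specificity, kappa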
As a fully grown tree is prone to overfitting, let's prune the tree and see if we can improve the model.
printcp(model)
##
## Classification tree:
## rpart(formula = Attrition ~ ., data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Age DailyRate
## [3] DistanceFromHome EducationField
## [5] EnvironmentSatisfaction HourlyRate
## [7] JobRole JobSatisfaction
## [9] MaritalStatus MonthlyIncome
## [11] NumCompaniesWorked OverTime
## [13] RelationshipSatisfaction StockOptionLevel
## [15] TotalWorkingYears
##
## Root node error: 356/2206 = 0.16138
##
## n= 2206
##
## CP nsplit rel error xerror xstd
## 1 0.041199 0 1.00000 1.00000 0.048535
## 2 0.018258 3 0.87640 0.94382 0.047406
## 3 0.015449 9 0.76404 0.89326 0.046341
## 4 0.014045 13 0.69944 0.85955 0.045602
## 5 0.012640 14 0.68539 0.83989 0.045160
## 6 0.011236 18 0.62640 0.83146 0.044969
## 7 0.010000 19 0.61517 0.83989 0.045160
plotcp(model)
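# Select the CP value with the lowest cross-validated error (xerror) from the CP table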
bestcp <- model$cptable[which.min(model$cptable[,"xerror"]),"CP"]
prunedModel <-prune(model, cp= bestcp)
prp(prunedModel)
printcp(prunedModel)
##
## Classification tree:
## rpart(formula = Attrition ~ ., data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Age DailyRate
## [3] DistanceFromHome EnvironmentSatisfaction
## [5] HourlyRate JobRole
## [7] JobSatisfaction MaritalStatus
## [9] MonthlyIncome NumCompaniesWorked
## [11] OverTime RelationshipSatisfaction
## [13] StockOptionLevel TotalWorkingYears
##
## Root node error: 356/2206 = 0.16138
##
## n= 2206
##
## CP nsplit rel error xerror xstd
## 1 0.041199 0 1.00000 1.00000 0.048535
## 2 0.018258 3 0.87640 0.94382 0.047406
## 3 0.015449 9 0.76404 0.89326 0.046341
## 4 0.014045 13 0.69944 0.85955 0.045602
## 5 0.012640 14 0.68539 0.83989 0.045160
## 6 0.011236 18 0.62640 0.83146 0.044969
plotcp(prunedModel)
#Predict on the test data
prediction_pm <- predict(prunedModel, newdata=test, type="class")
table(test$Attrition, prediction_pm)
## prediction_pm
## No Yes
## No 597 19
## Yes 76 42
(597+42)/nrow(test)
## [1] 0.8705722
So pruning does not improve the model's accuracy; it actually drops slightly, to approximately 87%, after pruning.
Finally, evaluate the pruned model's discriminative ability with an ROC curve and the area under the curve (AUC).
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Warning: package 'gplots' was built under R version 3.3.3
prediction_ROC <- predict(prunedModel, newdata=test)
pred = prediction(prediction_ROC[,2], test$Attrition)
perf = performance(pred, "tpr", "fpr")
plot(perf)
#Area under the curve
as.numeric(performance(pred, "auc")@y.values)
## [1] 0.7433276
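For comparison, the same ROC/AUC computation can be applied to the unpruned tree to check whether pruning cost any discriminative power. A sketch along the same lines (prediction_ROC_full and pred_full are illustrative names; output not shown):
prediction_ROC_full <- predict(model, newdata = test)  # class probabilities from the unpruned tree
pred_full <- prediction(prediction_ROC_full[,2], test$Attrition)
as.numeric(performance(pred_full, "auc")@y.values)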