library(rsample)
## Loading required package: broom
## Loading required package: tidyr
##
## Attaching package: 'rsample'
## The following object is masked from 'package:tidyr':
##
## fill
library(yardstick)
library(corrr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ rsample::fill() masks tidyr::fill()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-16
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Load raw data
raw_DDSA <- read_excel("CaseStudy2-data.xlsx")
# Change characters to factors for more meaningfull analysis
DDSA <- raw_DDSA%>%
mutate_if(is.character, as.factor) %>%
select(Attrition, everything())
# attach so we don't have to keep writing DDSA$
attach(DDSA)
# View observations and variables
glimpse(DDSA)
## Observations: 1,470
## Variables: 35
## $ Attrition <fct> Yes, No, Yes, No, No, No, No, No, No,...
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 3...
## $ BusinessTravel <fct> Travel_Rarely, Travel_Frequently, Tra...
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 132...
## $ Department <fct> Sales, Research & Development, Resear...
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, ...
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1...
## $ EducationField <fct> Life Sciences, Life Sciences, Other, ...
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14,...
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1...
## $ Gender <fct> Female, Male, Male, Female, Male, Mal...
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 9...
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3...
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1...
## $ JobRole <fct> Sales Executive, Research Scientist, ...
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3...
## $ MaritalStatus <fct> Single, Married, Single, Married, Mar...
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2...
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 118...
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1...
## $ Over18 <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ OverTime <fct> Yes, No, Yes, Yes, No, No, Yes, No, N...
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 1...
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3...
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4...
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1...
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, ...
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1...
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2...
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, ...
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2...
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4...
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3...
# View summary statistics
summary(DDSA)
## Attrition Age BusinessTravel DailyRate
## No :1233 Min. :18.00 Non-Travel : 150 Min. : 102.0
## Yes: 237 1st Qu.:30.00 Travel_Frequently: 277 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :1043 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 63 Min. : 1.000 Min. :1.000
## Research & Development:961 1st Qu.: 2.000 1st Qu.:2.000
## Sales :446 Median : 7.000 Median :3.000
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 27 Min. :1 Min. : 1.0
## Life Sciences :606 1st Qu.:1 1st Qu.: 491.2
## Marketing :159 Median :1 Median :1020.5
## Medical :464 Mean :1 Mean :1024.9
## Other : 82 3rd Qu.:1 3rd Qu.:1555.8
## Technical Degree:132 Max. :1 Max. :2068.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## Min. :1.000 Female:588 Min. : 30.00 Min. :1.00
## 1st Qu.:2.000 Male :882 1st Qu.: 48.00 1st Qu.:2.00
## Median :3.000 Median : 66.00 Median :3.00
## Mean :2.722 Mean : 65.89 Mean :2.73
## 3rd Qu.:4.000 3rd Qu.: 83.75 3rd Qu.:3.00
## Max. :4.000 Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction
## Min. :1.000 Sales Executive :326 Min. :1.000
## 1st Qu.:1.000 Research Scientist :292 1st Qu.:2.000
## Median :2.000 Laboratory Technician :259 Median :3.000
## Mean :2.064 Manufacturing Director :145 Mean :2.729
## 3rd Qu.:3.000 Healthcare Representative:131 3rd Qu.:4.000
## Max. :5.000 Manager :102 Max. :4.000
## (Other) :215
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked
## Divorced:327 Min. : 1009 Min. : 2094 Min. :0.000
## Married :673 1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000
## Single :470 Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## Over18 OverTime PercentSalaryHike PerformanceRating
## Y:1470 No :1054 Min. :11.00 Min. :3.000
## Yes: 416 1st Qu.:12.00 1st Qu.:3.000
## Median :14.00 Median :3.000
## Mean :15.21 Mean :3.154
## 3rd Qu.:18.00 3rd Qu.:3.000
## Max. :25.00 Max. :4.000
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.712 Mean :80 Mean :0.7939 Mean :11.28
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.799 Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
# Changed the following variables to categorical to eventually have a LifeSatisfaction variable
DDSA$DistanceFromHome <- cut(DDSA$DistanceFromHome, breaks = c(-Inf, 9, 21, Inf), labels=c("Poor", "Fair", "Good"))
DDSA$RelationshipSatisfaction <- cut(DDSA$RelationshipSatisfaction, breaks = c(-Inf, 2, 3, Inf), labels=c("Poor", "Fair", "Good"))
DDSA$WorkLifeBalance <- cut(DDSA$WorkLifeBalance, breaks = c(-Inf, 2, 3, Inf), labels=c("Poor", "Fair", "Good"))
ggplot(data = DDSA) +
geom_bar(mapping = aes(x=DistanceFromHome))

ggplot(data = DDSA) +
geom_bar(mapping = aes(x=RelationshipSatisfaction))

ggplot(data = DDSA) +
geom_bar(mapping = aes(x=WorkLifeBalance))

# Is there a relationship between Age and Income.
# Color each point based on the Gender of the participant
ggplot(data=DDSA) +
geom_point(mapping = aes(x = Age, y = MonthlyIncome, color=Gender)) +
geom_smooth(mapping = aes(x = Age, y = MonthlyIncome))
## `geom_smooth()` using method = 'gam'

# Split test/training sets ~ uses the rsample package
set.seed(100)
train_test_split <- initial_split(DDSA, prop = 0.8)
train_test_split
## <1177/293/1470>
# Retrieve train and test sets
train_tbl <- training(train_test_split)
test_tbl <- testing(train_test_split)
table(test_tbl$Attrition)
##
## No Yes
## 237 56
237/nrow(test_tbl)
## [1] 0.8088737
modelCart = rpart(Attrition ~ ., data=train_tbl, method="class")
#Plot the model
prp(modelCart)

#Predict the test data
predictionCart <- predict(modelCart, newdata=test_tbl, type="class")
#CART Accuracy
#Confusion matrix
t1 <- table(test_tbl$Attrition, predictionCart)
print(t1)
## predictionCart
## No Yes
## No 228 9
## Yes 40 16
# Confusion Matrix info: http://cs229.stanford.edu/section/evaluation_metrics.pdf
#CART model accuracy
(t1[1]+t1[4])/(nrow(test_tbl))
## [1] 0.8327645
Need to plot ROC curve
# Random Forest model
modelRf = randomForest(Attrition ~ ., data=train_tbl, ntree = 100, mtry = 5, importance = TRUE, method="class")
print(modelRf)
##
## Call:
## randomForest(formula = Attrition ~ ., data = train_tbl, ntree = 100, mtry = 5, importance = TRUE, method = "class")
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 13.34%
## Confusion matrix:
## No Yes class.error
## No 989 7 0.007028112
## Yes 150 31 0.828729282
# Random Forest info: https://medium.com/@williamkoehrsen/random-forest-simple-explanation-377895a60d2d
# OOB estimate of error rate: 13.34% ~
#OOB vs No. Of Trees
# Out-of-bag (OOB) error ~ a method of measuring the prediction error of random forests,
# boosted decision trees, and other machine learning models utilizing bootstrap aggregating
#(bagging) to sub-sample data samples used for training.
plot(modelRf, main="")
legend("topright", c("OOB", "0", "1"), text.col=1:6, lty=1:3, col=1:3)
title(main="Error Rates Random Forest")

## List the importance of the variables.
impVar <- round(randomForest::importance(modelRf), 2)
impVar[order(impVar[,3], decreasing=TRUE),]
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## OverTime 6.95 8.47 9.75 15.72
## Age 4.80 6.13 7.40 19.32
## MonthlyIncome 6.12 1.85 7.21 22.06
## TotalWorkingYears 4.92 2.90 6.21 15.00
## JobRole 4.38 4.14 6.08 15.97
## MaritalStatus 4.71 4.86 6.04 6.74
## YearsAtCompany 3.22 3.21 4.92 12.33
## YearsWithCurrManager 4.31 2.03 4.54 9.06
## JobLevel 3.51 3.91 4.49 7.04
## StockOptionLevel 2.11 4.18 3.56 8.20
## JobSatisfaction 2.93 2.42 3.54 8.41
## EnvironmentSatisfaction 2.54 2.57 3.43 9.35
## BusinessTravel 2.60 2.19 3.07 5.85
## YearsInCurrentRole 2.39 1.46 2.96 8.07
## Department 1.62 2.74 2.80 3.42
## NumCompaniesWorked 0.45 2.84 1.51 10.13
## EducationField 0.84 1.21 1.41 10.67
## DailyRate 0.44 1.91 1.25 16.99
## Education 1.51 -0.55 1.18 6.09
## YearsSinceLastPromotion 1.46 -0.90 1.04 7.51
## WorkLifeBalance 0.47 1.15 0.93 6.44
## HourlyRate 0.51 0.82 0.85 14.15
## TrainingTimesLastYear 0.26 0.89 0.59 7.27
## DistanceFromHome -0.49 1.93 0.54 6.25
## EmployeeNumber -0.05 0.28 0.17 15.12
## EmployeeCount 0.00 0.00 0.00 0.00
## Over18 0.00 0.00 0.00 0.00
## StandardHours 0.00 0.00 0.00 0.00
## RelationshipSatisfaction 0.27 -0.66 -0.03 5.89
## Gender -0.87 0.83 -0.46 1.85
## JobInvolvement -0.94 -0.50 -1.04 6.20
## PercentSalaryHike -1.90 0.92 -1.35 9.44
## PerformanceRating -1.59 -0.57 -1.65 1.27
## MonthlyRate -2.48 0.94 -1.82 14.06
# Most important variables of attrition
attrition_variables <- DDSA %>%
tibble::as_tibble() %>%
select(Attrition, OverTime,Age, MonthlyIncome) %>%
rowid_to_column(var = "Case")
attrition_variables
## # A tibble: 1,470 x 5
## Case Attrition OverTime Age MonthlyIncome
## <int> <fct> <fct> <dbl> <dbl>
## 1 1 Yes Yes 41. 5993.
## 2 2 No No 49. 5130.
## 3 3 Yes Yes 37. 2090.
## 4 4 No Yes 33. 2909.
## 5 5 No No 27. 3468.
## 6 6 No No 32. 3068.
## 7 7 No Yes 59. 2670.
## 8 8 No No 30. 2693.
## 9 9 No No 38. 9526.
## 10 10 No No 36. 5237.
## # ... with 1,460 more rows