DDS_Analytics

library(rsample)

## Loading required package: broom

## Loading required package: tidyr

## 
## Attaching package: 'rsample'

## The following object is masked from 'package:tidyr':
## 
##     fill

library(yardstick)
library(corrr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readxl)
library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ rsample::fill() masks tidyr::fill()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ readr::spec()   masks yardstick::spec()

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loaded glmnet 2.0-16

library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

# Load raw data
raw_DDSA <- read_excel("CaseStudy2-data.xlsx")

# Change characters to factors for more meaningfull analysis
DDSA <- raw_DDSA%>%
  mutate_if(is.character, as.factor) %>%
  select(Attrition, everything())
# attach so we don't have to keep writing DDSA$
attach(DDSA)

# View observations and variables
glimpse(DDSA)

## Observations: 1,470
## Variables: 35
## $ Attrition                <fct> Yes, No, Yes, No, No, No, No, No, No,...
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 3...
## $ BusinessTravel           <fct> Travel_Rarely, Travel_Frequently, Tra...
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 132...
## $ Department               <fct> Sales, Research & Development, Resear...
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, ...
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1...
## $ EducationField           <fct> Life Sciences, Life Sciences, Other, ...
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14,...
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1...
## $ Gender                   <fct> Female, Male, Male, Female, Male, Mal...
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 9...
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3...
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1...
## $ JobRole                  <fct> Sales Executive, Research Scientist, ...
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3...
## $ MaritalStatus            <fct> Single, Married, Single, Married, Mar...
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2...
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 118...
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1...
## $ Over18                   <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ OverTime                 <fct> Yes, No, Yes, Yes, No, No, Yes, No, N...
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 1...
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3...
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4...
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1...
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, ...
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1...
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2...
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, ...
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2...
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4...
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3...

# View summary statistics
summary(DDSA)

##  Attrition       Age                  BusinessTravel   DailyRate     
##  No :1233   Min.   :18.00   Non-Travel       : 150   Min.   : 102.0  
##  Yes: 237   1st Qu.:30.00   Travel_Frequently: 277   1st Qu.: 465.0  
##             Median :36.00   Travel_Rarely    :1043   Median : 802.0  
##             Mean   :36.92                            Mean   : 802.5  
##             3rd Qu.:43.00                            3rd Qu.:1157.0  
##             Max.   :60.00                            Max.   :1499.0  
##                                                                      
##                   Department  DistanceFromHome   Education    
##  Human Resources       : 63   Min.   : 1.000   Min.   :1.000  
##  Research & Development:961   1st Qu.: 2.000   1st Qu.:2.000  
##  Sales                 :446   Median : 7.000   Median :3.000  
##                               Mean   : 9.193   Mean   :2.913  
##                               3rd Qu.:14.000   3rd Qu.:4.000  
##                               Max.   :29.000   Max.   :5.000  
##                                                               
##           EducationField EmployeeCount EmployeeNumber  
##  Human Resources : 27    Min.   :1     Min.   :   1.0  
##  Life Sciences   :606    1st Qu.:1     1st Qu.: 491.2  
##  Marketing       :159    Median :1     Median :1020.5  
##  Medical         :464    Mean   :1     Mean   :1024.9  
##  Other           : 82    3rd Qu.:1     3rd Qu.:1555.8  
##  Technical Degree:132    Max.   :1     Max.   :2068.0  
##                                                        
##  EnvironmentSatisfaction    Gender      HourlyRate     JobInvolvement
##  Min.   :1.000           Female:588   Min.   : 30.00   Min.   :1.00  
##  1st Qu.:2.000           Male  :882   1st Qu.: 48.00   1st Qu.:2.00  
##  Median :3.000                        Median : 66.00   Median :3.00  
##  Mean   :2.722                        Mean   : 65.89   Mean   :2.73  
##  3rd Qu.:4.000                        3rd Qu.: 83.75   3rd Qu.:3.00  
##  Max.   :4.000                        Max.   :100.00   Max.   :4.00  
##                                                                      
##     JobLevel                          JobRole    JobSatisfaction
##  Min.   :1.000   Sales Executive          :326   Min.   :1.000  
##  1st Qu.:1.000   Research Scientist       :292   1st Qu.:2.000  
##  Median :2.000   Laboratory Technician    :259   Median :3.000  
##  Mean   :2.064   Manufacturing Director   :145   Mean   :2.729  
##  3rd Qu.:3.000   Healthcare Representative:131   3rd Qu.:4.000  
##  Max.   :5.000   Manager                  :102   Max.   :4.000  
##                  (Other)                  :215                  
##   MaritalStatus MonthlyIncome    MonthlyRate    NumCompaniesWorked
##  Divorced:327   Min.   : 1009   Min.   : 2094   Min.   :0.000     
##  Married :673   1st Qu.: 2911   1st Qu.: 8047   1st Qu.:1.000     
##  Single  :470   Median : 4919   Median :14236   Median :2.000     
##                 Mean   : 6503   Mean   :14313   Mean   :2.693     
##                 3rd Qu.: 8379   3rd Qu.:20462   3rd Qu.:4.000     
##                 Max.   :19999   Max.   :26999   Max.   :9.000     
##                                                                   
##  Over18   OverTime   PercentSalaryHike PerformanceRating
##  Y:1470   No :1054   Min.   :11.00     Min.   :3.000    
##           Yes: 416   1st Qu.:12.00     1st Qu.:3.000    
##                      Median :14.00     Median :3.000    
##                      Mean   :15.21     Mean   :3.154    
##                      3rd Qu.:18.00     3rd Qu.:3.000    
##                      Max.   :25.00     Max.   :4.000    
##                                                         
##  RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
##  Min.   :1.000            Min.   :80    Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000            Median :80    Median :1.0000   Median :10.00    
##  Mean   :2.712            Mean   :80    Mean   :0.7939   Mean   :11.28    
##  3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:15.00    
##  Max.   :4.000            Max.   :80    Max.   :3.0000   Max.   :40.00    
##                                                                           
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.799         Mean   :2.761   Mean   : 7.008   Mean   : 4.229    
##  3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :6.000         Max.   :4.000   Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000      
##

# Changed the following variables to categorical to eventually have a LifeSatisfaction variable
DDSA$DistanceFromHome <- cut(DDSA$DistanceFromHome, breaks = c(-Inf, 9, 21, Inf), labels=c("Poor", "Fair", "Good"))

DDSA$RelationshipSatisfaction <- cut(DDSA$RelationshipSatisfaction, breaks = c(-Inf, 2, 3, Inf), labels=c("Poor", "Fair", "Good"))

DDSA$WorkLifeBalance <- cut(DDSA$WorkLifeBalance, breaks = c(-Inf, 2, 3, Inf), labels=c("Poor", "Fair", "Good"))

ggplot(data = DDSA) +
  geom_bar(mapping = aes(x=DistanceFromHome))

ggplot(data = DDSA) +
  geom_bar(mapping = aes(x=RelationshipSatisfaction))

ggplot(data = DDSA) +
  geom_bar(mapping = aes(x=WorkLifeBalance))

# Is there a relationship between Age and Income. 
# Color each point based on the Gender of the participant
ggplot(data=DDSA) +
  geom_point(mapping = aes(x = Age, y = MonthlyIncome, color=Gender)) +
  geom_smooth(mapping = aes(x = Age, y = MonthlyIncome))

## `geom_smooth()` using method = 'gam'

# Split test/training sets ~ uses the rsample package
set.seed(100)
train_test_split <- initial_split(DDSA, prop = 0.8)
train_test_split

## <1177/293/1470>

# Retrieve train and test sets
train_tbl <- training(train_test_split)
test_tbl <- testing(train_test_split)

table(test_tbl$Attrition)

## 
##  No Yes 
## 237  56

237/nrow(test_tbl)

## [1] 0.8088737

modelCart = rpart(Attrition ~ ., data=train_tbl, method="class")
#Plot the model
prp(modelCart)

#Predict the test data
predictionCart <- predict(modelCart, newdata=test_tbl, type="class")

#CART Accuracy
#Confusion matrix 
t1 <- table(test_tbl$Attrition, predictionCart)
print(t1)

##      predictionCart
##        No Yes
##   No  228   9
##   Yes  40  16

# Confusion Matrix info: http://cs229.stanford.edu/section/evaluation_metrics.pdf

#CART model accuracy
(t1[1]+t1[4])/(nrow(test_tbl))

## [1] 0.8327645

Need to plot ROC curve

# Random Forest model
modelRf = randomForest(Attrition ~ ., data=train_tbl, ntree = 100, mtry = 5, importance = TRUE, method="class")
print(modelRf)

## 
## Call:
##  randomForest(formula = Attrition ~ ., data = train_tbl, ntree = 100,      mtry = 5, importance = TRUE, method = "class") 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 13.34%
## Confusion matrix:
##      No Yes class.error
## No  989   7 0.007028112
## Yes 150  31 0.828729282

# Random Forest info: https://medium.com/@williamkoehrsen/random-forest-simple-explanation-377895a60d2d
# OOB estimate of  error rate: 13.34% ~

#OOB vs No. Of Trees
# Out-of-bag (OOB) error ~ a method of measuring the prediction error of random forests, 
# boosted decision trees, and other machine learning models utilizing bootstrap aggregating 
#(bagging) to sub-sample data samples used for training.
plot(modelRf, main="")
legend("topright", c("OOB", "0", "1"), text.col=1:6, lty=1:3, col=1:3)
title(main="Error Rates Random Forest")

## List the importance of the variables.
impVar <- round(randomForest::importance(modelRf), 2)
impVar[order(impVar[,3], decreasing=TRUE),]

##                             No   Yes MeanDecreaseAccuracy MeanDecreaseGini
## OverTime                  6.95  8.47                 9.75            15.72
## Age                       4.80  6.13                 7.40            19.32
## MonthlyIncome             6.12  1.85                 7.21            22.06
## TotalWorkingYears         4.92  2.90                 6.21            15.00
## JobRole                   4.38  4.14                 6.08            15.97
## MaritalStatus             4.71  4.86                 6.04             6.74
## YearsAtCompany            3.22  3.21                 4.92            12.33
## YearsWithCurrManager      4.31  2.03                 4.54             9.06
## JobLevel                  3.51  3.91                 4.49             7.04
## StockOptionLevel          2.11  4.18                 3.56             8.20
## JobSatisfaction           2.93  2.42                 3.54             8.41
## EnvironmentSatisfaction   2.54  2.57                 3.43             9.35
## BusinessTravel            2.60  2.19                 3.07             5.85
## YearsInCurrentRole        2.39  1.46                 2.96             8.07
## Department                1.62  2.74                 2.80             3.42
## NumCompaniesWorked        0.45  2.84                 1.51            10.13
## EducationField            0.84  1.21                 1.41            10.67
## DailyRate                 0.44  1.91                 1.25            16.99
## Education                 1.51 -0.55                 1.18             6.09
## YearsSinceLastPromotion   1.46 -0.90                 1.04             7.51
## WorkLifeBalance           0.47  1.15                 0.93             6.44
## HourlyRate                0.51  0.82                 0.85            14.15
## TrainingTimesLastYear     0.26  0.89                 0.59             7.27
## DistanceFromHome         -0.49  1.93                 0.54             6.25
## EmployeeNumber           -0.05  0.28                 0.17            15.12
## EmployeeCount             0.00  0.00                 0.00             0.00
## Over18                    0.00  0.00                 0.00             0.00
## StandardHours             0.00  0.00                 0.00             0.00
## RelationshipSatisfaction  0.27 -0.66                -0.03             5.89
## Gender                   -0.87  0.83                -0.46             1.85
## JobInvolvement           -0.94 -0.50                -1.04             6.20
## PercentSalaryHike        -1.90  0.92                -1.35             9.44
## PerformanceRating        -1.59 -0.57                -1.65             1.27
## MonthlyRate              -2.48  0.94                -1.82            14.06

# Most important variables of attrition
attrition_variables <- DDSA %>%
  tibble::as_tibble() %>%
  select(Attrition, OverTime,Age, MonthlyIncome) %>%
  rowid_to_column(var = "Case")
attrition_variables

## # A tibble: 1,470 x 5
##     Case Attrition OverTime   Age MonthlyIncome
##    <int> <fct>     <fct>    <dbl>         <dbl>
##  1     1 Yes       Yes        41.         5993.
##  2     2 No        No         49.         5130.
##  3     3 Yes       Yes        37.         2090.
##  4     4 No        Yes        33.         2909.
##  5     5 No        No         27.         3468.
##  6     6 No        No         32.         3068.
##  7     7 No        Yes        59.         2670.
##  8     8 No        No         30.         2693.
##  9     9 No        No         38.         9526.
## 10    10 No        No         36.         5237.
## # ... with 1,460 more rows