Job Retention Project

In this project, I will be looking at the job_retention data set from the peopleanalyticsdata package in R. I hope to examine relationships and find meaningful insights into job retention. This analysis is ideal for anyone in the HR analytics field.

# Deliberately no `rm(list = ls())`: run or knit this analysis in a fresh R
# session so it never wipes objects from the caller's workspace.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DataExplorer)
library(runway)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(peopleanalyticsdata)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(InformationValue)
## 
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, sensitivity, specificity

Looking at data and understanding data

# Take a working copy of job_retention under a shorter name.
job <- job_retention

## Looking at data
# Preview the first rows.
job %>% head()
##   gender                  field  level sentiment intention left month
## 1      M      Public/Government   High         3         8    1     1
## 2      F                Finance    Low         8         4    0    12
## 3      M Education and Training Medium         7         7    1     5
## 4      M                Finance    Low         8         4    0    12
## 5      M                Finance   High         7         6    1     1
## 6      F                 Health Medium         6        10    1     2
# Per-column summary of the table.
job %>% summary()
##     gender             field              level             sentiment     
##  Length:3770        Length:3770        Length:3770        Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 7.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 8.000  
##                                                           Mean   : 7.586  
##                                                           3rd Qu.: 9.000  
##                                                           Max.   :10.000  
##    intention           left            month       
##  Min.   : 1.000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.: 3.000   1st Qu.:0.0000   1st Qu.: 7.000  
##  Median : 4.000   Median :0.0000   Median :12.000  
##  Mean   : 4.193   Mean   :0.3592   Mean   : 9.481  
##  3rd Qu.: 6.000   3rd Qu.:1.0000   3rd Qu.:12.000  
##  Max.   :10.000   Max.   :1.0000   Max.   :12.000
# Column names, types and leading values.
glimpse(job)
## Rows: 3,770
## Columns: 7
## $ gender    <chr> "M", "F", "M", "M", "M", "F", "M", "M", "M", "M", "F", "F", …
## $ field     <chr> "Public/Government", "Finance", "Education and Training", "F…
## $ level     <chr> "High", "Low", "Medium", "Low", "High", "Medium", "Medium", …
## $ sentiment <int> 3, 8, 7, 8, 7, 6, 8, 7, 7, 7, 8, 8, 4, 7, 7, 7, 8, 7, 8, 7, …
## $ intention <int> 8, 4, 7, 4, 6, 10, 2, 9, 6, 4, 3, 9, 4, 2, 4, 2, 4, 3, 4, 4,…
## $ left      <int> 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ month     <int> 1, 12, 5, 12, 1, 2, 12, 12, 5, 12, 12, 3, 6, 8, 4, 8, 3, 12,…
## Understanding data
# Table-level overview: row/column counts, missing values, memory usage.
introduce(job)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1 3770       7                3                  4                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0          3770              26390       153784
# Automated exploratory plots of the whole table, one call per plot type.
plot_bar(job)

plot_histogram(job)

plot_qq(job)

plot_prcomp(job)

EDA

## Early EDA
# Box plots of the two numeric scores (sentiment, intention) split by each
# categorical column in turn: gender, field, level.
job %>%
  ggplot(aes(x = gender, y = sentiment, group = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Sentiment by Gender")

job %>%
  ggplot(aes(x = gender, y = intention, group = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Intention by Gender")

job %>%
  ggplot(aes(x = field, y = sentiment, group = field, fill = field)) +
  geom_boxplot() +
  ggtitle("Sentiment by Field")

job %>%
  ggplot(aes(x = field, y = intention, group = field, fill = field)) +
  geom_boxplot() +
  ggtitle("Intention by Field")

job %>%
  ggplot(aes(x = level, y = sentiment, group = level, fill = level)) +
  geom_boxplot() +
  ggtitle("Sentiment by Level")

job %>%
  ggplot(aes(x = level, y = intention, group = level, fill = level)) +
  geom_boxplot() +
  ggtitle("Intention by Level")

# Density of each score, filled by the value of `left`; kept as objects so
# they can be arranged into one figure afterwards.
a <- job %>%
  ggplot(aes(x = sentiment, fill = as.factor(left))) +
  geom_density(alpha = 0.4)
b <- job %>%
  ggplot(aes(x = intention, fill = as.factor(left))) +
  geom_density(alpha = 0.4)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Draw the two density plots together in one figure.
grid.arrange(grobs = list(a, b))

# Linear correlation of each score with the 0/1 `left` column.
with(job, cor(intention, left))
## [1] 0.2539576
with(job, cor(sentiment, left))
## [1] -0.1337085
# Share of rows taking each value of `left`.
table(job[["left"]]) / nrow(job)
## 
##         0         1 
## 0.6408488 0.3591512
## Feature Engineering
# Combine level and gender into one label such as "High Female". Rows whose
# level or gender is outside the listed values get NA.
job <- job %>%
  mutate(
    gender_and_level = if_else(
      level %in% c("Low", "Medium", "High") & gender %in% c("F", "M"),
      paste(level, c(F = "Female", M = "Male")[gender]),
      NA_character_
    )
  )

# Box plot of intention for each combined gender/level group. The title names
# the variable actually on the x axis (gender_and_level).
ggplot(job, aes(x = gender_and_level, y = intention, group = gender_and_level)) +
  geom_boxplot(aes(fill = gender_and_level)) +
  labs(title = "Intention by Gender and Level")

# Within each gender_and_level group, the proportion of rows with left == 1,
# sorted ascending. Written as a single pipeline so no global named `t` is
# created (that name would mask base::t()).
job %>%
  select(gender_and_level, left) %>%
  table() %>%
  prop.table(margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 6 x 3
##   gender_and_level left      n
##   <chr>            <chr> <dbl>
## 1 High Female      1     0.28 
## 2 High Male        1     0.321
## 3 Low Male         1     0.362
## 4 Medium Male      1     0.363
## 5 Low Female       1     0.374
## 6 Medium Female    1     0.375
# Within each level, the proportion of rows with left == 1, sorted ascending.
# Written as a single pipeline so no global named `t` is created (that name
# would mask base::t()).
job %>%
  select(level, left) %>%
  table() %>%
  prop.table(margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 3 x 3
##   level  left      n
##   <chr>  <chr> <dbl>
## 1 High   1     0.312
## 2 Low    1     0.366
## 3 Medium 1     0.366
# Within each field, the proportion of rows with left == 1, sorted ascending.
# Written as a single pipeline so no global named `t` is created (that name
# would mask base::t()).
job %>%
  select(field, left) %>%
  table() %>%
  prop.table(margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 6 x 3
##   field                  left      n
##   <chr>                  <chr> <dbl>
## 1 Education and Training 1     0.322
## 2 Sales/Marketing        1     0.345
## 3 Public/Government      1     0.356
## 4 Law                    1     0.372
## 5 Finance                1     0.393
## 6 Health                 1     0.403
# Row counts per category, with bars filled by the value of `left`.
ggplot(job, aes(x = field, fill = as.factor(left))) +
  geom_bar()

ggplot(job, aes(x = level, fill = as.factor(left))) +
  geom_bar()

ggplot(job, aes(x = gender, fill = as.factor(left))) +
  geom_bar()

## Dummy Variables
# Expand every column of `job` through caret's dummyVars() and keep the result
# as a data frame.
# NOTE(review): every level of each categorical column gets its own indicator
# (e.g. both genderF and genderM), so the indicators are linearly dependent;
# this looks like the source of the "not defined because of singularities"
# coefficients in the logistic fit. Consider a full-rank encoding - confirm.
dummyVariable <- dummyVars(~ ., data = job)
job_dummy <- job %>%
  predict(dummyVariable, newdata = .) %>%
  data.frame()

# Correlation of each gender indicator with `left`.
with(job_dummy, cor(genderF, left))
## [1] 0.00941275
with(job_dummy, cor(genderM, left))
## [1] -0.00941275
# Full correlation matrix of the encoded table, rounded to two decimals.
cors <- job_dummy %>%
  cor() %>%
  round(digits = 2)
library(corrplot)
## corrplot 0.84 loaded
# Upper-triangle correlation plot, variables ordered by hierarchical
# clustering, with black labels rotated 45 degrees.
corrplot(
  cors,
  order = "hclust",
  type = "upper",
  tl.srt = 45,
  tl.col = "black"
)

Model Building

I will focus on trying out different models and comparing their AUCs and MSEs. I will not be focusing on tuning hyperparameters at this time.

### Build the model next

# Drop `month` from the modelling table.
job_dummy <- select(job_dummy, -month)

# Fix the RNG seed so the random partitions that follow are repeatable.
set.seed(12345)


### Logistic Model

# Hold-out mean squared error of the logistic model for a range of training
# proportions.
#
# For each proportion p: partition job_dummy with createDataPartition(), fit
# left ~ . on the training rows, predict probabilities for the held-out rows
# and score them. The result is a numeric vector with one value per proportion,
# in the order listed.
MSE_vector <- vapply(
  c(.5, .55, .6, .65, .7, .75, .8, .85, .9),
  function(i) {
    print(i)
    data_idx <- createDataPartition(job_dummy$left, p = i, list = FALSE)
    train <- job_dummy[data_idx, ]
    test <- job_dummy[-data_idx, ]

    logistic_model <- glm(left ~ ., family = "binomial", data = train)
    logistic_model_predictions <- predict(
      logistic_model,
      newdata = test,
      type = "response"
    )

    # Score against test$left, which has the same length as the predictions
    # (the full job_dummy$left column would be silently recycled), and take
    # the mean so values stay comparable across test sets of different size.
    mean((test$left - logistic_model_predictions)^2)
  },
  numeric(1)
)
## [1] 0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## [1] 0.55
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in job_dummy$left - logistic_model_predictions: longer object length is
## not a multiple of shorter object length
## [1] 0.6
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.65
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.7
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.75
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.8
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## [1] 0.85
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.9
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Pair each training proportion with the value stored in MSE_vector and draw
# the relationship as a line.
a <- data.frame(
  p = seq(0.5, 0.9, by = 0.05),
  MSE = MSE_vector
)

a %>%
  ggplot(aes(x = p, y = MSE)) +
  geom_line(colour = "blue") +
  ggtitle("MSE by Varying Proportion of Data Set in Training Set")

# Final split: 70% of rows for training, the remainder held out for testing.
data_idx <- createDataPartition(job_dummy$left, p = 0.7, list = FALSE)
train <- job_dummy[data_idx, ]
test <- job_dummy[-data_idx, ]

# Logistic regression of `left` on every other column of the training set.
logistic_model <- glm(left ~ ., family = "binomial", data = train)
summary(logistic_model)
## 
## Call:
## glm(formula = left ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6047  -0.9113  -0.7529   1.2045   1.9433  
## 
## Coefficients: (7 not defined because of singularities)
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.26883    0.29177  -4.349 1.37e-05 ***
## genderF                       -0.09329    0.19984  -0.467   0.6406    
## genderM                             NA         NA      NA       NA    
## fieldEducation.and.Training    0.01103    0.15520   0.071   0.9434    
## fieldFinance                   0.23985    0.15282   1.570   0.1165    
## fieldHealth                    0.25693    0.23018   1.116   0.2643    
## fieldLaw                      -0.08885    0.26641  -0.334   0.7387    
## fieldPublic.Government         0.13727    0.17706   0.775   0.4382    
## fieldSales.Marketing                NA         NA      NA       NA    
## levelHigh                     -0.10442    0.17544  -0.595   0.5517    
## levelLow                      -0.01476    0.12506  -0.118   0.9061    
## levelMedium                         NA         NA      NA       NA    
## sentiment                     -0.05672    0.02482  -2.285   0.0223 *  
## intention                      0.24206    0.02169  11.159  < 2e-16 ***
## gender_and_levelHigh.Female   -0.52775    0.38227  -1.381   0.1674    
## gender_and_levelHigh.Male           NA         NA      NA       NA    
## gender_and_levelLow.Female     0.14200    0.22693   0.626   0.5315    
## gender_and_levelLow.Male            NA         NA      NA       NA    
## gender_and_levelMedium.Female       NA         NA      NA       NA    
## gender_and_levelMedium.Male         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3460.0  on 2638  degrees of freedom
## Residual deviance: 3267.2  on 2626  degrees of freedom
## AIC: 3293.2
## 
## Number of Fisher Scoring iterations: 4
# Information criteria of the full logistic model.
AIC(logistic_model)
## [1] 3293.179
BIC(logistic_model)
## [1] 3369.595
# Predicted probabilities for the held-out rows.
logistic_model_predictions <- predict(logistic_model, newdata = test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
hist(logistic_model_predictions)

logistic_roc <- roc(test$left, logistic_model_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(logistic_roc, col = "blue")

logistic_auc <- auc(logistic_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
logistic_mse <- sum((test$left - logistic_model_predictions)^2)


# AIC-based backward elimination starting from the full logistic model.
backwards_logistic_model <- step(logistic_model, direction = "backward")
## Start:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + levelMedium + sentiment + intention + 
##     gender_and_levelHigh.Female + gender_and_levelHigh.Male + 
##     gender_and_levelLow.Female + gender_and_levelLow.Male + gender_and_levelMedium.Female + 
##     gender_and_levelMedium.Male
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + levelMedium + sentiment + intention + 
##     gender_and_levelHigh.Female + gender_and_levelHigh.Male + 
##     gender_and_levelLow.Female + gender_and_levelLow.Male + gender_and_levelMedium.Female
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + levelMedium + sentiment + intention + 
##     gender_and_levelHigh.Female + gender_and_levelHigh.Male + 
##     gender_and_levelLow.Female + gender_and_levelLow.Male
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + levelMedium + sentiment + intention + 
##     gender_and_levelHigh.Female + gender_and_levelHigh.Male + 
##     gender_and_levelLow.Female
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + levelMedium + sentiment + intention + 
##     gender_and_levelHigh.Female + gender_and_levelLow.Female
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing + 
##     levelHigh + levelLow + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + levelHigh + 
##     levelLow + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
## 
## Step:  AIC=3293.18
## left ~ genderF + fieldEducation.and.Training + fieldFinance + 
##     fieldHealth + fieldLaw + fieldPublic.Government + levelHigh + 
##     levelLow + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
##                               Df Deviance    AIC
## - fieldEducation.and.Training  1   3267.2 3291.2
## - levelLow                     1   3267.2 3291.2
## - fieldLaw                     1   3267.3 3291.3
## - genderF                      1   3267.4 3291.4
## - levelHigh                    1   3267.5 3291.5
## - gender_and_levelLow.Female   1   3267.6 3291.6
## - fieldPublic.Government       1   3267.8 3291.8
## - fieldHealth                  1   3268.4 3292.4
## - gender_and_levelHigh.Female  1   3269.1 3293.1
## <none>                             3267.2 3293.2
## - fieldFinance                 1   3269.7 3293.7
## - sentiment                    1   3272.4 3296.4
## - intention                    1   3397.7 3421.7
## 
## Step:  AIC=3291.18
## left ~ genderF + fieldFinance + fieldHealth + fieldLaw + fieldPublic.Government + 
##     levelHigh + levelLow + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
##                               Df Deviance    AIC
## - levelLow                     1   3267.2 3289.2
## - fieldLaw                     1   3267.4 3289.4
## - genderF                      1   3267.4 3289.4
## - levelHigh                    1   3267.5 3289.5
## - gender_and_levelLow.Female   1   3267.6 3289.6
## - fieldPublic.Government       1   3268.1 3290.1
## - fieldHealth                  1   3268.8 3290.8
## - gender_and_levelHigh.Female  1   3269.1 3291.1
## <none>                             3267.2 3291.2
## - sentiment                    1   3272.4 3294.4
## - fieldFinance                 1   3273.0 3295.0
## - intention                    1   3397.7 3419.7
## 
## Step:  AIC=3289.2
## left ~ genderF + fieldFinance + fieldHealth + fieldLaw + fieldPublic.Government + 
##     levelHigh + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
##                               Df Deviance    AIC
## - fieldLaw                     1   3267.4 3287.4
## - genderF                      1   3267.4 3287.4
## - levelHigh                    1   3267.6 3287.6
## - gender_and_levelLow.Female   1   3267.6 3287.6
## - fieldPublic.Government       1   3268.2 3288.2
## - fieldHealth                  1   3268.8 3288.8
## <none>                             3267.2 3289.2
## - gender_and_levelHigh.Female  1   3269.4 3289.4
## - sentiment                    1   3272.4 3292.4
## - fieldFinance                 1   3273.0 3293.0
## - intention                    1   3398.2 3418.2
## 
## Step:  AIC=3287.37
## left ~ genderF + fieldFinance + fieldHealth + fieldPublic.Government + 
##     levelHigh + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
##                               Df Deviance    AIC
## - genderF                      1   3267.6 3285.6
## - levelHigh                    1   3267.8 3285.8
## - gender_and_levelLow.Female   1   3267.8 3285.8
## - fieldPublic.Government       1   3268.5 3286.5
## - fieldHealth                  1   3269.0 3287.0
## <none>                             3267.4 3287.4
## - gender_and_levelHigh.Female  1   3269.5 3287.5
## - sentiment                    1   3272.5 3290.5
## - fieldFinance                 1   3273.8 3291.8
## - intention                    1   3398.3 3416.3
## 
## Step:  AIC=3285.58
## left ~ fieldFinance + fieldHealth + fieldPublic.Government + 
##     levelHigh + sentiment + intention + gender_and_levelHigh.Female + 
##     gender_and_levelLow.Female
## 
##                               Df Deviance    AIC
## - gender_and_levelLow.Female   1   3267.8 3283.8
## - levelHigh                    1   3267.9 3283.9
## - fieldPublic.Government       1   3268.7 3284.7
## - fieldHealth                  1   3269.3 3285.3
## <none>                             3267.6 3285.6
## - gender_and_levelHigh.Female  1   3271.4 3287.4
## - sentiment                    1   3272.9 3288.9
## - fieldFinance                 1   3273.9 3289.9
## - intention                    1   3398.4 3414.4
## 
## Step:  AIC=3283.84
## left ~ fieldFinance + fieldHealth + fieldPublic.Government + 
##     levelHigh + sentiment + intention + gender_and_levelHigh.Female
## 
##                               Df Deviance    AIC
## - levelHigh                    1   3268.3 3282.3
## - fieldPublic.Government       1   3268.9 3282.9
## - fieldHealth                  1   3269.5 3283.5
## <none>                             3267.8 3283.8
## - gender_and_levelHigh.Female  1   3271.7 3285.7
## - sentiment                    1   3273.1 3287.1
## - fieldFinance                 1   3274.3 3288.3
## - intention                    1   3399.2 3413.2
## 
## Step:  AIC=3282.29
## left ~ fieldFinance + fieldHealth + fieldPublic.Government + 
##     sentiment + intention + gender_and_levelHigh.Female
## 
##                               Df Deviance    AIC
## - fieldPublic.Government       1   3269.4 3281.4
## - fieldHealth                  1   3269.9 3281.9
## <none>                             3268.3 3282.3
## - sentiment                    1   3273.8 3285.8
## - gender_and_levelHigh.Female  1   3274.6 3286.6
## - fieldFinance                 1   3274.6 3286.6
## - intention                    1   3400.2 3412.2
## 
## Step:  AIC=3281.39
## left ~ fieldFinance + fieldHealth + sentiment + intention + gender_and_levelHigh.Female
## 
##                               Df Deviance    AIC
## - fieldHealth                  1   3270.6 3280.6
## <none>                             3269.4 3281.4
## - fieldFinance                 1   3274.7 3284.7
## - sentiment                    1   3275.2 3285.2
## - gender_and_levelHigh.Female  1   3275.7 3285.7
## - intention                    1   3400.6 3410.6
## 
## Step:  AIC=3280.64
## left ~ fieldFinance + sentiment + intention + gender_and_levelHigh.Female
## 
##                               Df Deviance    AIC
## <none>                             3270.6 3280.6
## - fieldFinance                 1   3275.2 3283.2
## - sentiment                    1   3276.5 3284.5
## - gender_and_levelHigh.Female  1   3277.2 3285.2
## - intention                    1   3401.9 3409.9
# Coefficients and fit statistics of the model chosen by step().
backwards_logistic_model %>% summary()
## 
## Call:
## glm(formula = left ~ fieldFinance + sentiment + intention + gender_and_levelHigh.Female, 
##     family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6090  -0.9132  -0.7471   1.1982   1.9497  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.20610    0.24082  -5.008 5.49e-07 ***
## fieldFinance                 0.18754    0.08733   2.147   0.0318 *  
## sentiment                   -0.05976    0.02463  -2.427   0.0152 *  
## intention                    0.24132    0.02156  11.195  < 2e-16 ***
## gender_and_levelHigh.Female -0.72475    0.29692  -2.441   0.0147 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3460.0  on 2638  degrees of freedom
## Residual deviance: 3270.6  on 2634  degrees of freedom
## AIC: 3280.6
## 
## Number of Fisher Scoring iterations: 4
# Information criteria of the stepwise-selected logistic model.
AIC(backwards_logistic_model)
## [1] 3280.643
BIC(backwards_logistic_model)
## [1] 3310.034
# Predicted probabilities for the held-out rows.
backwards_logistic_model_predictions <- predict(backwards_logistic_model, newdata = test, type = "response")
hist(backwards_logistic_model_predictions)

backwards_logistic_roc <- roc(test$left, backwards_logistic_model_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(backwards_logistic_roc, col = "blue")

backwards_logistic_auc <- auc(backwards_logistic_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
backwards_mse <- sum((test$left - backwards_logistic_model_predictions)^2)


### Decision Tree

# rpart tree fitted through caret with cross-validation. `train` appears twice
# on purpose: caret's train() function is called with the `train` data frame.
# NOTE(review): `left` is numeric here, so caret treats this as regression
# (see the warning below) - confirm that is intended.
tree_cv <- trainControl(method = "cv")
tree_model <- train(left ~ .,
  data = train,
  method = "rpart",
  trControl = tree_cv
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
tree_model
## CART 
## 
## 2639 samples
##   19 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2376, 2375, 2375, 2375, 2375, 2375, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE       Rsquared    MAE      
##   0.004361773  0.4672688  0.05931258  0.4349772
##   0.011553336  0.4691101  0.05252829  0.4388697
##   0.049062768  0.4775932  0.03282945  0.4535270
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.004361773.
suppressMessages(library(rattle))
# Draw the final fitted tree.
fancyRpartPlot(tree_model$finalModel)

# Predictions for the held-out rows.
tree_predictions <- predict(tree_model, newdata = test)
hist(tree_predictions)

tree_roc <- roc(test$left, tree_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(tree_roc, col = "blue")

tree_auc <- auc(tree_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
tree_mse <- sum((test$left - tree_predictions)^2)


### SVM

# Polynomial-kernel SVM fitted through caret with cross-validation.
# NOTE(review): `left` is numeric here, so caret treats this as regression
# (see the warning below) - confirm that is intended.
svm_cv <- trainControl(method = "cv")
svm_model <- train(left ~ .,
  data = train,
  method = "svmPoly",
  trControl = svm_cv
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
svm_model
## Support Vector Machines with Polynomial Kernel 
## 
## 2639 samples
##   19 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2375, 2375, 2375, 2375, 2375, 2375, ... 
## Resampling results across tuning parameters:
## 
##   degree  scale  C     RMSE       Rsquared    MAE      
##   1       0.001  0.25  0.5751077  0.05314298  0.3768710
##   1       0.001  0.50  0.5751111  0.03739870  0.3768697
##   1       0.001  1.00  0.5750618  0.05551311  0.3768680
##   1       0.010  0.25  0.5749487  0.04018114  0.3768763
##   1       0.010  0.50  0.5747199  0.03857745  0.3768700
##   1       0.010  1.00  0.5742575  0.04561234  0.3768860
##   1       0.100  0.25  0.5732274  0.03783988  0.3769058
##   1       0.100  0.50  0.5732396  0.03853981  0.3768985
##   1       0.100  1.00  0.5732391  0.03421681  0.3768909
##   2       0.001  0.25  0.5749492  0.06010013  0.3768591
##   2       0.001  0.50  0.5747317  0.05713712  0.3768350
##   2       0.001  1.00  0.5742035  0.05985165  0.3767860
##   2       0.010  0.25  0.5594526  0.05669019  0.3750242
##   2       0.010  0.50  0.5464677  0.05651013  0.3733418
##   2       0.010  1.00  0.5297580  0.05604006  0.3701900
##   2       0.100  0.25  0.5192867  0.05457813  0.3657455
##   2       0.100  0.50  0.5197427  0.05375977  0.3659771
##   2       0.100  1.00  0.5199247  0.05353243  0.3659905
##   3       0.001  0.25  0.5745797  0.05844971  0.3768154
##   3       0.001  0.50  0.5738785  0.05907771  0.3767524
##   3       0.001  1.00  0.5725335  0.05922396  0.3766166
##   3       0.010  0.25  0.5388119  0.05445635  0.3721792
##   3       0.010  0.50  0.5249853  0.05487263  0.3686004
##   3       0.010  1.00  0.5204695  0.05558208  0.3669241
##   3       0.100  0.25  0.5290718  0.04242533  0.3739527
##   3       0.100  0.50  0.5310579  0.04106314  0.3757324
##   3       0.100  1.00  0.5334443  0.03895622  0.3773726
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were degree = 2, scale = 0.1 and C = 0.25.
# Predictions for the held-out rows.
svm_predictions <- predict(svm_model, newdata = test)
hist(svm_predictions)

svm_roc <- roc(test$left, svm_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(svm_roc, col = "blue")

svm_auc <- auc(svm_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
svm_mse <- sum((test$left - svm_predictions)^2)



### Lasso

# Lasso fitted through caret with cross-validation.
# NOTE(review): `left` is numeric here, so caret treats this as regression
# (see the warning below) - confirm that is intended.
lasso_cv <- trainControl(method = "cv")
lasso_model <- train(left ~ .,
  data = train,
  method = "lasso",
  trControl = lasso_cv
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
lasso_model
## The lasso 
## 
## 2639 samples
##   19 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2375, 2375, 2375, 2375, 2376, 2375, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE       Rsquared    MAE      
##   0.1       0.4721405  0.06741913  0.4497494
##   0.5       0.4658603  0.06676659  0.4344801
##   0.9       0.4653961  0.06811550  0.4314640
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
# Predictions for the held-out rows.
lasso_predictions <- predict(lasso_model, newdata = test)
hist(lasso_predictions)

lasso_roc <- roc(test$left, lasso_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(lasso_roc, col = "blue")

lasso_auc <- auc(lasso_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
lasso_mse <- sum((test$left - lasso_predictions)^2)


### Ridge

# Ridge regression fitted through caret with cross-validation.
# NOTE(review): `left` is numeric here, so caret treats this as regression
# (see the warning below) - confirm that is intended.
ridge_cv <- trainControl(method = "cv")
ridge_model <- train(left ~ .,
  data = train,
  method = "ridge",
  trControl = ridge_cv
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
ridge_model
## Ridge Regression 
## 
## 2639 samples
##   19 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2375, 2375, 2376, 2375, 2375, 2375, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared    MAE      
##   0e+00   0.4653620  0.06872936  0.4312849
##   1e-04   0.4653620  0.06872920  0.4312845
##   1e-01   0.4654241  0.06853354  0.4309016
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
# Predictions for the held-out rows.
ridge_predictions <- predict(ridge_model, newdata = test)
hist(ridge_predictions)

ridge_roc <- roc(test$left, ridge_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the curve colour as `col`; it has no `color` parameter.
plot(ridge_roc, col = "blue")

ridge_auc <- auc(ridge_roc)
# NOTE(review): despite the name, this is the SUM of squared errors over the
# test rows (no division by the row count). All models are scored the same way.
ridge_mse <- sum((test$left - ridge_predictions)^2)

# Comparison table: one row per model with its test-set AUC and its summed
# squared error (stored in the column named MSE).
model_results <- data.frame(
  Model = c(
    "Logistic", "Backwards Step", "Decision Tree",
    "SVM", "Lasso", "Ridge"
  ),
  AUC = c(
    logistic_auc, backwards_logistic_auc, tree_auc,
    svm_auc, lasso_auc, ridge_auc
  ),
  # Each position must line up with the Model vector; the Ridge entry is
  # ridge_mse, not the decision tree's value.
  MSE = c(
    logistic_mse, backwards_mse, tree_mse,
    svm_mse, lasso_mse, ridge_mse
  )
)
model_results
##            Model       AUC      MSE
## 1       Logistic 0.6481397 240.1911
## 2 Backwards Step 0.6467036 240.2120
## 3  Decision Tree 0.6362586 241.5930
## 4            SVM 0.6399986 291.7662
## 5          Lasso 0.6486666 240.0708
## 6          Ridge 0.6489834 241.5930
# AUC per model: one coloured point per model joined by a grey line.
auc <- model_results %>%
  ggplot(aes(x = Model, y = AUC, group = 1)) +
  geom_line(colour = "grey") +
  geom_point(aes(colour = Model)) +
  ggtitle("AUC By Model")

# Same chart for the MSE column, leaving out the SVM row.
mse <- model_results %>%
  filter(Model != "SVM") %>%
  ggplot(aes(x = Model, y = MSE, group = 1)) +
  geom_line(colour = "grey") +
  geom_point(aes(colour = Model)) +
  ggtitle("MSE By Model")

# Show both charts in one figure.
grid.arrange(grobs = list(auc, mse))

Conclusion

In this analysis, I compared several models against one another. I found that the lasso model performed the best on this data set. The AUC and MSE are not spectacular because the focus was on building models rather than tuning them. If I had more time to tune them, the performance would be much better. I also would not have used all the variables; I did so in this analysis for ease of comparison.

In the MSE By Model visualization, I excluded the SVM model due to its data point throwing off the Y axis.

I thoroughly enjoy the HR analytics field and hope more HR professionals look into these types of analyses.