In this project, I will be looking at the job retention data set from the peopleanalyticsdata package in R. I hope to examine relationships and find meaningful insights into job retention. This analysis is ideal for anyone in the HR analytics field.
rm(list = ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DataExplorer)
library(runway)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(peopleanalyticsdata)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
job = job_retention
## Looking at data
head(job)
## gender field level sentiment intention left month
## 1 M Public/Government High 3 8 1 1
## 2 F Finance Low 8 4 0 12
## 3 M Education and Training Medium 7 7 1 5
## 4 M Finance Low 8 4 0 12
## 5 M Finance High 7 6 1 1
## 6 F Health Medium 6 10 1 2
summary(job)
## gender field level sentiment
## Length:3770 Length:3770 Length:3770 Min. : 1.000
## Class :character Class :character Class :character 1st Qu.: 7.000
## Mode :character Mode :character Mode :character Median : 8.000
## Mean : 7.586
## 3rd Qu.: 9.000
## Max. :10.000
## intention left month
## Min. : 1.000 Min. :0.0000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.: 7.000
## Median : 4.000 Median :0.0000 Median :12.000
## Mean : 4.193 Mean :0.3592 Mean : 9.481
## 3rd Qu.: 6.000 3rd Qu.:1.0000 3rd Qu.:12.000
## Max. :10.000 Max. :1.0000 Max. :12.000
job %>% glimpse()
## Rows: 3,770
## Columns: 7
## $ gender <chr> "M", "F", "M", "M", "M", "F", "M", "M", "M", "M", "F", "F", …
## $ field <chr> "Public/Government", "Finance", "Education and Training", "F…
## $ level <chr> "High", "Low", "Medium", "Low", "High", "Medium", "Medium", …
## $ sentiment <int> 3, 8, 7, 8, 7, 6, 8, 7, 7, 7, 8, 8, 4, 7, 7, 7, 8, 7, 8, 7, …
## $ intention <int> 8, 4, 7, 4, 6, 10, 2, 9, 6, 4, 3, 9, 4, 2, 4, 2, 4, 3, 4, 4,…
## $ left <int> 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ month <int> 1, 12, 5, 12, 1, 2, 12, 12, 5, 12, 12, 3, 6, 8, 4, 8, 3, 12,…
## Understanding data
job %>% introduce()
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 3770 7 3 4 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 3770 26390 153784
job %>% plot_bar()
job %>% plot_histogram()
job %>% plot_qq()
job %>% plot_prcomp()
## Early EDA
# Boxplots of the two survey scores (sentiment, intention) split by each
# categorical attribute in turn: gender, then field, then level.
job %>%
  ggplot(aes(x = gender, y = sentiment, group = gender)) +
  geom_boxplot(aes(fill = gender)) +
  ggtitle("Sentiment by Gender")
job %>%
  ggplot(aes(x = gender, y = intention, group = gender)) +
  geom_boxplot(aes(fill = gender)) +
  ggtitle("Intention by Gender")
job %>%
  ggplot(aes(x = field, y = sentiment, group = field)) +
  geom_boxplot(aes(fill = field)) +
  ggtitle("Sentiment by Field")
job %>%
  ggplot(aes(x = field, y = intention, group = field)) +
  geom_boxplot(aes(fill = field)) +
  ggtitle("Intention by Field")
job %>%
  ggplot(aes(x = level, y = sentiment, group = level)) +
  geom_boxplot(aes(fill = level)) +
  ggtitle("Sentiment by Level")
job %>%
  ggplot(aes(x = level, y = intention, group = level)) +
  geom_boxplot(aes(fill = level)) +
  ggtitle("Intention by Level")

# Overlaid densities of each score for stayers (left = 0) vs leavers
# (left = 1); stored so they can be arranged on one page afterwards.
a <- job %>%
  ggplot(aes(x = sentiment, fill = as.factor(left))) +
  geom_density(alpha = 0.4)
b <- job %>%
  ggplot(aes(x = intention, fill = as.factor(left))) +
  geom_density(alpha = 0.4)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(a, b)
cor(job$intention, job$left)
## [1] 0.2539576
cor(job$sentiment, job$left)
## [1] -0.1337085
table(job$left) / nrow(job)
##
## 0 1
## 0.6408488 0.3591512
## Feature Engineering
# Combine seniority level and gender into a single six-category label so the
# interaction between the two can be inspected as one feature.
# Any level/gender pair not listed below falls through to NA.
job <- job %>%
  mutate(gender_and_level = case_when(
    level == "Low" & gender == "F" ~ "Low Female",
    level == "Low" & gender == "M" ~ "Low Male",
    level == "Medium" & gender == "F" ~ "Medium Female",
    level == "Medium" & gender == "M" ~ "Medium Male",
    level == "High" & gender == "F" ~ "High Female",
    level == "High" & gender == "M" ~ "High Male"
  ))

# The x axis is the combined gender/level label, so the title names both
# (it used to repeat the title of the level-only boxplot).
ggplot(job, aes(x = gender_and_level, y = intention, group = gender_and_level)) +
  geom_boxplot(aes(fill = gender_and_level)) +
  labs(title = "Intention by Gender and Level")
# Share of leavers (left == 1) within each gender/level group, smallest first.
# prop.table(., 1) normalises each row, so `n` is a within-group proportion.
t <- table(select(job, gender_and_level, left))
prop.table(t, margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 6 x 3
## gender_and_level left n
## <chr> <chr> <dbl>
## 1 High Female 1 0.28
## 2 High Male 1 0.321
## 3 Low Male 1 0.362
## 4 Medium Male 1 0.363
## 5 Low Female 1 0.374
## 6 Medium Female 1 0.375
# Share of leavers (left == 1) within each seniority level, smallest first.
# Rows are normalised, so `n` is the proportion of that level who left.
t <- table(select(job, level, left))
prop.table(t, margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 3 x 3
## level left n
## <chr> <chr> <dbl>
## 1 High 1 0.312
## 2 Low 1 0.366
## 3 Medium 1 0.366
# Share of leavers (left == 1) within each field, smallest first.
# Rows are normalised, so `n` is the proportion of that field who left.
t <- table(select(job, field, left))
prop.table(t, margin = 1) %>%
  as_tibble() %>%
  filter(left == 1) %>%
  arrange(n)
## # A tibble: 6 x 3
## field left n
## <chr> <chr> <dbl>
## 1 Education and Training 1 0.322
## 2 Sales/Marketing 1 0.345
## 3 Public/Government 1 0.356
## 4 Law 1 0.372
## 5 Finance 1 0.393
## 6 Health 1 0.403
# Stacked head-counts per category, coloured by whether the person left,
# for field, level and gender respectively.
ggplot(job, aes(x = field, fill = as.factor(left))) +
  geom_bar()
ggplot(job, aes(x = level, fill = as.factor(left))) +
  geom_bar()
ggplot(job, aes(x = gender, fill = as.factor(left))) +
  geom_bar()
## Dummy Variables
# Dummy-encode the character columns of `job` with caret::dummyVars.
# As called here, every category gets its own indicator column (e.g. both
# genderF and genderM exist), so the indicators of one variable are perfectly
# collinear. That is the source of the NA coefficients and the
# "rank-deficient fit" warnings when glm() is fitted on job_dummy further down.
dummyVariable = dummyVars(~., data = job)
# data.frame() also sanitises the generated column names: spaces and "/"
# become ".", e.g. fieldEducation.and.Training, fieldPublic.Government.
job_dummy <- data.frame(predict(dummyVariable,newdata = job))
cor(job_dummy$genderF, job_dummy$left)
## [1] 0.00941275
cor(job_dummy$genderM, job_dummy$left)
## [1] -0.00941275
cors = round(cor(job_dummy),2)
library(corrplot)
## corrplot 0.84 loaded
corrplot(cors,type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
I will focus on trying out different models and comparing their AUCs and MSEs. I will not be focusing on tuning hyperparameters at this time.
### Build the model next
# Remove `month` from the modelling table so it is not used as a predictor.
# NOTE(review): presumably dropped because it encodes when the person left
# and would leak the outcome — confirm.
job_dummy = job_dummy %>% select(-month)
# Fix the RNG seed so the random train/test partitions and CV folds that
# follow are reproducible.
set.seed(12345)
### Logistic Model
# Sensitivity check: refit the full logistic model for training fractions
# from 50% to 90% and record the held-out mean squared error of each split.
# MSE_vector ends up with one entry per fraction, in the order listed.
train_fractions <- c(.5, .55, .6, .65, .7, .75, .8, .85, .9)
MSE_vector <- numeric(length(train_fractions))  # preallocate, don't grow
for (k in seq_along(train_fractions)) {
  i <- train_fractions[k]
  print(i)
  data_idx <- createDataPartition(job_dummy$left, p = i, list = FALSE)
  train <- job_dummy[data_idx, ]
  test <- job_dummy[-data_idx, ]
  logistic_model <- glm(left ~ ., family = "binomial", data = train)
  logistic_model_predictions <- predict(
    logistic_model,
    newdata = test,
    type = "response"
  )
  # Compare predictions with the outcomes of the SAME held-out rows.
  # Using job_dummy$left here would recycle the shorter prediction vector
  # across all rows and pair predictions with unrelated observations.
  # mean() rather than sum(): the test set shrinks as `i` grows, so a sum
  # would fall simply because there are fewer rows, not because of fit.
  mse <- mean((test$left - logistic_model_predictions)^2)
  MSE_vector[k] <- mse
}
## [1] 0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## [1] 0.55
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in job_dummy$left - logistic_model_predictions: longer object length is
## not a multiple of shorter object length
## [1] 0.6
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.65
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.7
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.75
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.8
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## [1] 0.85
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## longer object length is not a multiple of shorter object length
## [1] 0.9
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Line chart of the error recorded for each training fraction tried in the
# loop; `p` mirrors the fractions used there (0.50 to 0.90 in steps of 0.05).
a <- data.frame(
  p = seq(.5, .9, .05),
  MSE = MSE_vector
)
a %>%
  ggplot(aes(x = p, y = MSE)) +
  geom_line(color = "blue") +
  ggtitle("MSE by Varying Proportion of Data Set in Training Set")
# Final 70/30 split; `train` and `test` from here are the ones every model
# below is fitted and scored on.
data_idx <- createDataPartition(job_dummy$left, p = .7, list = FALSE)
train <- job_dummy[data_idx, ]
test <- job_dummy[-data_idx, ]

# Logistic regression of `left` on all remaining columns of the training set.
logistic_model <- glm(left ~ ., family = "binomial", data = train)
summary(logistic_model)
##
## Call:
## glm(formula = left ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6047 -0.9113 -0.7529 1.2045 1.9433
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.26883 0.29177 -4.349 1.37e-05 ***
## genderF -0.09329 0.19984 -0.467 0.6406
## genderM NA NA NA NA
## fieldEducation.and.Training 0.01103 0.15520 0.071 0.9434
## fieldFinance 0.23985 0.15282 1.570 0.1165
## fieldHealth 0.25693 0.23018 1.116 0.2643
## fieldLaw -0.08885 0.26641 -0.334 0.7387
## fieldPublic.Government 0.13727 0.17706 0.775 0.4382
## fieldSales.Marketing NA NA NA NA
## levelHigh -0.10442 0.17544 -0.595 0.5517
## levelLow -0.01476 0.12506 -0.118 0.9061
## levelMedium NA NA NA NA
## sentiment -0.05672 0.02482 -2.285 0.0223 *
## intention 0.24206 0.02169 11.159 < 2e-16 ***
## gender_and_levelHigh.Female -0.52775 0.38227 -1.381 0.1674
## gender_and_levelHigh.Male NA NA NA NA
## gender_and_levelLow.Female 0.14200 0.22693 0.626 0.5315
## gender_and_levelLow.Male NA NA NA NA
## gender_and_levelMedium.Female NA NA NA NA
## gender_and_levelMedium.Male NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3460.0 on 2638 degrees of freedom
## Residual deviance: 3267.2 on 2626 degrees of freedom
## AIC: 3293.2
##
## Number of Fisher Scoring iterations: 4
AIC(logistic_model)
## [1] 3293.179
BIC(logistic_model)
## [1] 3369.595
# Score the held-out rows with the full logistic model; type = "response"
# returns predicted probabilities of leaving.
logistic_model_predictions <- predict(
  logistic_model,
  newdata = test,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
hist(logistic_model_predictions)
logistic_roc <- roc(test$left, logistic_model_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(logistic_roc, col = "blue")
logistic_auc <- auc(logistic_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
logistic_mse <- sum((test$left - logistic_model_predictions)^2)
backwards_logistic_model = step(logistic_model)
## Start: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + levelMedium + sentiment + intention +
## gender_and_levelHigh.Female + gender_and_levelHigh.Male +
## gender_and_levelLow.Female + gender_and_levelLow.Male + gender_and_levelMedium.Female +
## gender_and_levelMedium.Male
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + levelMedium + sentiment + intention +
## gender_and_levelHigh.Female + gender_and_levelHigh.Male +
## gender_and_levelLow.Female + gender_and_levelLow.Male + gender_and_levelMedium.Female
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + levelMedium + sentiment + intention +
## gender_and_levelHigh.Female + gender_and_levelHigh.Male +
## gender_and_levelLow.Female + gender_and_levelLow.Male
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + levelMedium + sentiment + intention +
## gender_and_levelHigh.Female + gender_and_levelHigh.Male +
## gender_and_levelLow.Female
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + levelMedium + sentiment + intention +
## gender_and_levelHigh.Female + gender_and_levelLow.Female
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + fieldSales.Marketing +
## levelHigh + levelLow + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
##
## Step: AIC=3293.18
## left ~ genderF + genderM + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + levelHigh +
## levelLow + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
##
## Step: AIC=3293.18
## left ~ genderF + fieldEducation.and.Training + fieldFinance +
## fieldHealth + fieldLaw + fieldPublic.Government + levelHigh +
## levelLow + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
## Df Deviance AIC
## - fieldEducation.and.Training 1 3267.2 3291.2
## - levelLow 1 3267.2 3291.2
## - fieldLaw 1 3267.3 3291.3
## - genderF 1 3267.4 3291.4
## - levelHigh 1 3267.5 3291.5
## - gender_and_levelLow.Female 1 3267.6 3291.6
## - fieldPublic.Government 1 3267.8 3291.8
## - fieldHealth 1 3268.4 3292.4
## - gender_and_levelHigh.Female 1 3269.1 3293.1
## <none> 3267.2 3293.2
## - fieldFinance 1 3269.7 3293.7
## - sentiment 1 3272.4 3296.4
## - intention 1 3397.7 3421.7
##
## Step: AIC=3291.18
## left ~ genderF + fieldFinance + fieldHealth + fieldLaw + fieldPublic.Government +
## levelHigh + levelLow + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
## Df Deviance AIC
## - levelLow 1 3267.2 3289.2
## - fieldLaw 1 3267.4 3289.4
## - genderF 1 3267.4 3289.4
## - levelHigh 1 3267.5 3289.5
## - gender_and_levelLow.Female 1 3267.6 3289.6
## - fieldPublic.Government 1 3268.1 3290.1
## - fieldHealth 1 3268.8 3290.8
## - gender_and_levelHigh.Female 1 3269.1 3291.1
## <none> 3267.2 3291.2
## - sentiment 1 3272.4 3294.4
## - fieldFinance 1 3273.0 3295.0
## - intention 1 3397.7 3419.7
##
## Step: AIC=3289.2
## left ~ genderF + fieldFinance + fieldHealth + fieldLaw + fieldPublic.Government +
## levelHigh + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
## Df Deviance AIC
## - fieldLaw 1 3267.4 3287.4
## - genderF 1 3267.4 3287.4
## - levelHigh 1 3267.6 3287.6
## - gender_and_levelLow.Female 1 3267.6 3287.6
## - fieldPublic.Government 1 3268.2 3288.2
## - fieldHealth 1 3268.8 3288.8
## <none> 3267.2 3289.2
## - gender_and_levelHigh.Female 1 3269.4 3289.4
## - sentiment 1 3272.4 3292.4
## - fieldFinance 1 3273.0 3293.0
## - intention 1 3398.2 3418.2
##
## Step: AIC=3287.37
## left ~ genderF + fieldFinance + fieldHealth + fieldPublic.Government +
## levelHigh + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
## Df Deviance AIC
## - genderF 1 3267.6 3285.6
## - levelHigh 1 3267.8 3285.8
## - gender_and_levelLow.Female 1 3267.8 3285.8
## - fieldPublic.Government 1 3268.5 3286.5
## - fieldHealth 1 3269.0 3287.0
## <none> 3267.4 3287.4
## - gender_and_levelHigh.Female 1 3269.5 3287.5
## - sentiment 1 3272.5 3290.5
## - fieldFinance 1 3273.8 3291.8
## - intention 1 3398.3 3416.3
##
## Step: AIC=3285.58
## left ~ fieldFinance + fieldHealth + fieldPublic.Government +
## levelHigh + sentiment + intention + gender_and_levelHigh.Female +
## gender_and_levelLow.Female
##
## Df Deviance AIC
## - gender_and_levelLow.Female 1 3267.8 3283.8
## - levelHigh 1 3267.9 3283.9
## - fieldPublic.Government 1 3268.7 3284.7
## - fieldHealth 1 3269.3 3285.3
## <none> 3267.6 3285.6
## - gender_and_levelHigh.Female 1 3271.4 3287.4
## - sentiment 1 3272.9 3288.9
## - fieldFinance 1 3273.9 3289.9
## - intention 1 3398.4 3414.4
##
## Step: AIC=3283.84
## left ~ fieldFinance + fieldHealth + fieldPublic.Government +
## levelHigh + sentiment + intention + gender_and_levelHigh.Female
##
## Df Deviance AIC
## - levelHigh 1 3268.3 3282.3
## - fieldPublic.Government 1 3268.9 3282.9
## - fieldHealth 1 3269.5 3283.5
## <none> 3267.8 3283.8
## - gender_and_levelHigh.Female 1 3271.7 3285.7
## - sentiment 1 3273.1 3287.1
## - fieldFinance 1 3274.3 3288.3
## - intention 1 3399.2 3413.2
##
## Step: AIC=3282.29
## left ~ fieldFinance + fieldHealth + fieldPublic.Government +
## sentiment + intention + gender_and_levelHigh.Female
##
## Df Deviance AIC
## - fieldPublic.Government 1 3269.4 3281.4
## - fieldHealth 1 3269.9 3281.9
## <none> 3268.3 3282.3
## - sentiment 1 3273.8 3285.8
## - gender_and_levelHigh.Female 1 3274.6 3286.6
## - fieldFinance 1 3274.6 3286.6
## - intention 1 3400.2 3412.2
##
## Step: AIC=3281.39
## left ~ fieldFinance + fieldHealth + sentiment + intention + gender_and_levelHigh.Female
##
## Df Deviance AIC
## - fieldHealth 1 3270.6 3280.6
## <none> 3269.4 3281.4
## - fieldFinance 1 3274.7 3284.7
## - sentiment 1 3275.2 3285.2
## - gender_and_levelHigh.Female 1 3275.7 3285.7
## - intention 1 3400.6 3410.6
##
## Step: AIC=3280.64
## left ~ fieldFinance + sentiment + intention + gender_and_levelHigh.Female
##
## Df Deviance AIC
## <none> 3270.6 3280.6
## - fieldFinance 1 3275.2 3283.2
## - sentiment 1 3276.5 3284.5
## - gender_and_levelHigh.Female 1 3277.2 3285.2
## - intention 1 3401.9 3409.9
summary(backwards_logistic_model)
##
## Call:
## glm(formula = left ~ fieldFinance + sentiment + intention + gender_and_levelHigh.Female,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6090 -0.9132 -0.7471 1.1982 1.9497
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.20610 0.24082 -5.008 5.49e-07 ***
## fieldFinance 0.18754 0.08733 2.147 0.0318 *
## sentiment -0.05976 0.02463 -2.427 0.0152 *
## intention 0.24132 0.02156 11.195 < 2e-16 ***
## gender_and_levelHigh.Female -0.72475 0.29692 -2.441 0.0147 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3460.0 on 2638 degrees of freedom
## Residual deviance: 3270.6 on 2634 degrees of freedom
## AIC: 3280.6
##
## Number of Fisher Scoring iterations: 4
AIC(backwards_logistic_model)
## [1] 3280.643
BIC(backwards_logistic_model)
## [1] 3310.034
# Score the held-out rows with the stepwise-reduced logistic model;
# type = "response" returns predicted probabilities of leaving.
backwards_logistic_model_predictions <- predict(
  backwards_logistic_model,
  newdata = test,
  type = "response"
)
hist(backwards_logistic_model_predictions)
backwards_logistic_roc <- roc(test$left, backwards_logistic_model_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(backwards_logistic_roc, col = "blue")
backwards_logistic_auc <- auc(backwards_logistic_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
backwards_mse <- sum((test$left - backwards_logistic_model_predictions)^2)
### Decision Tree
# CART (rpart) fitted through caret with 10-fold cross-validation over cp.
# `left` is numeric 0/1, so caret treats this as regression (hence its
# warning below) and selects the model by RMSE.
tree_model <- train(
  left ~ .,
  data = train,
  method = "rpart",
  trControl = trainControl(method = "cv")
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
tree_model
## CART
##
## 2639 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2376, 2375, 2375, 2375, 2375, 2375, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.004361773 0.4672688 0.05931258 0.4349772
## 0.011553336 0.4691101 0.05252829 0.4388697
## 0.049062768 0.4775932 0.03282945 0.4535270
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.004361773.
suppressMessages(library(rattle))
fancyRpartPlot(tree_model$finalModel)
# Score the held-out rows with the cross-validated tree; because the tree was
# fitted as a regression, predictions are numeric scores, not class labels.
tree_predictions <- predict(tree_model, newdata = test)
hist(tree_predictions)
tree_roc <- roc(test$left, tree_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(tree_roc, col = "blue")
tree_auc <- auc(tree_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
tree_mse <- sum((test$left - tree_predictions)^2)
### SVM
# Polynomial-kernel SVM fitted through caret with 10-fold cross-validation
# over degree, scale and C. The numeric 0/1 outcome makes this a regression
# fit (caret warns about it below), selected by RMSE.
svm_model <- train(
  left ~ .,
  data = train,
  method = "svmPoly",
  trControl = trainControl(method = "cv")
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
svm_model
## Support Vector Machines with Polynomial Kernel
##
## 2639 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2375, 2375, 2375, 2375, 2375, 2375, ...
## Resampling results across tuning parameters:
##
## degree scale C RMSE Rsquared MAE
## 1 0.001 0.25 0.5751077 0.05314298 0.3768710
## 1 0.001 0.50 0.5751111 0.03739870 0.3768697
## 1 0.001 1.00 0.5750618 0.05551311 0.3768680
## 1 0.010 0.25 0.5749487 0.04018114 0.3768763
## 1 0.010 0.50 0.5747199 0.03857745 0.3768700
## 1 0.010 1.00 0.5742575 0.04561234 0.3768860
## 1 0.100 0.25 0.5732274 0.03783988 0.3769058
## 1 0.100 0.50 0.5732396 0.03853981 0.3768985
## 1 0.100 1.00 0.5732391 0.03421681 0.3768909
## 2 0.001 0.25 0.5749492 0.06010013 0.3768591
## 2 0.001 0.50 0.5747317 0.05713712 0.3768350
## 2 0.001 1.00 0.5742035 0.05985165 0.3767860
## 2 0.010 0.25 0.5594526 0.05669019 0.3750242
## 2 0.010 0.50 0.5464677 0.05651013 0.3733418
## 2 0.010 1.00 0.5297580 0.05604006 0.3701900
## 2 0.100 0.25 0.5192867 0.05457813 0.3657455
## 2 0.100 0.50 0.5197427 0.05375977 0.3659771
## 2 0.100 1.00 0.5199247 0.05353243 0.3659905
## 3 0.001 0.25 0.5745797 0.05844971 0.3768154
## 3 0.001 0.50 0.5738785 0.05907771 0.3767524
## 3 0.001 1.00 0.5725335 0.05922396 0.3766166
## 3 0.010 0.25 0.5388119 0.05445635 0.3721792
## 3 0.010 0.50 0.5249853 0.05487263 0.3686004
## 3 0.010 1.00 0.5204695 0.05558208 0.3669241
## 3 0.100 0.25 0.5290718 0.04242533 0.3739527
## 3 0.100 0.50 0.5310579 0.04106314 0.3757324
## 3 0.100 1.00 0.5334443 0.03895622 0.3773726
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were degree = 2, scale = 0.1 and C = 0.25.
# Score the held-out rows with the tuned SVM; the model was fitted as a
# regression, so predictions are numeric scores rather than class labels.
svm_predictions <- predict(svm_model, newdata = test)
hist(svm_predictions)
svm_roc <- roc(test$left, svm_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(svm_roc, col = "blue")
svm_auc <- auc(svm_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
svm_mse <- sum((test$left - svm_predictions)^2)
### Lasso
# Lasso fitted through caret with 10-fold cross-validation over `fraction`.
# The numeric 0/1 outcome makes this a regression fit (caret warns about it
# below), selected by RMSE.
lasso_model <- train(
  left ~ .,
  data = train,
  method = "lasso",
  trControl = trainControl(method = "cv")
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
lasso_model
## The lasso
##
## 2639 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2375, 2375, 2375, 2375, 2376, 2375, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 0.4721405 0.06741913 0.4497494
## 0.5 0.4658603 0.06676659 0.4344801
## 0.9 0.4653961 0.06811550 0.4314640
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
# Score the held-out rows with the tuned lasso; the model was fitted as a
# regression, so predictions are numeric scores rather than class labels.
lasso_predictions <- predict(lasso_model, newdata = test)
hist(lasso_predictions)
lasso_roc <- roc(test$left, lasso_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(lasso_roc, col = "blue")
lasso_auc <- auc(lasso_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
lasso_mse <- sum((test$left - lasso_predictions)^2)
### Ridge
# Ridge regression fitted through caret with 10-fold cross-validation over
# lambda. The numeric 0/1 outcome makes this a regression fit (caret warns
# about it below), selected by RMSE.
ridge_model <- train(
  left ~ .,
  data = train,
  method = "ridge",
  trControl = trainControl(method = "cv")
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
ridge_model
## Ridge Regression
##
## 2639 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2375, 2375, 2376, 2375, 2375, 2375, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 0.4653620 0.06872936 0.4312849
## 1e-04 0.4653620 0.06872920 0.4312845
## 1e-01 0.4654241 0.06853354 0.4309016
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.
# Score the held-out rows with the tuned ridge model; the model was fitted as
# a regression, so predictions are numeric scores rather than class labels.
ridge_predictions <- predict(ridge_model, newdata = test)
hist(ridge_predictions)
ridge_roc <- roc(test$left, ridge_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# plot.roc() takes the line colour as `col`; `color` is not one of its
# arguments, so it was passed through `...` instead of colouring the curve.
plot(ridge_roc, col = "blue")
ridge_auc <- auc(ridge_roc)
# Sum of squared errors over the test rows (kept as a sum so it stays
# comparable with the other *_mse values in the model comparison table).
ridge_mse <- sum((test$left - ridge_predictions)^2)
# NOTE(review): this repeats the ridge fit performed earlier and overwrites
# ridge_model with a fresh cross-validated fit. ridge_predictions, ridge_auc
# and ridge_mse were all computed from the earlier fit, and nothing visible
# after this point reads ridge_model, so this looks like a leftover that only
# costs run time — consider removing.
ridge_model = train(left ~ .,
data = train, method = "ridge",
trControl = trainControl(method = "cv"))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Side-by-side comparison of every fitted model on the shared test set.
# AUC comes from each model's ROC curve; the MSE column holds the squared-error
# totals stored in the corresponding *_mse variables.
# Values are listed in the same order as the Model labels.
model_results <- data.frame(
  Model = c(
    "Logistic", "Backwards Step",
    "Decision Tree", "SVM", "Lasso", "Ridge"
  ),
  AUC = c(
    logistic_auc, backwards_logistic_auc, tree_auc,
    svm_auc, lasso_auc, ridge_auc
  ),
  # The Ridge row must use ridge_mse; it previously repeated tree_mse, which
  # made the Ridge and Decision Tree rows report the same error.
  MSE = c(
    logistic_mse, backwards_mse, tree_mse,
    svm_mse, lasso_mse, ridge_mse
  )
)
model_results
## Model AUC MSE
## 1 Logistic 0.6481397 240.1911
## 2 Backwards Step 0.6467036 240.2120
## 3 Decision Tree 0.6362586 241.5930
## 4 SVM 0.6399986 291.7662
## 5 Lasso 0.6486666 240.0708
## 6 Ridge 0.6489834 241.5930
# AUC per model: grey connecting line with one coloured point per model.
auc <- model_results %>%
  ggplot(aes(x = Model, y = AUC, group = 1)) +
  geom_line(color = "grey") +
  geom_point(aes(color = Model)) +
  ggtitle("AUC By Model")

# Same layout for the error column, with the SVM row filtered out first.
mse <- model_results %>%
  filter(Model != "SVM") %>%
  ggplot(aes(x = Model, y = MSE, group = 1)) +
  geom_line(color = "grey") +
  geom_point(aes(color = Model)) +
  ggtitle("MSE By Model")

# Stack the two charts on one page.
grid.arrange(auc, mse)
In this analysis, I compared several models against one another. I found that the lasso model performed the best on this data set. The AUC and MSE are not spectacular because the focus was on building models rather than tuning them. With more time, the performance would be much better. I also would not have used all of the variables; I did so in this analysis for ease of comparison.
In the MSE By Model visualization, I excluded the SVM model due to its data point throwing off the Y axis.
I thoroughly enjoy the HR analytics field and hope more HR professionals look into these types of analyses.