Varun Sarma Devella, Jun Fang
12/6/2019
library(tidyverse)
library(randomForest)
library(gmodels)
library(rpart.plot)
library(rpart)
library(readr)## Parsed with column specification:
## cols(
## satisfaction_level = col_double(),
## last_evaluation = col_double(),
## number_project = col_double(),
## average_montly_hours = col_double(),
## time_spend_company = col_double(),
## Work_accident = col_double(),
## left = col_double(),
## promotion_last_5years = col_double(),
## sales = col_character(),
## salary = col_character()
## )
HR_comma_sep$salary = as.factor(HR_comma_sep$salary)
HR_comma_sep$sales = as.factor(HR_comma_sep$sales)
HR_comma_sep$number_project = as.factor(HR_comma_sep$number_project)
HR_comma_sep$Work_accident = as.factor(HR_comma_sep$Work_accident)
HR_comma_sep$promotion_last_5years = as.factor(HR_comma_sep$promotion_last_5years)left <- HR_comma_sep %>%
mutate(left = as.factor(ifelse( HR_comma_sep$left > 0 , 1, 0)) ,
random = runif(14999)) %>%
select(-HR_comma_sep$left)
train_rf <- left %>%
filter(random < .7) %>%
select(-random)
validation_rf <- left %>%
filter(random >= .7) %>%
select(-random)## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 10439 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 20878 matrix numeric
## oob.times 10439 -none- numeric
## classes 2 -none- character
## importance 32 -none- numeric
## importanceSD 24 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 10439 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## 0 1
## 3525 1035
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4560
##
##
## | validation_rf$predicted_left
## validation_rf$left | 0 | 1 | Row Total |
## -------------------|-----------|-----------|-----------|
## 0 | 3446 | 30 | 3476 |
## | 214.370 | 730.101 | |
## | 0.991 | 0.009 | 0.762 |
## | 0.978 | 0.029 | |
## | 0.756 | 0.007 | |
## -------------------|-----------|-----------|-----------|
## 1 | 79 | 1005 | 1084 |
## | 687.408 | 2341.173 | |
## | 0.073 | 0.927 | 0.238 |
## | 0.022 | 0.971 | |
## | 0.017 | 0.220 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 3525 | 1035 | 4560 |
## | 0.773 | 0.227 | |
## -------------------|-----------|-----------|-----------|
##
##
## last_evaluation number_project average_montly_hours time_spend_company
## Min. :0.3600 2:2388 Min. : 96.0 Min. : 2.000
## 1st Qu.:0.5600 3:4055 1st Qu.:156.0 1st Qu.: 3.000
## Median :0.7200 4:4365 Median :200.0 Median : 3.000
## Mean :0.7161 5:2761 Mean :201.1 Mean : 3.498
## 3rd Qu.:0.8700 6:1174 3rd Qu.:245.0 3rd Qu.: 4.000
## Max. :1.0000 7: 256 Max. :310.0 Max. :10.000
##
## Work_accident left promotion_last_5years sales
## 0:12830 0:11428 0:14680 sales :4140
## 1: 2169 1: 3571 1: 319 technical :2720
## support :2229
## IT :1227
## product_mng: 902
## marketing : 858
## (Other) :2923
## salary random
## high :1237 Min. :0.0000583
## low :7316 1st Qu.:0.2510270
## medium:6446 Median :0.4976577
## Mean :0.4984611
## 3rd Qu.:0.7485921
## Max. :0.9999918
##
## CP nsplit rel error xerror xstd
## 1 0.33189317 0 1.0000000 1.0000000 0.017269285
## 2 0.07109191 1 0.6681068 0.6681068 0.014839564
## 3 0.06559309 2 0.5970149 0.6068342 0.014266471
## 4 0.05793401 3 0.5314218 0.5184603 0.013350036
## 5 0.04929301 6 0.3476041 0.3487824 0.011202277
## 6 0.02749411 8 0.2490181 0.2513747 0.009631146
## 7 0.01728201 9 0.2215240 0.2203456 0.009052925
## 8 0.01649647 10 0.2042419 0.2050275 0.008749569
## 9 0.01178319 11 0.1877455 0.1912804 0.008465847
## 10 0.01021210 12 0.1759623 0.1849961 0.008332215
## 11 0.01000000 13 0.1657502 0.1771406 0.008161455
## last_evaluation number_project average_montly_hours time_spend_company
## Min. :0.3600 2: 668 Min. : 96.0 Min. : 2.000
## 1st Qu.:0.5600 3:1192 1st Qu.:156.0 1st Qu.: 3.000
## Median :0.7300 4:1306 Median :201.0 Median : 3.000
## Mean :0.7206 5: 819 Mean :201.2 Mean : 3.526
## 3rd Qu.:0.8700 6: 361 3rd Qu.:245.0 3rd Qu.: 4.000
## Max. :1.0000 7: 76 Max. :310.0 Max. :10.000
##
## Work_accident left promotion_last_5years sales
## 0:3762 0:3397 0:4326 sales :1248
## 1: 660 1:1025 1: 96 technical : 833
## support : 621
## IT : 363
## product_mng: 270
## RandD : 240
## (Other) : 847
## salary left_predicted
## high : 374 0:3412
## low :2124 1:1010
## medium:1924
##
##
##
##
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4422
##
##
## | val$left_predicted
## val$left | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 3294 | 103 | 3397 |
## | 172.742 | 583.560 | |
## | 0.970 | 0.030 | 0.768 |
## | 0.965 | 0.102 | |
## | 0.745 | 0.023 | |
## -------------|-----------|-----------|-----------|
## 1 | 118 | 907 | 1025 |
## | 572.492 | 1934.003 | |
## | 0.115 | 0.885 | 0.232 |
## | 0.035 | 0.898 | |
## | 0.027 | 0.205 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3412 | 1010 | 4422 |
## | 0.772 | 0.228 | |
## -------------|-----------|-----------|-----------|
##
##
fitlr1 <- glm(left ~ . , data = trainlr ,
family = binomial(link = 'logit'))
anova(fitlr1 , test = "Chisq")## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: left
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 10611 11610.4
## satisfaction_level 1 1555.01 10610 10055.4 < 2.2e-16 ***
## last_evaluation 1 16.24 10609 10039.1 5.570e-05 ***
## number_project 1 77.10 10608 9962.0 < 2.2e-16 ***
## average_montly_hours 1 56.86 10607 9905.2 4.676e-14 ***
## time_spend_company 1 123.21 10606 9782.0 < 2.2e-16 ***
## Work_accident 1 247.35 10605 9534.6 < 2.2e-16 ***
## promotion_last_5years 1 44.27 10604 9490.3 2.855e-11 ***
## salary 1 281.52 10603 9208.8 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitlr2 <- glm(left~ satisfaction_level + number_project + average_montly_hours + time_spend_company + Work_accident + salary ,
data = trainlr ,
family = binomial(link = 'logit'))
anova(fitlr2 , test = "Chisq")## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: left
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 10611 11610.4
## satisfaction_level 1 1555.01 10610 10055.4 < 2.2e-16 ***
## number_project 1 36.01 10609 10019.4 1.963e-09 ***
## average_montly_hours 1 83.45 10608 9935.9 < 2.2e-16 ***
## time_spend_company 1 129.75 10607 9806.2 < 2.2e-16 ***
## Work_accident 1 248.21 10606 9558.0 < 2.2e-16 ***
## salary 1 297.76 10605 9260.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitlr3 <- glm(left~ satisfaction_level + average_montly_hours + time_spend_company + Work_accident + salary ,
data = trainlr ,
family = binomial(link = 'logit'))
anova(fitlr3 , test = "Chisq")## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: left
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 10611 11610.4
## satisfaction_level 1 1555.01 10610 10055.4 < 2.2e-16 ***
## average_montly_hours 1 22.55 10609 10032.8 2.043e-06 ***
## time_spend_company 1 94.23 10608 9938.6 < 2.2e-16 ***
## Work_accident 1 250.81 10607 9687.8 < 2.2e-16 ***
## salary 1 297.68 10606 9390.1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vallr$logit <- predict.glm(fitlr1 , vallr)
vallr <- vallr %>%
mutate(logit = 1.2250509 + satisfaction_level * (-4.2156799) + last_evaluation * 0.6785725 + number_project * (-0.3121132) + average_montly_hours * 0.0046474 + time_spend_company * 0.2456412 + Work_accident * (-1.5052185) + promotion_last_5years * (-1.6250757) + salary * ( -0.6619373) ,
predicted_left = 1/(1+exp(-logit)))
summary(vallr$predicted_left)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002145 0.075136 0.164613 0.234124 0.343160 0.900269
num_vallr_left <- sum(vallr$left)
cutoff <- vallr %>%
arrange(desc(predicted_left)) %>%
filter(row_number() == num_vallr_left)
cutoff <- as.numeric(cutoff[ , 'predicted_left'])
vallr <- vallr %>%
mutate(predicted_left01 = ifelse(predicted_left >= cutoff ,
1 , 0))
sum(vallr$predicted_left01)## [1] 1062
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4387
##
##
## | vallr$predicted_left01
## vallr$left | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 2869 | 457 | 3326 |
## | 48.084 | 150.544 | |
## | 0.863 | 0.137 | 0.758 |
## | 0.863 | 0.430 | |
## | 0.654 | 0.104 | |
## -------------|-----------|-----------|-----------|
## 1 | 456 | 605 | 1061 |
## | 150.732 | 471.923 | |
## | 0.430 | 0.570 | 0.242 |
## | 0.137 | 0.570 | |
## | 0.104 | 0.138 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3325 | 1062 | 4387 |
## | 0.758 | 0.242 | |
## -------------|-----------|-----------|-----------|
##
##