#############################################
#Naive Bayes Classification
library(ggplot2)
library(caret)
library(tidyverse)
library(e1071)
library(rpart)
library(randomForest)
library(ROSE)
library(car)
# Short alias for suppressPackageStartupMessages(); wrapping a library() call
# in ssh(...) silences that package's startup banner.
ssh <- suppressPackageStartupMessages
# Read the raw employee data from the local CSV export.
Employee <- read.csv("C:/Users/willi/Desktop/Georgetown/RStudio Datasource/Employee_Data_Project.csv")
# Recode the target as numeric: Attrition "Yes" -> 1, any other value -> 0.
Employee1 <- mutate(Employee, Attrition = if_else(Attrition == "Yes", 1, 0))
# Drop StandardHours (noted by the original author as identical for every
# row) and the EmployeeID identifier column.
Employee1 <- subset(Employee1, select = -c(StandardHours, EmployeeID))
### EDA
# Size of the prepared data: 4410 observations, 16 variables.
dim(Employee1)
## [1] 4410 16
# 1. Split the rows into training, validation and test partitions (60:20:20).
set.seed(2890) # fixed seed so the random assignment is reproducible
# Draw one partition label (1, 2 or 3) per row with the 60/20/20 weights.
part <- sample(
  1:3,
  size = nrow(Employee1),
  replace = TRUE,
  prob = c(0.6, 0.20, 0.20)
)
# Rows labelled 1 form the training set, 2 the validation set, 3 the test set.
emp_train <- Employee1[which(part == 1), ]
emp_valid <- Employee1[which(part == 2), ]
emp_test <- Employee1[which(part == 3), ]
# Number of missing values in each column of the training partition.
vapply(emp_train, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 10 6 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 14
## JobSatisfaction
## 13
# Number of missing values in each column of the validation partition.
vapply(emp_valid, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 2 2 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 4
## JobSatisfaction
## 2
# Number of missing values in each column of the test partition.
vapply(emp_test, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 7 1 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 7
## JobSatisfaction
## 5
# Replace Missing Values
# Median imputation for the four columns that contain missing values:
# NumCompaniesWorked, TotalWorkingYears, EnvironmentSatisfaction and
# JobSatisfaction.
# The fill value is always the TRAINING-partition median. Computing a separate
# median on the validation or test rows would let hold-out data influence the
# preprocessing, and could fill the same column with different values per split.
for (impute_col in c("NumCompaniesWorked", "TotalWorkingYears",
                     "EnvironmentSatisfaction", "JobSatisfaction")) {
  # Median of the observed (non-missing) training values for this column.
  train_median <- median(emp_train[[impute_col]], na.rm = TRUE)
  emp_train[[impute_col]][is.na(emp_train[[impute_col]])] <- train_median
  emp_valid[[impute_col]][is.na(emp_valid[[impute_col]])] <- train_median
  emp_test[[impute_col]][is.na(emp_test[[impute_col]])] <- train_median
}
# Re-count missing values per training column to confirm the replacements
# were applied (every count should now be 0).
vapply(emp_train, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# Re-count missing values per validation column after imputation.
vapply(emp_valid, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# Re-count missing values per test column after imputation.
vapply(emp_test, function(column) sum(is.na(column)), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# Count empty-string entries ("") in each training column.
vapply(emp_train, function(column) sum(column == ""), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# Count empty-string entries ("") in each validation column.
vapply(emp_valid, function(column) sum(column == ""), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# Count empty-string entries ("") in each test column.
vapply(emp_test, function(column) sum(column == ""), integer(1))
## Age Attrition BusinessTravel
## 0 0 0
## DistanceFromHome Education Gender
## 0 0 0
## JobLevel MaritalStatus Income
## 0 0 0
## NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## YearsAtCompany YearsWithCurrManager EnvironmentSatisfaction
## 0 0 0
## JobSatisfaction
## 0
# 2. Clean and prepare the data.
# Print the column names, types and first values of the training partition.
emp_train %>% glimpse()
## Rows: 2,738
## Columns: 16
## $ Age <int> 51, 31, 32, 32, 46, 28, 31, 25, 45, 55, 47, 28~
## $ Attrition <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0~
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel_~
## $ DistanceFromHome <int> 6, 10, 17, 10, 8, 11, 1, 7, 17, 14, 1, 1, 1, 3~
## $ Education <int> 2, 1, 4, 1, 3, 2, 3, 4, 2, 4, 1, 3, 3, 2, 3, 3~
## $ Gender <chr> "Female", "Female", "Male", "Male", "Female", ~
## $ JobLevel <int> 1, 1, 4, 1, 4, 2, 3, 4, 2, 1, 1, 1, 2, 1, 2, 1~
## $ MaritalStatus <chr> "Married", "Single", "Married", "Single", "Mar~
## $ Income <int> 131160, 41890, 193280, 23420, 40710, 58130, 20~
## $ NumCompaniesWorked <dbl> 1, 0, 1, 4, 3, 2, 0, 1, 0, 0, 1, 1, 4, 1, 2, 1~
## $ TotalWorkingYears <dbl> 1, 6, 5, 9, 28, 5, 10, 6, 21, 37, 10, 5, 7, 3,~
## $ TrainingTimesLastYear <int> 6, 3, 2, 2, 5, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 5~
## $ YearsAtCompany <int> 1, 5, 5, 6, 7, 0, 9, 6, 20, 36, 10, 5, 5, 3, 5~
## $ YearsWithCurrManager <int> 0, 4, 3, 4, 7, 0, 8, 5, 10, 13, 9, 4, 1, 0, 2,~
## $ EnvironmentSatisfaction <dbl> 3, 3, 2, 4, 3, 1, 2, 2, 3, 4, 1, 4, 3, 4, 1, 1~
## $ JobSatisfaction <int> 4, 2, 2, 1, 2, 3, 4, 1, 4, 1, 2, 4, 4, 3, 4, 1~
# Changing Variable Types
# Changing categorical variables to factor and leaving integers.
# One helper applies the same conversion to all three partitions. Every factor
# takes its levels from the training partition, so a category that happens to
# be absent from the validation or test rows cannot shift the integer codes or
# the factor levels away from what the model is trained on.

# Convert one partition to modelling types.
#   df        - partition to convert (emp_train, emp_valid or emp_test)
#   reference - partition that defines the factor levels (the training data)
# Returns `df` with the same columns, re-typed. A value that never occurs in
# `reference` becomes NA in the converted column.
clean_employee_types <- function(df, reference) {
  # Sorted distinct values of a reference column: the level order that
  # as.factor() produces when applied to that column.
  ref_levels <- function(column) sort(unique(reference[[column]]))
  # Factor version of a `df` column using the reference levels.
  as_ref_factor <- function(column) {
    factor(df[[column]], levels = ref_levels(column))
  }
  # Numeric code (1, 2, ...) of each value among the reference levels.
  as_ref_code <- function(column) as.numeric(as_ref_factor(column))

  df %>%
    mutate(
      Attrition = factor(Attrition, levels = c(0, 1)),
      # character variables are turned into numeric level codes
      BusinessTravel = as_ref_code("BusinessTravel"),
      DistanceFromHome = as.integer(DistanceFromHome),
      Education = as_ref_factor("Education"),
      Gender = as_ref_code("Gender"),
      JobLevel = as_ref_factor("JobLevel"),
      MaritalStatus = as_ref_code("MaritalStatus"),
      Income = as.integer(Income),
      NumCompaniesWorked = as.integer(NumCompaniesWorked),
      TotalWorkingYears = as.integer(TotalWorkingYears),
      TrainingTimesLastYear = as.integer(TrainingTimesLastYear),
      YearsAtCompany = as.integer(YearsAtCompany),
      YearsWithCurrManager = as.integer(YearsWithCurrManager),
      EnvironmentSatisfaction = as_ref_factor("EnvironmentSatisfaction"),
      JobSatisfaction = as_ref_factor("JobSatisfaction")
    )
}

emp_train_cln <- clean_employee_types(emp_train, reference = emp_train)
emp_valid_cln <- clean_employee_types(emp_valid, reference = emp_train)
emp_test_cln <- clean_employee_types(emp_test, reference = emp_train)
# Print the column names, types and first values of the converted test set.
emp_test_cln %>% glimpse()
## Rows: 815
## Columns: 16
## $ Age <int> 38, 36, 53, 55, 26, 37, 35, 36, 27, 32, 47, 43~
## $ Attrition <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BusinessTravel <dbl> 1, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3~
## $ DistanceFromHome <int> 2, 28, 11, 1, 4, 9, 20, 8, 13, 7, 4, 7, 2, 10,~
## $ Education <fct> 5, 1, 4, 4, 3, 1, 2, 3, 2, 3, 3, 2, 4, 3, 2, 4~
## $ Gender <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2~
## $ JobLevel <fct> 3, 1, 2, 1, 3, 1, 1, 3, 4, 2, 2, 2, 4, 2, 1, 1~
## $ MaritalStatus <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 1, 2, 1~
## $ Income <int> 83210, 33770, 21480, 67990, 68540, 15140, 5154~
## $ NumCompaniesWorked <int> 3, 0, 3, 3, 2, 1, 0, 4, 1, 7, 4, 8, 2, 1, 1, 1~
## $ TotalWorkingYears <int> 13, 16, 21, 12, 5, 4, 12, 4, 5, 10, 21, 10, 6,~
## $ TrainingTimesLastYear <int> 5, 2, 2, 2, 5, 3, 2, 2, 5, 2, 4, 3, 6, 2, 2, 2~
## $ YearsAtCompany <int> 8, 15, 5, 10, 3, 4, 11, 1, 5, 7, 3, 1, 4, 10, ~
## $ YearsWithCurrManager <int> 5, 11, 3, 8, 2, 2, 9, 0, 4, 7, 1, 0, 2, 2, 7, ~
## $ EnvironmentSatisfaction <fct> 4, 3, 3, 2, 1, 2, 3, 3, 4, 1, 4, 3, 2, 4, 2, 3~
## $ JobSatisfaction <fct> 4, 4, 3, 4, 2, 4, 2, 3, 3, 4, 3, 1, 4, 3, 2, 4~
# Storage type of the numerically encoded Gender column.
typeof(emp_test_cln[["Gender"]])
## [1] "double"
# Remove correlated variables to ensure independence
# Remove Age and YearsAtCompany from every partition so the three data frames
# keep the same set of columns.
emp_test_cln <- subset(emp_test_cln, select = -c(Age, YearsAtCompany))
emp_train_cln <- subset(emp_train_cln, select = -c(Age, YearsAtCompany))
emp_valid_cln <- subset(emp_valid_cln, select = -c(Age, YearsAtCompany))
# Show the remaining test-set columns.
emp_test_cln %>% glimpse()
## Rows: 815
## Columns: 14
## $ Attrition <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BusinessTravel <dbl> 1, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3~
## $ DistanceFromHome <int> 2, 28, 11, 1, 4, 9, 20, 8, 13, 7, 4, 7, 2, 10,~
## $ Education <fct> 5, 1, 4, 4, 3, 1, 2, 3, 2, 3, 3, 2, 4, 3, 2, 4~
## $ Gender <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2~
## $ JobLevel <fct> 3, 1, 2, 1, 3, 1, 1, 3, 4, 2, 2, 2, 4, 2, 1, 1~
## $ MaritalStatus <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 1, 2, 1~
## $ Income <int> 83210, 33770, 21480, 67990, 68540, 15140, 5154~
## $ NumCompaniesWorked <int> 3, 0, 3, 3, 2, 1, 0, 4, 1, 7, 4, 8, 2, 1, 1, 1~
## $ TotalWorkingYears <int> 13, 16, 21, 12, 5, 4, 12, 4, 5, 10, 21, 10, 6,~
## $ TrainingTimesLastYear <int> 5, 2, 2, 2, 5, 3, 2, 2, 5, 2, 4, 3, 6, 2, 2, 2~
## $ YearsWithCurrManager <int> 5, 11, 3, 8, 2, 2, 9, 0, 4, 7, 1, 0, 2, 2, 7, ~
## $ EnvironmentSatisfaction <fct> 4, 3, 3, 2, 1, 2, 3, 3, 4, 1, 4, 3, 2, 4, 2, 3~
## $ JobSatisfaction <fct> 4, 4, 3, 4, 2, 4, 2, 3, 3, 4, 3, 1, 4, 3, 2, 4~
# Class counts of the target in the training data before resampling.
table(emp_train_cln[["Attrition"]])
##
## 0 1
## 2295 443
# Downsampling
# Fix the RNG state so the random undersampling draws the same rows each run.
set.seed(2890)
# Undersample the training data to a total of 1000 rows with
# ROSE::ovun.sample(); the resampled data frame is the `data` element of the
# returned object and replaces emp_train_cln.
emp_train_cln <- ovun.sample(
  Attrition ~ .,
  data = emp_train_cln,
  method = "under",
  N = 1000
)[["data"]]
# Class counts of the target after undersampling.
table(emp_train_cln[["Attrition"]])
##
## 0 1
## 557 443
# 3. Train the model
# Fit an e1071 naive Bayes classifier on the undersampled training data with
# Laplace smoothing of 1.
model_naive <- naiveBayes(
  x = select(emp_train_cln, -Attrition), # predictors: every column but the target
  y = emp_train_cln$Attrition,           # target: attrition class labels
  laplace = 1
)
# Predicted class label for every row of the test set.
pred_label_naive <- predict(model_naive, newdata = emp_test_cln, type = "class")
# First rows of actual vs. predicted labels, side by side.
head(data.frame(
  actual = emp_test_cln$Attrition,
  prediction = pred_label_naive
))
## actual prediction
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## 6 0 0
# Model Evaluation
# Make sure the reference labels are a factor before comparing them with the
# predicted labels.
emp_test_cln[["Attrition"]] <- as.factor(emp_test_cln[["Attrition"]])
# Confusion matrix and summary statistics for the e1071 model on the test set,
# with class "1" treated as the positive class.
mat1 <- confusionMatrix(
  data = pred_label_naive,
  reference = emp_test_cln[["Attrition"]],
  positive = "1"
)
mat1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 464 53
## 1 218 80
##
## Accuracy : 0.6675
## 95% CI : (0.634, 0.6998)
## No Information Rate : 0.8368
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.188
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.60150
## Specificity : 0.68035
## Pos Pred Value : 0.26846
## Neg Pred Value : 0.89749
## Prevalence : 0.16319
## Detection Rate : 0.09816
## Detection Prevalence : 0.36564
## Balanced Accuracy : 0.64093
##
## 'Positive' Class : 1
##
# ROC
# Per-class probabilities from the e1071 model for every test row.
prob_attrit <- predict(model_naive, newdata = emp_test_cln, type = "raw")
# Prepare the data frame for the ROC computation.
data_roc <- data.frame(
  # second probability column, used as the score of the positive class "1"
  prob = prob_attrit[, 2],
  # true label as 0/1: 1 when the test row's Attrition is "1", otherwise 0
  labels = as.numeric(emp_test_cln$Attrition == "1")
)
head(data_roc)
## prob labels
## 1 0.05658184 0
## 2 0.03436015 0
## 3 0.35047129 0
## 4 0.17859820 0
## 5 0.45145217 1
## 6 0.42141491 0
library(ROCR)
# Pair the predicted scores with the true 0/1 labels for ROCR.
naive_roc <- ROCR::prediction(
  predictions = data_roc$prob,
  labels = data_roc$labels
)
# ROC curve: true positive rate (tpr) against false positive rate (fpr).
plot(
  performance(naive_roc, measure = "tpr", x.measure = "fpr"),
  main = "ROC"
)
# Diagonal reference line y = x.
abline(a = 0, b = 1)

# AUC: values closer to 1 are better.
auc_n <- performance(naive_roc, measure = "auc")
slot(auc_n, "y.values")
## [[1]]
## [1] 0.7257954
#Plot Variable performance
#X <- caret::varImp(model_naive)
#plot(X)
# You can use oversampling to improve performance of the model (sensitivity metrics)
# Model Building
# Build the inputs for caret::train(): `x` holds the predictor columns and
# `y` holds the response (Attrition).
# The predictors are selected by NAME rather than by dropping column 1. This
# keeps the target out of `x` even if the column order of emp_train_cln ever
# changes (for example after resampling or re-ordering upstream).
x <- emp_train_cln[, setdiff(names(emp_train_cln), "Attrition"), drop = FALSE]
y <- emp_train_cln$Attrition
# Class counts of the response used for training.
summary(emp_train_cln$Attrition)
## 0 1
## 557 443
# Fit caret's "nb" naive Bayes model on x / y, estimating performance with
# 10-fold cross-validation over caret's default tuning values.
emp_model <- train(
  x,
  y,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)
emp_model
## Naive Bayes
##
## 1000 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 900, 900, 900, 900, 900, 901, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.6510033 0.2934780
## TRUE 0.6820443 0.3520298
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
# Model Evaluation
# 5. Model evaluation
# Predicted class labels of the cross-validated caret model on the test set.
emp_predict <- predict(emp_model, newdata = emp_test_cln)
# Confusion matrix with accuracy and the other summary statistics; class "1"
# is treated as the positive class.
emp_mat <- caret::confusionMatrix(
  data = emp_predict,
  reference = emp_test_cln$Attrition,
  positive = "1"
)
emp_mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 520 53
## 1 162 80
##
## Accuracy : 0.7362
## 95% CI : (0.7045, 0.7662)
## No Information Rate : 0.8368
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2737
##
## Mcnemar's Test P-Value : 1.764e-13
##
## Sensitivity : 0.60150
## Specificity : 0.76246
## Pos Pred Value : 0.33058
## Neg Pred Value : 0.90750
## Prevalence : 0.16319
## Detection Rate : 0.09816
## Detection Prevalence : 0.29693
## Balanced Accuracy : 0.68198
##
## 'Positive' Class : 1
##
# Compact structure of the test-set target (type and factor levels).
str(emp_test_cln[["Attrition"]])
## Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
# Variable importance of the cross-validated caret model, then its plot.
X1 <- caret::varImp(emp_model)
plot(X1)

# Hyperparameter Tuning
# Hyperparameter tuning
# Set up the tuning grid for caret's "nb" model:
#   usekernel - kernel density estimate (TRUE) or Gaussian (FALSE) for
#               numeric predictors
#   fL        - Laplace correction
#   adjust    - multiplier applied to the kernel bandwidth
# `adjust` starts at 1, not 0. A multiplier of 0 leaves the kernel estimate
# with zero bandwidth, and the previously recorded results below show NaN
# accuracy for every usekernel = TRUE, adjust = 0 combination.
search_grid <- expand.grid(
  usekernel = c(TRUE, FALSE),
  fL = 1:5,
  adjust = seq(1, 5, by = 1)
)
# Refit with 10-fold cross-validation over the grid; caret keeps the
# combination with the highest accuracy.
emp_model2 <- train(
  x,
  y,
  method = "nb",
  tuneGrid = search_grid,
  trControl = trainControl(method = "cv", number = 10)
)
emp_model2
## Naive Bayes
##
## 1000 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 901, 899, 900, 900, 900, ...
## Resampling results across tuning parameters:
##
## usekernel fL adjust Accuracy Kappa
## FALSE 1 0 0.6380365 0.2677943
## FALSE 1 1 0.6380365 0.2677943
## FALSE 1 2 0.6380365 0.2677943
## FALSE 1 3 0.6380365 0.2677943
## FALSE 1 4 0.6380365 0.2677943
## FALSE 1 5 0.6380365 0.2677943
## FALSE 2 0 0.6380365 0.2677943
## FALSE 2 1 0.6380365 0.2677943
## FALSE 2 2 0.6380365 0.2677943
## FALSE 2 3 0.6380365 0.2677943
## FALSE 2 4 0.6380365 0.2677943
## FALSE 2 5 0.6380365 0.2677943
## FALSE 3 0 0.6380365 0.2677943
## FALSE 3 1 0.6380365 0.2677943
## FALSE 3 2 0.6380365 0.2677943
## FALSE 3 3 0.6380365 0.2677943
## FALSE 3 4 0.6380365 0.2677943
## FALSE 3 5 0.6380365 0.2677943
## FALSE 4 0 0.6380365 0.2677943
## FALSE 4 1 0.6380365 0.2677943
## FALSE 4 2 0.6380365 0.2677943
## FALSE 4 3 0.6380365 0.2677943
## FALSE 4 4 0.6380365 0.2677943
## FALSE 4 5 0.6380365 0.2677943
## FALSE 5 0 0.6390365 0.2700068
## FALSE 5 1 0.6390365 0.2700068
## FALSE 5 2 0.6390365 0.2700068
## FALSE 5 3 0.6390365 0.2700068
## FALSE 5 4 0.6390365 0.2700068
## FALSE 5 5 0.6390365 0.2700068
## TRUE 1 0 NaN NaN
## TRUE 1 1 0.6759585 0.3388790
## TRUE 1 2 0.6759686 0.3398352
## TRUE 1 3 0.6659680 0.3204472
## TRUE 1 4 0.6490173 0.2886365
## TRUE 1 5 0.6350365 0.2647942
## TRUE 2 0 NaN NaN
## TRUE 2 1 0.6759585 0.3388790
## TRUE 2 2 0.6769587 0.3417453
## TRUE 2 3 0.6659779 0.3203873
## TRUE 2 4 0.6499973 0.2914734
## TRUE 2 5 0.6370367 0.2687890
## TRUE 3 0 NaN NaN
## TRUE 3 1 0.6769585 0.3410781
## TRUE 3 2 0.6769587 0.3417453
## TRUE 3 3 0.6649779 0.3184664
## TRUE 3 4 0.6490072 0.2896186
## TRUE 3 5 0.6380569 0.2713043
## TRUE 4 0 NaN NaN
## TRUE 4 1 0.6769585 0.3410781
## TRUE 4 2 0.6769686 0.3420612
## TRUE 4 3 0.6649779 0.3184598
## TRUE 4 4 0.6490072 0.2896186
## TRUE 4 5 0.6390569 0.2731494
## TRUE 5 0 NaN NaN
## TRUE 5 1 0.6779585 0.3432653
## TRUE 5 2 0.6809290 0.3507386
## TRUE 5 3 0.6649779 0.3184598
## TRUE 5 4 0.6490072 0.2896186
## TRUE 5 5 0.6380569 0.2709503
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 5, usekernel = TRUE and adjust
## = 2.
# Top 5 tuning-grid rows by cross-validated accuracy, best first.
emp_model2[["results"]] %>%
  top_n(n = 5, wt = Accuracy) %>%
  arrange(desc(Accuracy))
## usekernel fL adjust Accuracy Kappa AccuracySD KappaSD
## 1 TRUE 5 2 0.6809290 0.3507386 0.05436542 0.10718888
## 2 TRUE 5 1 0.6779585 0.3432653 0.05305977 0.10309941
## 3 TRUE 4 2 0.6769686 0.3420612 0.05047900 0.09914334
## 4 TRUE 2 2 0.6769587 0.3417453 0.05246130 0.10333621
## 5 TRUE 3 2 0.6769587 0.3417453 0.05246130 0.10333621
# Predicted class labels of the tuned model on the test set.
pred <- predict(emp_model2, newdata = emp_test_cln)
# Confusion matrix for the tuned model, with class "1" as the positive class.
confusionMatrix(
  data = pred,
  reference = emp_test_cln$Attrition,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 497 50
## 1 185 83
##
## Accuracy : 0.7117
## 95% CI : (0.6792, 0.7426)
## No Information Rate : 0.8368
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2505
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6241
## Specificity : 0.7287
## Pos Pred Value : 0.3097
## Neg Pred Value : 0.9086
## Prevalence : 0.1632
## Detection Rate : 0.1018
## Detection Prevalence : 0.3288
## Balanced Accuracy : 0.6764
##
## 'Positive' Class : 1
##
# Variable importance of the tuned caret model, then its plot.
X2 <- caret::varImp(emp_model2)
plot(X2)

# ROC for the tuned caret model (emp_model2)
# The class probabilities must come from emp_model2. Scoring model_naive here
# only reproduces the first ROC curve and AUC.
prob_attrit2 <- predict(emp_model2, newdata = emp_test_cln, type = "prob")
# Prepare the data frame for the ROC computation.
data_roc2 <- data.frame(
  # predicted probability of the positive class "1"
  prob = prob_attrit2[, "1"],
  # true label as 0/1: 1 when the test row's Attrition is "1", otherwise 0
  labels = as.numeric(emp_test_cln$Attrition == "1")
)
head(data_roc2)
naive_roc2 <- ROCR::prediction(data_roc2$prob, data_roc2$labels)
# ROC curve: true positive rate (tpr) against false positive rate (fpr).
library(ROCR)
plot(
  performance(naive_roc2, "tpr", "fpr"),
  main = "ROC"
)
abline(a = 0, b = 1)

# AUC: values closer to 1 are better.
auc_n2 <- performance(naive_roc2, measure = "auc")
auc_n2@y.values
# NOTE: the output previously recorded for this section (AUC 0.7257954,
# identical to the first model) was produced from model_naive. Re-run this
# section to obtain the tuned model's probabilities and AUC.