ML_2_Project_2

#############################################
#Naive Bayes Classification 
library(ggplot2)
library(caret)
library(tidyverse)
library(e1071)
library(rpart)
library(randomForest)
library(ROSE)
library(car)

# Short alias for suppressPackageStartupMessages(), e.g. ssh(library(pkg)).
# It is only defined here; none of the library() calls visible in this script
# are wrapped in it.
ssh <- suppressPackageStartupMessages

# Read the raw employee records from the local CSV export.
Employee <- read.csv("C:/Users/willi/Desktop/Georgetown/RStudio Datasource/Employee_Data_Project.csv")

# Recode the target as numeric: "Yes" becomes 1, every other value becomes 0.
Employee1 <- mutate(Employee, Attrition = if_else(Attrition == "Yes", 1, 0))

# Drop StandardHours (noted as identical for every row) and EmployeeID.
Employee1 <- subset(Employee1, select = -c(StandardHours, EmployeeID))
  
###EDA
dim(Employee1) #4410 Observations, 16 variables

## [1] 4410   16

#1. Data splitting into training, validation and test sets (60:20:20)

# Fixed seed so the random partition below is reproducible.
set.seed(2890)

# Tag every row with a partition id drawn with probabilities 0.6 / 0.2 / 0.2
# (1 = train, 2 = validation, 3 = test). Because ids are sampled
# independently, the split sizes are approximate rather than exact.
part <- sample(
  seq_len(3),
  size = nrow(Employee1),
  replace = TRUE,
  prob = c(0.6, 0.20, 0.20)
)

# Materialise the three splits from the original data frame.
emp_train <- Employee1[which(part == 1), ]
emp_valid <- Employee1[which(part == 2), ]
emp_test <- Employee1[which(part == 3), ]

# Number of missing values in each column of the training split.
vapply(emp_train, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                      10                       6                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                      14 
##         JobSatisfaction 
##                      13

# Number of missing values in each column of the validation split.
vapply(emp_valid, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       2                       2                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       4 
##         JobSatisfaction 
##                       2

# Number of missing values in each column of the test split.
vapply(emp_test, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       7                       1                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       7 
##         JobSatisfaction 
##                       5

Replace Missing Values

# Median-impute the four columns that contain missing values.
#
# The median is computed on the TRAINING split only and then applied to the
# validation and test splits as well. Imputing each held-out split with its
# own median would let statistics of the evaluation data shape its
# preprocessing and would make the three splits use different fill values.
impute_cols <- c(
  "NumCompaniesWorked",
  "TotalWorkingYears",
  "EnvironmentSatisfaction",
  "JobSatisfaction"
)

for (impute_col in impute_cols) {
  train_median <- median(emp_train[[impute_col]], na.rm = TRUE)

  emp_train[[impute_col]][is.na(emp_train[[impute_col]])] <- train_median
  emp_valid[[impute_col]][is.na(emp_valid[[impute_col]])] <- train_median
  emp_test[[impute_col]][is.na(emp_test[[impute_col]])] <- train_median
}


#Check again to make sure all replacements are done
# Training split: every count should now be zero.
vapply(emp_train, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

# Validation split: every count should now be zero.
vapply(emp_valid, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

# Test split: every count should now be zero.
vapply(emp_test, function(column) sum(is.na(column)), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

#Check for values that are blank

# Number of empty-string values in each column of the training split.
vapply(emp_train, function(column) sum(column == ""), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

# Number of empty-string values in each column of the validation split.
vapply(emp_valid, function(column) sum(column == ""), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

# Number of empty-string values in each column of the test split.
vapply(emp_test, function(column) sum(column == ""), integer(1))

##                     Age               Attrition          BusinessTravel 
##                       0                       0                       0 
##        DistanceFromHome               Education                  Gender 
##                       0                       0                       0 
##                JobLevel           MaritalStatus                  Income 
##                       0                       0                       0 
##      NumCompaniesWorked       TotalWorkingYears   TrainingTimesLastYear 
##                       0                       0                       0 
##          YearsAtCompany    YearsWithCurrManager EnvironmentSatisfaction 
##                       0                       0                       0 
##         JobSatisfaction 
##                       0

#2. Clean and prepare the data.

glimpse(emp_train)

## Rows: 2,738
## Columns: 16
## $ Age                     <int> 51, 31, 32, 32, 46, 28, 31, 25, 45, 55, 47, 28~
## $ Attrition               <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0~
## $ BusinessTravel          <chr> "Travel_Rarely", "Travel_Frequently", "Travel_~
## $ DistanceFromHome        <int> 6, 10, 17, 10, 8, 11, 1, 7, 17, 14, 1, 1, 1, 3~
## $ Education               <int> 2, 1, 4, 1, 3, 2, 3, 4, 2, 4, 1, 3, 3, 2, 3, 3~
## $ Gender                  <chr> "Female", "Female", "Male", "Male", "Female", ~
## $ JobLevel                <int> 1, 1, 4, 1, 4, 2, 3, 4, 2, 1, 1, 1, 2, 1, 2, 1~
## $ MaritalStatus           <chr> "Married", "Single", "Married", "Single", "Mar~
## $ Income                  <int> 131160, 41890, 193280, 23420, 40710, 58130, 20~
## $ NumCompaniesWorked      <dbl> 1, 0, 1, 4, 3, 2, 0, 1, 0, 0, 1, 1, 4, 1, 2, 1~
## $ TotalWorkingYears       <dbl> 1, 6, 5, 9, 28, 5, 10, 6, 21, 37, 10, 5, 7, 3,~
## $ TrainingTimesLastYear   <int> 6, 3, 2, 2, 5, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 5~
## $ YearsAtCompany          <int> 1, 5, 5, 6, 7, 0, 9, 6, 20, 36, 10, 5, 5, 3, 5~
## $ YearsWithCurrManager    <int> 0, 4, 3, 4, 7, 0, 8, 5, 10, 13, 9, 4, 1, 0, 2,~
## $ EnvironmentSatisfaction <dbl> 3, 3, 2, 4, 3, 1, 2, 2, 3, 4, 1, 4, 3, 4, 1, 1~
## $ JobSatisfaction         <int> 4, 2, 2, 1, 2, 3, 4, 1, 4, 1, 2, 4, 4, 3, 4, 1~

Changing Variable Types

# Convert column types identically for the train, validation and test splits.
#
# Category levels are collected once from all three splits and reused, so a
# given category always maps to the same factor level / numeric code even if
# one split happens not to contain it. Calling as.factor() separately per
# split would derive the levels from each split's own values.
emp_category_cols <- c(
  "Attrition", "BusinessTravel", "Education", "Gender", "JobLevel",
  "MaritalStatus", "EnvironmentSatisfaction", "JobSatisfaction"
)
emp_category_levels <- lapply(
  rbind(emp_train, emp_valid, emp_test)[emp_category_cols],
  function(values) sort(unique(values))
)

# Apply the project's column typing to one split.
#
# df             One split of the employee data (raw column types).
# levels_by_col  Named list of level vectors, one entry per categorical
#                column; defaults to the levels shared by all splits.
#
# Returns df with:
#   - Attrition, Education, JobLevel, EnvironmentSatisfaction and
#     JobSatisfaction as factors;
#   - BusinessTravel, Gender and MaritalStatus as the numeric code of their
#     factor level;
#   - the remaining listed count/measure columns coerced to integer.
convert_employee_types <- function(df, levels_by_col = emp_category_levels) {
  # Numeric code of each value's position in the shared level vector.
  level_code <- function(values, col_name) {
    as.numeric(factor(values, levels = levels_by_col[[col_name]]))
  }

  df %>%
    mutate(
      Attrition = factor(Attrition, levels = levels_by_col[["Attrition"]]),
      BusinessTravel = level_code(BusinessTravel, "BusinessTravel"),
      DistanceFromHome = as.integer(DistanceFromHome),
      Education = factor(Education, levels = levels_by_col[["Education"]]),
      Gender = level_code(Gender, "Gender"),
      JobLevel = factor(JobLevel, levels = levels_by_col[["JobLevel"]]),
      MaritalStatus = level_code(MaritalStatus, "MaritalStatus"),
      Income = as.integer(Income),
      NumCompaniesWorked = as.integer(NumCompaniesWorked),
      TotalWorkingYears = as.integer(TotalWorkingYears),
      TrainingTimesLastYear = as.integer(TrainingTimesLastYear),
      YearsAtCompany = as.integer(YearsAtCompany),
      YearsWithCurrManager = as.integer(YearsWithCurrManager),
      EnvironmentSatisfaction = factor(
        EnvironmentSatisfaction,
        levels = levels_by_col[["EnvironmentSatisfaction"]]
      ),
      JobSatisfaction = factor(
        JobSatisfaction,
        levels = levels_by_col[["JobSatisfaction"]]
      )
    )
}

emp_train_cln <- convert_employee_types(emp_train)
emp_valid_cln <- convert_employee_types(emp_valid)
emp_test_cln <- convert_employee_types(emp_test)

glimpse(emp_test_cln)

## Rows: 815
## Columns: 16
## $ Age                     <int> 38, 36, 53, 55, 26, 37, 35, 36, 27, 32, 47, 43~
## $ Attrition               <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BusinessTravel          <dbl> 1, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3~
## $ DistanceFromHome        <int> 2, 28, 11, 1, 4, 9, 20, 8, 13, 7, 4, 7, 2, 10,~
## $ Education               <fct> 5, 1, 4, 4, 3, 1, 2, 3, 2, 3, 3, 2, 4, 3, 2, 4~
## $ Gender                  <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2~
## $ JobLevel                <fct> 3, 1, 2, 1, 3, 1, 1, 3, 4, 2, 2, 2, 4, 2, 1, 1~
## $ MaritalStatus           <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 1, 2, 1~
## $ Income                  <int> 83210, 33770, 21480, 67990, 68540, 15140, 5154~
## $ NumCompaniesWorked      <int> 3, 0, 3, 3, 2, 1, 0, 4, 1, 7, 4, 8, 2, 1, 1, 1~
## $ TotalWorkingYears       <int> 13, 16, 21, 12, 5, 4, 12, 4, 5, 10, 21, 10, 6,~
## $ TrainingTimesLastYear   <int> 5, 2, 2, 2, 5, 3, 2, 2, 5, 2, 4, 3, 6, 2, 2, 2~
## $ YearsAtCompany          <int> 8, 15, 5, 10, 3, 4, 11, 1, 5, 7, 3, 1, 4, 10, ~
## $ YearsWithCurrManager    <int> 5, 11, 3, 8, 2, 2, 9, 0, 4, 7, 1, 0, 2, 2, 7, ~
## $ EnvironmentSatisfaction <fct> 4, 3, 3, 2, 1, 2, 3, 3, 4, 1, 4, 3, 2, 4, 2, 3~
## $ JobSatisfaction         <fct> 4, 4, 3, 4, 2, 4, 2, 3, 3, 4, 3, 1, 4, 3, 2, 4~

typeof(emp_test_cln$Gender)

## [1] "double"

Remove correlated variables to ensure independence

# Remove Age and YearsAtCompany from every split so that all three keep the
# same set of columns.
emp_test_cln <- subset(emp_test_cln, select = -c(Age, YearsAtCompany))

emp_train_cln <- subset(emp_train_cln, select = -c(Age, YearsAtCompany))

emp_valid_cln <- subset(emp_valid_cln, select = -c(Age, YearsAtCompany))

glimpse(emp_test_cln)

## Rows: 815
## Columns: 14
## $ Attrition               <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BusinessTravel          <dbl> 1, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3~
## $ DistanceFromHome        <int> 2, 28, 11, 1, 4, 9, 20, 8, 13, 7, 4, 7, 2, 10,~
## $ Education               <fct> 5, 1, 4, 4, 3, 1, 2, 3, 2, 3, 3, 2, 4, 3, 2, 4~
## $ Gender                  <dbl> 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2~
## $ JobLevel                <fct> 3, 1, 2, 1, 3, 1, 1, 3, 4, 2, 2, 2, 4, 2, 1, 1~
## $ MaritalStatus           <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 1, 2, 1~
## $ Income                  <int> 83210, 33770, 21480, 67990, 68540, 15140, 5154~
## $ NumCompaniesWorked      <int> 3, 0, 3, 3, 2, 1, 0, 4, 1, 7, 4, 8, 2, 1, 1, 1~
## $ TotalWorkingYears       <int> 13, 16, 21, 12, 5, 4, 12, 4, 5, 10, 21, 10, 6,~
## $ TrainingTimesLastYear   <int> 5, 2, 2, 2, 5, 3, 2, 2, 5, 2, 4, 3, 6, 2, 2, 2~
## $ YearsWithCurrManager    <int> 5, 11, 3, 8, 2, 2, 9, 0, 4, 7, 1, 0, 2, 2, 7, ~
## $ EnvironmentSatisfaction <fct> 4, 3, 3, 2, 1, 2, 3, 3, 4, 1, 4, 3, 2, 4, 2, 3~
## $ JobSatisfaction         <fct> 4, 4, 3, 4, 2, 4, 2, 3, 3, 4, 3, 1, 4, 3, 2, 4~

table(emp_train_cln$Attrition)

## 
##    0    1 
## 2295  443

Downsampling

# Rebalance the training split with ROSE undersampling to N = 1000 rows in
# total; the seed makes the sampled rows reproducible.
set.seed(2890)
emp_train_balanced <- ovun.sample(
  Attrition ~ .,
  data = emp_train_cln,
  method = "under",
  N = 1000
)
emp_train_cln <- emp_train_balanced$data

# Class counts after rebalancing.
table(emp_train_cln$Attrition)

## 
##   0   1 
## 557 443

#3. Train the MODEL

# Fit a naive Bayes classifier on all training columns except the target,
# with Laplace smoothing of 1.
nb_predictors <- emp_train_cln[, names(emp_train_cln) != "Attrition", drop = FALSE]
model_naive <- naiveBayes(
  x = nb_predictors,
  y = emp_train_cln$Attrition,
  laplace = 1
)

# Predicted class labels for the test split, shown next to the true labels.
pred_label_naive <- predict(model_naive, newdata = emp_test_cln, type = "class")
actual_vs_predicted <- data.frame(
  actual = emp_test_cln$Attrition,
  prediction = pred_label_naive
)
head(actual_vs_predicted)

##   actual prediction
## 1      0          0
## 2      0          0
## 3      0          0
## 4      0          0
## 5      1          0
## 6      0          0

Model Evaluation

# confusionMatrix() needs the reference labels as a factor; "1" (attrition)
# is treated as the positive class.
emp_test_cln$Attrition <- as.factor(emp_test_cln$Attrition)
mat1 <- confusionMatrix(
  data = pred_label_naive,
  reference = emp_test_cln$Attrition,
  positive = "1"
)
mat1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 464  53
##          1 218  80
##                                          
##                Accuracy : 0.6675         
##                  95% CI : (0.634, 0.6998)
##     No Information Rate : 0.8368         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.188          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.60150        
##             Specificity : 0.68035        
##          Pos Pred Value : 0.26846        
##          Neg Pred Value : 0.89749        
##              Prevalence : 0.16319        
##          Detection Rate : 0.09816        
##    Detection Prevalence : 0.36564        
##       Balanced Accuracy : 0.64093        
##                                          
##        'Positive' Class : 1              
##

# ROC inputs for the e1071 naive Bayes model

# Class probabilities for every test row.
prob_attrit <- predict(model_naive, newdata = emp_test_cln, type = "raw")

# Pair the second probability column (class "1", attrition) with a 0/1
# indicator of whether the employee actually left.
attrition_prob <- prob_attrit[, 2]
attrition_flag <- as.numeric(emp_test_cln$Attrition == "1")
data_roc <- data.frame(prob = attrition_prob, labels = attrition_flag)
head(data_roc)

##         prob labels
## 1 0.05658184      0
## 2 0.03436015      0
## 3 0.35047129      0
## 4 0.17859820      0
## 5 0.45145217      1
## 6 0.42141491      0

library(ROCR)

# Wrap scores and true labels in a ROCR prediction object.
naive_roc <- ROCR::prediction(data_roc$prob, data_roc$labels)

# ROC curve: true positive rate against false positive rate, with the
# diagonal drawn as the chance-level reference.
naive_roc_curve <- performance(naive_roc, measure = "tpr", x.measure = "fpr")
plot(naive_roc_curve, main = "ROC")
abline(a = 0, b = 1)

# Area under the curve; values closer to 1 are better.
auc_n <- performance(naive_roc, measure = "auc")
auc_n@y.values

## [[1]]
## [1] 0.7257954

#Plot Variable performance
#X <- caret::varImp(model_naive)
#plot(X)

# You can use oversampling to improve performance of the model (sensitivity metrics)

Model Building

#For comparing the outcome of the training and testing phase let's create separate variables that store the value of the response variable:
#create objects x which holds the predictor variables and y which holds the response variables
# Predictors: every column except the Attrition target. The target is removed
# by name so this does not depend on Attrition being the first column.
x <- emp_train_cln[, setdiff(names(emp_train_cln), "Attrition"), drop = FALSE]
# Response: the Attrition factor.
y <- emp_train_cln$Attrition

summary(emp_train_cln$Attrition)

##   0   1 
## 557 443

# Naive Bayes through caret, evaluated with 10-fold cross-validation.
cv_control <- trainControl(method = "cv", number = 10)
emp_model <- train(x = x, y = y, method = "nb", trControl = cv_control)
emp_model

## Naive Bayes 
## 
## 1000 samples
##   13 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 900, 900, 900, 900, 900, 901, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.6510033  0.2934780
##    TRUE      0.6820443  0.3520298
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
##  = 1.

Model Evaluation

#5. MODEL EVALUATION
# Class predictions of the cross-validated model on the test split.
emp_predict <- predict(emp_model, newdata = emp_test_cln)

# Confusion matrix and derived statistics, with "1" (attrition) as the
# positive class.
emp_mat <- caret::confusionMatrix(
  data = emp_predict,
  reference = emp_test_cln$Attrition,
  positive = "1"
)
emp_mat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 520  53
##          1 162  80
##                                           
##                Accuracy : 0.7362          
##                  95% CI : (0.7045, 0.7662)
##     No Information Rate : 0.8368          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2737          
##                                           
##  Mcnemar's Test P-Value : 1.764e-13       
##                                           
##             Sensitivity : 0.60150         
##             Specificity : 0.76246         
##          Pos Pred Value : 0.33058         
##          Neg Pred Value : 0.90750         
##              Prevalence : 0.16319         
##          Detection Rate : 0.09816         
##    Detection Prevalence : 0.29693         
##       Balanced Accuracy : 0.68198         
##                                           
##        'Positive' Class : 1               
##

str(emp_test_cln$Attrition)

##  Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...

# Variable importance for the cross-validated caret model, then plotted.
X1 <- caret::varImp(emp_model)
plot(X1)

Hyperparameter Tuning

# Hyperparameter tuning
#
# Grid over the three 'nb' tuning parameters:
#   usekernel - kernel density estimate (TRUE) or not (FALSE)
#   fL        - Laplace correction
#   adjust    - bandwidth multiplier for the kernel density estimate
#
# adjust starts at 1: a value of 0 is not a usable bandwidth and makes every
# usekernel = TRUE fit return NaN accuracy, which only wastes resampling time.
search_grid <- expand.grid(
  usekernel = c(TRUE, FALSE),
  fL = 1:5,
  adjust = seq(1, 5, by = 1)
)

# Same 10-fold cross-validation as the untuned model, now over the grid.
emp_model2 <- train(
  x,
  y,
  method = "nb",
  tuneGrid = search_grid,
  trControl = trainControl(method = "cv", number = 10)
)
emp_model2

## Naive Bayes 
## 
## 1000 samples
##   13 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 901, 899, 900, 900, 900, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  fL  adjust  Accuracy   Kappa    
##   FALSE      1   0       0.6380365  0.2677943
##   FALSE      1   1       0.6380365  0.2677943
##   FALSE      1   2       0.6380365  0.2677943
##   FALSE      1   3       0.6380365  0.2677943
##   FALSE      1   4       0.6380365  0.2677943
##   FALSE      1   5       0.6380365  0.2677943
##   FALSE      2   0       0.6380365  0.2677943
##   FALSE      2   1       0.6380365  0.2677943
##   FALSE      2   2       0.6380365  0.2677943
##   FALSE      2   3       0.6380365  0.2677943
##   FALSE      2   4       0.6380365  0.2677943
##   FALSE      2   5       0.6380365  0.2677943
##   FALSE      3   0       0.6380365  0.2677943
##   FALSE      3   1       0.6380365  0.2677943
##   FALSE      3   2       0.6380365  0.2677943
##   FALSE      3   3       0.6380365  0.2677943
##   FALSE      3   4       0.6380365  0.2677943
##   FALSE      3   5       0.6380365  0.2677943
##   FALSE      4   0       0.6380365  0.2677943
##   FALSE      4   1       0.6380365  0.2677943
##   FALSE      4   2       0.6380365  0.2677943
##   FALSE      4   3       0.6380365  0.2677943
##   FALSE      4   4       0.6380365  0.2677943
##   FALSE      4   5       0.6380365  0.2677943
##   FALSE      5   0       0.6390365  0.2700068
##   FALSE      5   1       0.6390365  0.2700068
##   FALSE      5   2       0.6390365  0.2700068
##   FALSE      5   3       0.6390365  0.2700068
##   FALSE      5   4       0.6390365  0.2700068
##   FALSE      5   5       0.6390365  0.2700068
##    TRUE      1   0             NaN        NaN
##    TRUE      1   1       0.6759585  0.3388790
##    TRUE      1   2       0.6759686  0.3398352
##    TRUE      1   3       0.6659680  0.3204472
##    TRUE      1   4       0.6490173  0.2886365
##    TRUE      1   5       0.6350365  0.2647942
##    TRUE      2   0             NaN        NaN
##    TRUE      2   1       0.6759585  0.3388790
##    TRUE      2   2       0.6769587  0.3417453
##    TRUE      2   3       0.6659779  0.3203873
##    TRUE      2   4       0.6499973  0.2914734
##    TRUE      2   5       0.6370367  0.2687890
##    TRUE      3   0             NaN        NaN
##    TRUE      3   1       0.6769585  0.3410781
##    TRUE      3   2       0.6769587  0.3417453
##    TRUE      3   3       0.6649779  0.3184664
##    TRUE      3   4       0.6490072  0.2896186
##    TRUE      3   5       0.6380569  0.2713043
##    TRUE      4   0             NaN        NaN
##    TRUE      4   1       0.6769585  0.3410781
##    TRUE      4   2       0.6769686  0.3420612
##    TRUE      4   3       0.6649779  0.3184598
##    TRUE      4   4       0.6490072  0.2896186
##    TRUE      4   5       0.6390569  0.2731494
##    TRUE      5   0             NaN        NaN
##    TRUE      5   1       0.6779585  0.3432653
##    TRUE      5   2       0.6809290  0.3507386
##    TRUE      5   3       0.6649779  0.3184598
##    TRUE      5   4       0.6490072  0.2896186
##    TRUE      5   5       0.6380569  0.2709503
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 5, usekernel = TRUE and adjust
##  = 2.

# Five best parameter combinations by cross-validated accuracy, best first.
top_five_models <- top_n(emp_model2$results, 5, wt = Accuracy)
arrange(top_five_models, desc(Accuracy))

##   usekernel fL adjust  Accuracy     Kappa AccuracySD    KappaSD
## 1      TRUE  5      2 0.6809290 0.3507386 0.05436542 0.10718888
## 2      TRUE  5      1 0.6779585 0.3432653 0.05305977 0.10309941
## 3      TRUE  4      2 0.6769686 0.3420612 0.05047900 0.09914334
## 4      TRUE  2      2 0.6769587 0.3417453 0.05246130 0.10333621
## 5      TRUE  3      2 0.6769587 0.3417453 0.05246130 0.10333621

# Test-set predictions of the tuned model and their confusion matrix, with
# "1" (attrition) as the positive class.
pred <- predict(emp_model2, newdata = emp_test_cln)
confusionMatrix(
  data = pred,
  reference = emp_test_cln$Attrition,
  positive = "1"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 497  50
##          1 185  83
##                                           
##                Accuracy : 0.7117          
##                  95% CI : (0.6792, 0.7426)
##     No Information Rate : 0.8368          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2505          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6241          
##             Specificity : 0.7287          
##          Pos Pred Value : 0.3097          
##          Neg Pred Value : 0.9086          
##              Prevalence : 0.1632          
##          Detection Rate : 0.1018          
##    Detection Prevalence : 0.3288          
##       Balanced Accuracy : 0.6764          
##                                           
##        'Positive' Class : 1               
##

# Variable importance for the tuned caret model, then plotted.
X2 <- caret::varImp(emp_model2)
plot(X2)

# ROC inputs for the tuned caret model (emp_model2)

# Class probabilities from the TUNED model. Predicting with model_naive here
# would only reproduce the first ROC curve and its AUC.
prob_attrit2 <- predict(emp_model2, newdata = emp_test_cln, type = "prob")

# Pair the probability of class "1" (attrition) with a 0/1 indicator of
# whether the employee actually left.
data_roc2 <- data.frame(
  prob = prob_attrit2[, "1"],
  labels = as.numeric(emp_test_cln$Attrition == "1")
)
head(data_roc2)

##         prob labels
## 1 0.05658184      0
## 2 0.03436015      0
## 3 0.35047129      0
## 4 0.17859820      0
## 5 0.45145217      1
## 6 0.42141491      0

library(ROCR)

# Wrap the second set of scores and true labels in a ROCR prediction object.
naive_roc2 <- ROCR::prediction(data_roc2$prob, data_roc2$labels)

# ROC curve: true positive rate against false positive rate, with the
# diagonal drawn as the chance-level reference.
naive_roc2_curve <- performance(naive_roc2, measure = "tpr", x.measure = "fpr")
plot(naive_roc2_curve, main = "ROC")
abline(a = 0, b = 1)

# Area under the curve; values closer to 1 are better.
auc_n2 <- performance(naive_roc2, measure = "auc")
auc_n2@y.values

## [[1]]
## [1] 0.7257954

ML_2_Project_2_Helfrich

William Helfrich

6/8/2021

Replace Missing Values

Changing Variable Types

Remove correlated variables to ensure independence

Downsampling

Model Evaluation

Model Building

Model Evaluation

Hyperparameter Tuning