Preparation

# Load packages used in this session of R
# (plyr is loaded before the tidyverse so that dplyr's functions take precedence)
library(class)      # knn()
library(plyr)       # revalue()
library(caret)      # train(), confusionMatrix()
library(tidyverse)  # data manipulation (attaches ggplot2 and dplyr)
library(lattice)
library(e1071)      # svm(), tune(), and miscellaneous statistics functions
library(readxl)     # read_excel()
library(ROSE)       # ovun.sample() for under-sampling
library(ROCR)       # ROC curves and AUC
library(rminer)     # fit(), Importance(), mgraph() for SVM variable importance

Import and Cleaning

# Data Import and Cleaning

# Set the default knitr chunk options; echo = TRUE shows the code in the knitted output
knitr::opts_chunk$set(echo = TRUE)
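Other chunk defaults can be set in the same call; for instance (an optional variant, not used in this report), the coercion warnings printed below could be silenced globally:

# knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)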

# Read in the data
EDP <- read_excel("C:/Users/12177/Desktop/School/Module_3/Machine Learning II/data/Employee_Data_Class_Project.xlsx") %>%
  # Coerce these text columns to numeric; non-numeric entries become NA,
  # which produces the coercion warnings shown below
  mutate(NumCompaniesWorked = as.numeric(NumCompaniesWorked)) %>%
  mutate(TotalWorkingYears = as.numeric(TotalWorkingYears)) %>%
  mutate(EnvironmentSatisfaction = as.numeric(EnvironmentSatisfaction)) %>%
  mutate(JobSatisfaction = as.numeric(JobSatisfaction)) 
## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion

## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion

## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion

## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion
# Recode Attrition as a number: "Yes" = 1, "No" = 0
EDP$Attrition = ifelse(EDP$Attrition == "Yes", 1 ,0)

# Recode Business Travel as numbers
EDP$BusinessTravel <- revalue(EDP$BusinessTravel,
                              c("Non-Travel" = 0, "Travel_Rarely" = 1, "Travel_Frequently" = 2))

# Recode Gender as numbers
EDP$Gender <- revalue(EDP$Gender, c("Male" = 0, "Female" = 1))

# Recode Marital Status as numbers
EDP$MaritalStatus <- revalue(EDP$MaritalStatus,
                             c("Single" = 0, "Married" = 1, "Divorced" = 2))
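For reference, the same recoding can be done in one step without plyr; a sketch using dplyr::case_when (shown commented out, since the revalue() calls above have already recoded the columns; this version also returns numeric columns directly):

# Alternative: recode all three columns in one mutate() with case_when()
# EDP <- EDP %>%
#   mutate(BusinessTravel = case_when(BusinessTravel == "Non-Travel" ~ 0,
#                                     BusinessTravel == "Travel_Rarely" ~ 1,
#                                     BusinessTravel == "Travel_Frequently" ~ 2),
#          Gender = case_when(Gender == "Male" ~ 0, Gender == "Female" ~ 1),
#          MaritalStatus = case_when(MaritalStatus == "Single" ~ 0,
#                                    MaritalStatus == "Married" ~ 1,
#                                    MaritalStatus == "Divorced" ~ 2))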

# Convert the target to a factor and the recoded columns to numeric
EDP$Attrition <- as.factor(EDP$Attrition)
EDP$BusinessTravel <- as.numeric(EDP$BusinessTravel)
EDP$Gender <- as.numeric(EDP$Gender)
EDP$MaritalStatus <- as.numeric(EDP$MaritalStatus)

# Impute missing values with the column median
EDP$NumCompaniesWorked[is.na(EDP$NumCompaniesWorked)] = median(EDP$NumCompaniesWorked, na.rm = TRUE)
EDP$TotalWorkingYears[is.na(EDP$TotalWorkingYears)] = median(EDP$TotalWorkingYears, na.rm = TRUE)
EDP$EnvironmentSatisfaction[is.na(EDP$EnvironmentSatisfaction)] = median(EDP$EnvironmentSatisfaction, na.rm = TRUE)
EDP$JobSatisfaction[is.na(EDP$JobSatisfaction)] = median(EDP$JobSatisfaction, na.rm = TRUE)
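The four imputation lines can also be written once with dplyr::across; an equivalent sketch (commented out because the imputation above has already run):

# EDP <- EDP %>%
#   mutate(across(c(NumCompaniesWorked, TotalWorkingYears,
#                   EnvironmentSatisfaction, JobSatisfaction),
#                 ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))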

Partition

# Create Data Partition

# Set a seed first so the random split (and the under-sampling below) is reproducible
set.seed(123)

# Partition the data into training (70%) and testing (30%) sets
index <- sample(1:nrow(EDP), round(nrow(EDP) * 0.7))
# Training
train <- EDP[index, ]

# Under-sample the majority class so the training data are balanced (N = 1000 rows)
train <- ovun.sample(Attrition ~ ., data = train, method = "under",N = 1000)$data
table(train$Attrition)
## 
##   0   1 
## 492 508
# Test set: the rows not used for training
test <- EDP[-index, ]


# Build train and test sets without EmployeeID or StandardHours, which carry no predictive information
train_k <- train %>%
  select(-c(EmployeeID, StandardHours))

test_k <- test %>%
  select(-c(EmployeeID, StandardHours))

# Separate the predictors from the target
# Scale the predictors (the Attrition target does not need to be scaled).
# The test set is scaled with the training set's centers and spreads so that
# no information leaks from the test data into the model.
train_x <- scale(train_k[, -2])   # column 2 is Attrition
test_x <- scale(test_k[, -2], center = attr(train_x, "scaled:center"),
                scale = attr(train_x, "scaled:scale"))

# Create target variable for attrition
train_y <- train_k$Attrition
test_y <- test_k$Attrition

# A common rule of thumb starts k near the square root of the number of training rows
k <- sqrt(nrow(train_x))
k
## [1] 31.62278
# So the initial K is 31
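Since this is a two-class problem, an even k can produce tied votes, which knn() breaks at random. A small sketch (an addition, not part of the original analysis; k_odd is a hypothetical name) that rounds the rule-of-thumb value to the nearest odd integer:

# Force an odd k to avoid ties in binary majority voting
k_odd <- floor(sqrt(nrow(train_x)))       # 31 for the 1000-row training set
if (k_odd %% 2 == 0) k_odd <- k_odd + 1   # bump even values up to the next odd
k_odd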



Make the model

set.seed(123)

KNN_model <- knn(train = train_x, test = test_x, cl = train_y, k = 31)
head(KNN_model,10)
##  [1] 1 0 0 0 0 0 0 1 1 0
## Levels: 0 1
confusionMatrix(data = as.factor(KNN_model), reference = as.factor(test_y), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 728  72
##          1 392 131
##                                          
##                Accuracy : 0.6493         
##                  95% CI : (0.6229, 0.675)
##     No Information Rate : 0.8466         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1795         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.64532        
##             Specificity : 0.65000        
##          Pos Pred Value : 0.25048        
##          Neg Pred Value : 0.91000        
##              Prevalence : 0.15344        
##          Detection Rate : 0.09902        
##    Detection Prevalence : 0.39531        
##       Balanced Accuracy : 0.64766        
##                                          
##        'Positive' Class : 1              
## 
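As a sanity check, the headline metrics can be recomputed by hand from the matrix above (a verification sketch; the counts come straight from the printed output):

# Recompute the metrics from the confusion matrix above
TP <- 131; FN <- 72; TN <- 728; FP <- 392
TP / (TP + FN)                    # Sensitivity: 131/203  = 0.645
TN / (TN + FP)                    # Specificity: 728/1120 = 0.650
(TP + TN) / (TP + TN + FP + FN)   # Accuracy:    859/1323 = 0.649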

Variation of K value

set.seed(123)

#Accuracy for different values of k

# Create an output matrix (one row per k) to hold k and its test accuracy,
# then loop over k = 5..50, fit knn each time, and capture the accuracy
# from the confusion matrix
output <- matrix(ncol=2, nrow=46)

for (k_val in 5:50){
  y_pred = knn(train = train_x
               , test = test_x
               , cl = train_y
               , k = k_val)
  
  x = confusionMatrix(table(as.factor(y_pred), as.factor(test_y))) 
  acc = x$overall[1]
  
  output[k_val-4, ] = c(k_val, acc)
  
}

# Convert the output matrix to a data frame and plot the results
df <- as.data.frame(output)
names(df) <- c("K_value", "Accuracy")
df
##    K_value  Accuracy
## 1        5 0.6250945
## 2        6 0.6266062
## 3        7 0.6447468
## 4        8 0.6470144
## 5        9 0.6424792
## 6       10 0.6266062
## 7       11 0.6228269
## 8       12 0.6402116
## 9       13 0.6470144
## 10      14 0.6432351
## 11      15 0.6485261
## 12      16 0.6553288
## 13      17 0.6394558
## 14      18 0.6500378
## 15      19 0.6507937
## 16      20 0.6560847
## 17      21 0.6523054
## 18      22 0.6462585
## 19      23 0.6462585
## 20      24 0.6538171
## 21      25 0.6553288
## 22      26 0.6462585
## 23      27 0.6500378
## 24      28 0.6500378
## 25      29 0.6485261
## 26      30 0.6515495
## 27      31 0.6470144
## 28      32 0.6530612
## 29      33 0.6500378
## 30      34 0.6538171
## 31      35 0.6538171
## 32      36 0.6598639
## 33      37 0.6613757
## 34      38 0.6606198
## 35      39 0.6545729
## 36      40 0.6568405
## 37      41 0.6591081
## 38      42 0.6613757
## 39      43 0.6636432
## 40      44 0.6613757
## 41      45 0.6621315
## 42      46 0.6628874
## 43      47 0.6606198
## 44      48 0.6568405
## 45      49 0.6560847
## 46      50 0.6613757
# Finding the optimal K value

# Plot
ggplot(data=df, aes(x=K_value, y=Accuracy, group=1)) +
  geom_line(color="red")+
  geom_point()
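Rather than reading the best k off the plot alone, the top value can be pulled from the data frame directly (a small sketch using the df built above):

# Row of the sweep with the highest test accuracy
df[which.max(df$Accuracy), ]   # k = 43, accuracy ~0.664 in this run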

set.seed(123)
# Refit the final model with the adjusted value K = 26, chosen from the plot above
KNN_model_final_K <- knn(train = train_x, test = test_x, cl = train_y, k = 26)

confusionMatrix(data = as.factor(KNN_model_final_K), reference = as.factor(test_y), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 722  67
##          1 398 136
##                                           
##                Accuracy : 0.6485          
##                  95% CI : (0.6221, 0.6743)
##     No Information Rate : 0.8466          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1887          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6700          
##             Specificity : 0.6446          
##          Pos Pred Value : 0.2547          
##          Neg Pred Value : 0.9151          
##              Prevalence : 0.1534          
##          Detection Rate : 0.1028          
##    Detection Prevalence : 0.4036          
##       Balanced Accuracy : 0.6573          
##                                           
##        'Positive' Class : 1               
## 
# Note: class::knn() returns predictions directly rather than a model object,
# so predict() cannot be called on KNN_model_final_K. To build an ROC curve
# for this model, re-run knn() with prob = TRUE and convert the winning-class
# vote share into P(Attrition = 1):
# pred_prob <- knn(train = train_x, test = test_x, cl = train_y, k = 26, prob = TRUE)
# votes <- attr(pred_prob, "prob")                  # vote share of the winning class
# p1 <- ifelse(pred_prob == "1", votes, 1 - votes)  # convert to P(class = 1)
# roc_pred <- ROCR::prediction(p1, test_y)
# plot(performance(roc_pred, "tpr", "fpr"), main = "ROC Curve for KNN (k = 26)")

Using the caret Library

# train model
# Specify 10-fold cross validation
ctrl <- trainControl(method = "cv",  number = 10) 

# SET SEED
set.seed(123)

# CV KNN model
knn_cv <- train(
  Attrition ~ .,
  data = train_k,
  method = "knn",
  trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20
)

# assess results
knn_cv
## k-Nearest Neighbors 
## 
## 1000 samples
##   15 predictor
##    2 classes: '0', '1' 
## 
## Pre-processing: centered (15), scaled (15) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 900, 900, 900, 900, 901, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.6411462  0.2807716
##    7  0.6501771  0.2984526
##    9  0.6481464  0.2946856
##   11  0.6561074  0.3110094
##   13  0.6540569  0.3069246
##   15  0.6491068  0.2974317
##   17  0.6500662  0.2992230
##   19  0.6550070  0.3090259
##   21  0.6670870  0.3338970
##   23  0.6740575  0.3475966
##   25  0.6690472  0.3375561
##   27  0.6640668  0.3276653
##   29  0.6739973  0.3477711
##   31  0.6801272  0.3599706
##   33  0.6730567  0.3458866
##   35  0.6701070  0.3400554
##   37  0.6790975  0.3578920
##   39  0.6770676  0.3537018
##   41  0.6770377  0.3536796
##   43  0.6580270  0.3157373
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.
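caret stores the winning hyperparameter on the fitted object, so the selected k can also be retrieved programmatically:

# The k chosen by cross-validated accuracy (k = 31 above)
knn_cv$bestTune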
# Plot the 10 most important variables
plot(varImp(knn_cv), 10) 

# Evaluate knn_cv on the test data to report sensitivity and specificity.
# Note: caret re-applies its stored center/scale preprocessing at predict time,
# so we pass the unscaled test_k rather than the manually scaled test_x.
# confusionMatrix(data = predict(knn_cv, newdata = test_k),
#                 reference = as.factor(test_k$Attrition),
#                 positive = "1")


# ROC curve: get P(Attrition = 1) from the caret model
# (pass the unscaled test_k here too; caret applies its own preprocessing)
predict_knn <- predict(knn_cv, newdata = test_k, type = "prob")[, 2]

roc_knn <- prediction(predict_knn, test_y)
knn_perform <- performance(roc_knn, measure = "tpr",
                           x.measure = "fpr")
par(mfrow=c(1,1))
plot(knn_perform)

#AUC
auc_knn <- performance(roc_knn,measure="auc")
auc_knn <- auc_knn@y.values[[1]]
auc_knn
## [1] 0.6388987

SVM Model

# Fit a first SVM with a linear kernel

SVM1 = svm(formula = Attrition ~ .,
          data = train_k,
          type = 'C-classification',
          kernel = 'linear', 
          scale=TRUE) 

summary(SVM1)
## 
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification", 
##     kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  709
## 
##  ( 351 358 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# The summary indicates that a linear kernel was used with cost = 1,
# and that there are 709 support vectors: 351 in one class and 358 in the other.
# What if we instead used a smaller value of the cost parameter?

# Which observations are support vectors? $index holds their training-set row numbers
head(SVM1$index)
## [1] 2 3 5 6 7 8
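As a quick check, the class split of those support vectors can be tabulated from the index; it matches the ( 351 358 ) counts in the summary:

# How the support vectors divide across the two classes
table(train_k$Attrition[SVM1$index])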
SVM2 = svm(formula = Attrition ~ .,
           data = train_k,
           type = 'C-classification',
           kernel = 'radial', 
           cost=0.1,
           scale=TRUE) 

# With the smaller cost (and a switch to the radial kernel), the margin is wider:
# 887 support vectors versus 709 for SVM1
summary(SVM2)
## 
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification", 
##     kernel = "radial", cost = 0.1, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.1 
## 
## Number of Support Vectors:  887
## 
##  ( 444 443 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
#3. Tuning SVM Model

#The e1071 library includes a built-in function, tune(), to perform cross validation. 
#By default, tune() performs ten-fold cross-validation on a set of models of interest. 
#In order to use this function, we pass in relevant information about the set of models that are under consideration. 

set.seed(123)
tune.out = tune(svm, Attrition ~ ., data = train_k, kernel = "linear",
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.333 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03 0.349 0.04280446
## 2 1e-02 0.333 0.04922736
## 3 1e-01 0.343 0.04217688
## 4 1e+00 0.334 0.04141927
## 5 5e+00 0.333 0.04083844
## 6 1e+01 0.333 0.04083844
## 7 1e+02 0.333 0.04083844
# The lowest cross-validation error (0.333) comes at cost = 0.01, with costs 5, 10, and 100 tied
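The winning cost and its cross-validated error can also be read directly off the tune object:

# Best hyperparameter and its cross-validation error
tune.out$best.parameters    # cost = 0.01
tune.out$best.performance   # 0.333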

#The tune() function stores the best model obtained, which can be accessed as follows:
  
bestmod = tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = Attrition ~ ., data = train_k, 
##     ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  771
## 
##  ( 387 384 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# Predict on the test set; drop column 2 (Attrition), and note that the extra
# EmployeeID and StandardHours columns are ignored by the model formula
SVM_pred = predict(bestmod, newdata = test[-2])



#4. Check model evaluation
test_k2y <- test$Attrition 

confusionMatrix(data = as.factor(SVM_pred), reference = as.factor(test_k2y), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 693  60
##          1 427 143
##                                           
##                Accuracy : 0.6319          
##                  95% CI : (0.6053, 0.6579)
##     No Information Rate : 0.8466          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1857          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7044          
##             Specificity : 0.6188          
##          Pos Pred Value : 0.2509          
##          Neg Pred Value : 0.9203          
##              Prevalence : 0.1534          
##          Detection Rate : 0.1081          
##    Detection Prevalence : 0.4308          
##       Balanced Accuracy : 0.6616          
##                                           
##        'Positive' Class : 1               
## 
#5. If we change to a Gaussian (radial) kernel, do we see an improvement in the model?

# The radial kernel adds the gamma hyperparameter, which controls how much
# curvature the decision boundary has: high gamma means more curvature,
# low gamma means less.
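Concretely, the radial kernel computes K(x, x') = exp(-gamma * ||x - x'||^2); a tiny illustration (a standalone sketch, separate from the model code) of how gamma changes the similarity between two fixed points:

# RBF kernel: larger gamma makes similarity fall off faster with distance
rbf <- function(x1, x2, gamma) exp(-gamma * sum((x1 - x2)^2))
x1 <- c(0, 0); x2 <- c(1, 1)   # squared distance = 2
rbf(x1, x2, gamma = 0.05)      # ~0.905: low gamma, points still look similar
rbf(x1, x2, gamma = 5)         # ~4.5e-05: high gamma, points look very different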
SVMR2 = svm(formula = Attrition ~ .,
          data = train_k,
          type = 'C-classification',
          cost=1,
          gamma=0.05,
          kernel = 'radial') 
summary(SVMR2)
## 
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification", 
##     cost = 1, gamma = 0.05, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  721
## 
##  ( 368 353 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# We can perform cross-validation using tune() to select the best choice of
# gamma and cost for an SVM with a radial kernel
set.seed(123)
tune.out2 = tune(svm, Attrition ~ ., data = train_k, kernel = "radial",
                 ranges = list(cost = c(0.01, 0.1, 1, 10, 100), gamma = c(0.5, 1, 2, 3)))
summary(tune.out2)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10     1
## 
## - best performance: 0.075 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-02   0.5 0.514 0.04351245
## 2  1e-01   0.5 0.514 0.04351245
## 3  1e+00   0.5 0.083 0.03267687
## 4  1e+01   0.5 0.085 0.03407508
## 5  1e+02   0.5 0.085 0.03407508
## 6  1e-02   1.0 0.514 0.04351245
## 7  1e-01   1.0 0.514 0.04351245
## 8  1e+00   1.0 0.077 0.03267687
## 9  1e+01   1.0 0.075 0.03205897
## 10 1e+02   1.0 0.075 0.03205897
## 11 1e-02   2.0 0.514 0.04351245
## 12 1e-01   2.0 0.514 0.04351245
## 13 1e+00   2.0 0.081 0.02960856
## 14 1e+01   2.0 0.079 0.03212822
## 15 1e+02   2.0 0.079 0.03212822
## 16 1e-02   3.0 0.514 0.04351245
## 17 1e-01   3.0 0.514 0.04351245
## 18 1e+00   3.0 0.082 0.03084009
## 19 1e+01   3.0 0.082 0.03084009
## 20 1e+02   3.0 0.082 0.03084009
bestmod2 = tune.out2$best.model
summary(bestmod2)
## 
## Call:
## best.tune(method = svm, train.x = Attrition ~ ., data = train_k, 
##     ranges = list(cost = c(0.01, 0.1, 1, 10, 100), gamma = c(0.5, 
##         1, 2, 3)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  671
## 
##  ( 430 241 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
SVMR_pred2 = predict(bestmod2, newdata = test[-2])

test$Attrition <- as.factor(test$Attrition) 

#Check model evaluation
confusionMatrix(data = as.factor(SVMR_pred2), reference = as.factor(test_k2y), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1117   16
##          1    3  187
##                                           
##                Accuracy : 0.9856          
##                  95% CI : (0.9777, 0.9913)
##     No Information Rate : 0.8466          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9432          
##                                           
##  Mcnemar's Test P-Value : 0.005905        
##                                           
##             Sensitivity : 0.9212          
##             Specificity : 0.9973          
##          Pos Pred Value : 0.9842          
##          Neg Pred Value : 0.9859          
##              Prevalence : 0.1534          
##          Detection Rate : 0.1413          
##    Detection Prevalence : 0.1436          
##       Balanced Accuracy : 0.9593          
##                                           
##        'Positive' Class : 1               
## 
# Much better: accuracy jumps to 0.986, from 0.632 with the tuned linear kernel
# Helper to plot an ROC curve with ROCR
rocplot = function(pred, truth, ...) {
  predob = prediction(pred, truth)
  perf = performance(predob, "tpr", "fpr")
  plot(perf, ...)
}

svm.opt = svm(Attrition ~ ., data = train_k, kernel = "radial",
              gamma = 0.5, cost = 1, decision.values = TRUE)
# Flip the sign of the decision values so that larger scores correspond to class "1"
fitted = attributes(predict(svm.opt, train_k, decision.values = TRUE))$decision.values * -1
par(mfrow = c(1, 2))
rocplot(fitted, train_k$Attrition, main = "Training Data")

# The SVM appears to produce accurate predictions on the training data,
# but we are really interested in prediction accuracy on the test data,
# so we recompute the ROC curve there.

fitted = attributes(predict(svm.opt, test, decision.values = TRUE))$decision.values * -1
rocplot(fitted, test$Attrition, main = "Test Data")


fitted <- predict(svm.opt, newdata = test_k, decision.values = TRUE) %>% 
  attr("decision.values")*-1
pred <- ROCR::prediction(fitted[,1], test_k[["Attrition"]])
performance(pred, "auc")@y.values
## [[1]]
## [1] 0.9627639
# caret's varImp() does not work with e1071 SVMs, so we use the rminer package for variable importance

# Fit the SVM through rminer and compute sensitivity-based variable importance
model <- fit(Attrition ~ ., data = train_k, model = "svm", kpar = "automatic", C = 1)

svm.imp <- Importance(model, data = train_k)

# Plot the importance of each predictor
par(mfrow = c(1, 1))
L = list(runs = 1, sen = t(svm.imp$imp), sresponses = svm.imp$sresponses)
mgraph(L, graph = "IMP", leg = names(train_k), col = "gray", Grid = 10)