# Load packages used in this session of R
library(class)     # knn()
library(caret)     # confusionMatrix(), trainControl(), train()
library(tidyverse)
library(lattice)
library(ggplot2)
library(e1071)     # svm(), tune(), and other miscellaneous statistics functions
library(plyr)      # revalue(); loaded before dplyr to avoid masking issues
library(dplyr)
library(readxl)
library(ROSE)      # ovun.sample()
library(ROCR)      # prediction(), performance()
library(rminer)    # fit(), Importance()
# Data Import and Clean
# Set default knitr chunk options so code is echoed in the knitted output
knitr::opts_chunk$set(echo = TRUE)
# Read in the data
EDP <- read_excel("C:/Users/12177/Desktop/School/Module_3/Machine Learning II/data/Employee_Data_Class_Project.xlsx") %>%
  # Coerce columns that were read in as text into numeric
  mutate(NumCompaniesWorked = as.numeric(NumCompaniesWorked),
         TotalWorkingYears = as.numeric(TotalWorkingYears),
         EnvironmentSatisfaction = as.numeric(EnvironmentSatisfaction),
         JobSatisfaction = as.numeric(JobSatisfaction))
## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion
## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion
## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion
## Warning in eval(cols[[col]], .data, parent.frame()): NAs introduced by coercion
# Change Attrition to use numbers
EDP$Attrition = ifelse(EDP$Attrition == "Yes", 1 ,0)
# Recode Business Travel as numbers
EDP$BusinessTravel <- revalue(EDP$BusinessTravel,
                              c("Non-Travel" = 0, "Travel_Rarely" = 1, "Travel_Frequently" = 2))
# Recode Gender as numbers
EDP$Gender <- revalue(EDP$Gender, c("Male" = 0, "Female" = 1))
# Recode Marital Status as numbers
EDP$MaritalStatus <- revalue(EDP$MaritalStatus,
                             c("Single" = 0, "Married" = 1, "Divorced" = 2))
# Attrition becomes a factor (the classification target); the recoded predictors become numeric
EDP$Attrition <- as.factor(EDP$Attrition)
EDP$BusinessTravel <- as.numeric(EDP$BusinessTravel)
EDP$Gender <- as.numeric(EDP$Gender)
EDP$MaritalStatus <- as.numeric(EDP$MaritalStatus)
# Impute missing values with each column's median
EDP$NumCompaniesWorked[is.na(EDP$NumCompaniesWorked)] = median(EDP$NumCompaniesWorked, na.rm = TRUE)
EDP$TotalWorkingYears[is.na(EDP$TotalWorkingYears)] = median(EDP$TotalWorkingYears, na.rm = TRUE)
EDP$EnvironmentSatisfaction[is.na(EDP$EnvironmentSatisfaction)] = median(EDP$EnvironmentSatisfaction, na.rm = TRUE)
EDP$JobSatisfaction[is.na(EDP$JobSatisfaction)] = median(EDP$JobSatisfaction, na.rm = TRUE)
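# Aside (illustrative sketch): the four imputation lines above can be written
# more compactly with dplyr::across(); rerunning it here is a no-op since the
# NAs are already filled, but the behavior is identical.
EDP <- EDP %>%
  mutate(across(c(NumCompaniesWorked, TotalWorkingYears,
                  EnvironmentSatisfaction, JobSatisfaction),
                ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))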
# Create Data Partition
# Seed the RNG first so the 70/30 split is reproducible
set.seed(123)
index <- sample(1:nrow(EDP), round(nrow(EDP) * 0.7))
# Training set
train <- EDP[index, ]
# Undersample the majority class so the training data is roughly balanced (N = 1000 rows total)
train <- ovun.sample(Attrition ~ ., data = train, method = "under", N = 1000)$data
table(train$Attrition)
##
## 0 1
## 492 508
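# Alternative balancer (illustrative sketch, not used later): caret's
# downSample() samples the majority class down to exactly the minority-class
# count, rather than to a chosen total N; train_ds is a hypothetical name.
train_ds <- downSample(x = train[, names(train) != "Attrition"],
                       y = train$Attrition, yname = "Attrition")
table(train_ds$Attrition)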
# Test data set / dataframe
test <- EDP[-index, ]
# Build modeling train/test sets without the EmployeeID or StandardHours columns
train_k <- train %>%
select(-c(EmployeeID, StandardHours))
test_k <- test %>%
select(-c(EmployeeID, StandardHours))
# Separate predictors and target
# Scale the predictors (the Attrition target does not need to be scaled);
# reuse the training set's centering and scaling for the test set
train_x <- scale(train_k[, -2])
test_x <- scale(test_k[, -2], center = attr(train_x, "scaled:center"),
scale = attr(train_x, "scaled:scale"))
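# Sanity check (illustrative): training columns are standardized to exactly
# mean 0 / sd 1, while test columns come out only close to 0/1 because they
# reuse the training center and scale; this avoids leaking test-set
# information into the preprocessing.
summary(colMeans(train_x))
summary(apply(test_x, 2, sd))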
# Create target variable for attrition
train_y <- train_k$Attrition
test_y <- test_k$Attrition
# Rule of thumb: start with k near sqrt(n)
k <- sqrt(nrow(train_x))
k
## [1] 31.62278
# Initial k is 31 (sqrt of 1000 training rows, rounded down)
set.seed(123)
KNN_model <- knn(train = train_x, test = test_x, cl = train_y, k = 31)
head(KNN_model,10)
## [1] 1 0 0 0 0 0 0 1 1 0
## Levels: 0 1
confusionMatrix(data = as.factor(KNN_model), reference = as.factor(test_y),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 728 72
## 1 392 131
##
## Accuracy : 0.6493
## 95% CI : (0.6229, 0.675)
## No Information Rate : 0.8466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1795
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.64532
## Specificity : 0.65000
## Pos Pred Value : 0.25048
## Neg Pred Value : 0.91000
## Prevalence : 0.15344
## Detection Rate : 0.09902
## Detection Prevalence : 0.39531
## Balanced Accuracy : 0.64766
##
## 'Positive' Class : 1
##
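# Hand check (illustrative) of the headline metrics from the printed counts
# above, with "1" as the positive class: TP = 131, FN = 72, FP = 392, TN = 728
TP <- 131; FN <- 72; FP <- 392; TN <- 728
c(sensitivity = TP / (TP + FN),                   # 131/203  = 0.645
  specificity = TN / (TN + FP),                   # 728/1120 = 0.650
  accuracy    = (TP + TN) / (TP + TN + FP + FN))  # 859/1323 = 0.649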
set.seed(123)
# Accuracy for different values of k:
# pre-allocate an output matrix, then loop over k = 5..50, fit knn,
# and store each k alongside its confusion-matrix accuracy
output <- matrix(ncol = 2, nrow = 46)
for (k_val in 5:50) {
  y_pred <- knn(train = train_x, test = test_x, cl = train_y, k = k_val)
  x <- confusionMatrix(table(as.factor(y_pred), as.factor(test_y)))
  acc <- x$overall[1]
  output[k_val - 4, ] <- c(k_val, acc)
}
# Convert the output matrix to a data frame, name the columns, and inspect
df <- as.data.frame(output)
names(df) <- c("K_value", "Accuracy")
df
## K_value Accuracy
## 1 5 0.6250945
## 2 6 0.6266062
## 3 7 0.6447468
## 4 8 0.6470144
## 5 9 0.6424792
## 6 10 0.6266062
## 7 11 0.6228269
## 8 12 0.6402116
## 9 13 0.6470144
## 10 14 0.6432351
## 11 15 0.6485261
## 12 16 0.6553288
## 13 17 0.6394558
## 14 18 0.6500378
## 15 19 0.6507937
## 16 20 0.6560847
## 17 21 0.6523054
## 18 22 0.6462585
## 19 23 0.6462585
## 20 24 0.6538171
## 21 25 0.6553288
## 22 26 0.6462585
## 23 27 0.6500378
## 24 28 0.6500378
## 25 29 0.6485261
## 26 30 0.6515495
## 27 31 0.6470144
## 28 32 0.6530612
## 29 33 0.6500378
## 30 34 0.6538171
## 31 35 0.6538171
## 32 36 0.6598639
## 33 37 0.6613757
## 34 38 0.6606198
## 35 39 0.6545729
## 36 40 0.6568405
## 37 41 0.6591081
## 38 42 0.6613757
## 39 43 0.6636432
## 40 44 0.6613757
## 41 45 0.6621315
## 42 46 0.6628874
## 43 47 0.6606198
## 44 48 0.6568405
## 45 49 0.6560847
## 46 50 0.6613757
# Finding an optimized k value
# Plot accuracy against k
ggplot(data = df, aes(x = K_value, y = Accuracy, group = 1)) +
  geom_line(color = "red") +
  geom_point()
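# Illustrative: pull the single highest-accuracy k from the table
# programmatically (the final model below uses k = 26 instead)
df[which.max(df$Accuracy), ]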
set.seed(123)
# Adjusted k value: fit the final model with k = 26
KNN_model_final_K <- knn(train = train_x, test = test_x, cl = train_y, k = 26)
confusionMatrix(data = as.factor(KNN_model_final_K), reference = as.factor(test_y),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 722 67
## 1 398 136
##
## Accuracy : 0.6485
## 95% CI : (0.6221, 0.6743)
## No Information Rate : 0.8466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1887
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6700
## Specificity : 0.6446
## Pos Pred Value : 0.2547
## Neg Pred Value : 0.9151
## Prevalence : 0.1534
## Detection Rate : 0.1028
## Detection Prevalence : 0.4036
## Balanced Accuracy : 0.6573
##
## 'Positive' Class : 1
##
# Note: class::knn() returns its predictions directly and has no predict()
# method, so a separate prediction step is not possible here; the ROC curve
# is built from the cross-validated caret model below instead.
# Train a cross-validated KNN model with caret
# Specify 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# SET SEED
set.seed(123)
# CV KNN model
knn_cv <- train(
Attrition ~ .,
data = train_k,
method = "knn",
trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20
)
# assess results
knn_cv
## k-Nearest Neighbors
##
## 1000 samples
## 15 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (15), scaled (15)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 900, 900, 900, 900, 901, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6411462 0.2807716
## 7 0.6501771 0.2984526
## 9 0.6481464 0.2946856
## 11 0.6561074 0.3110094
## 13 0.6540569 0.3069246
## 15 0.6491068 0.2974317
## 17 0.6500662 0.2992230
## 19 0.6550070 0.3090259
## 21 0.6670870 0.3338970
## 23 0.6740575 0.3475966
## 25 0.6690472 0.3375561
## 27 0.6640668 0.3276653
## 29 0.6739973 0.3477711
## 31 0.6801272 0.3599706
## 33 0.6730567 0.3458866
## 35 0.6701070 0.3400554
## 37 0.6790975 0.3578920
## 39 0.6770676 0.3537018
## 41 0.6770377 0.3536796
## 43 0.6580270 0.3157373
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.
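# Illustrative: caret train objects have a ggplot() method that plots the
# resampled accuracy profile across the tuning grid
ggplot(knn_cv)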
# Plot the 10 most important variables
plot(varImp(knn_cv), 10)
# ROC curve
# Predict class probabilities on the unscaled test set: caret applies its
# stored center/scale preprocessing itself, so passing the pre-scaled test_x
# would standardize twice
predict_knn <- predict(knn_cv, test_k, type = "prob")[, 2]
roc_knn <- prediction(predict_knn, test_y)
knn_perform <- performance(roc_knn, measure = "tpr", x.measure = "fpr")
par(mfrow = c(1, 1))
plot(knn_perform)
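# Add a chance-level diagonal for reference (an illustrative addition)
abline(a = 0, b = 1, lty = 2)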
#AUC
auc_knn <- performance(roc_knn,measure="auc")
auc_knn <- auc_knn@y.values[[1]]
auc_knn
## [1] 0.6388987
# SVM Model: begin with a linear-kernel support vector machine
SVM1 = svm(formula = Attrition ~ .,
data = train_k,
type = 'C-classification',
kernel = 'linear',
scale=TRUE)
summary(SVM1)
##
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification",
## kernel = "linear", scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 709
##
## ( 351 358 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# The summary indicates that a linear kernel was used with cost = 1,
# and that there were 709 support vectors: 351 in one class and 358 in the other.
# What if we instead used a smaller cost (and a radial kernel)?
# Which observations are support vectors? (first few indices)
head(SVM1$index)
## [1] 2 3 5 6 7 8
SVM2 = svm(formula = Attrition ~ .,
data = train_k,
type = 'C-classification',
kernel = 'radial',
cost=0.1,
scale=TRUE)
# 887 support vectors: the smaller cost widens the margin, so more points become support vectors
summary(SVM2)
##
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification",
## kernel = "radial", cost = 0.1, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.1
##
## Number of Support Vectors: 887
##
## ( 444 443 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
#3. Tuning the SVM Model
# The e1071 library includes a built-in function, tune(), to perform cross-validation.
# By default, tune() performs ten-fold cross-validation over a set of candidate models;
# we pass it the ranges of tuning parameters to consider.
set.seed(123)
tune.out <- tune(svm, Attrition ~ ., data = train_k, kernel = "linear",
                 ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.333
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.349 0.04280446
## 2 1e-02 0.333 0.04922736
## 3 1e-01 0.343 0.04217688
## 4 1e+00 0.334 0.04141927
## 5 5e+00 0.333 0.04083844
## 6 1e+01 0.333 0.04083844
## 7 1e+02 0.333 0.04083844
# The lowest cross-validation error (0.333) occurs at cost = 0.01; costs 5, 10, and 100 tie at the same error
#The tune() function stores the best model obtained, which can be accessed as follows:
bestmod = tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = Attrition ~ ., data = train_k,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 771
##
## ( 387 384 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Predict on the test set, dropping column 2 (Attrition)
SVM_pred <- predict(bestmod, newdata = test[-2])
#4. Check model evaluation
test_k2y <- test$Attrition
confusionMatrix(data = as.factor(SVM_pred), reference = as.factor(test_k2y),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 693 60
## 1 427 143
##
## Accuracy : 0.6319
## 95% CI : (0.6053, 0.6579)
## No Information Rate : 0.8466
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1857
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7044
## Specificity : 0.6188
## Pos Pred Value : 0.2509
## Neg Pred Value : 0.9203
## Prevalence : 0.1534
## Detection Rate : 0.1081
## Detection Prevalence : 0.4308
## Balanced Accuracy : 0.6616
##
## 'Positive' Class : 1
##
#5. If we switch to a Gaussian (radial) kernel, does the model improve?
# We add the gamma hyperparameter, which controls how much curvature the
# decision boundary can have: high gamma means more curvature, low gamma means less.
SVMR2 = svm(formula = Attrition ~ .,
data = train_k,
type = 'C-classification',
cost=1,
gamma=0.05,
kernel = 'radial')
summary(SVMR2)
##
## Call:
## svm(formula = Attrition ~ ., data = train_k, type = "C-classification",
## cost = 1, gamma = 0.05, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 721
##
## ( 368 353 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
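# Illustrative sketch of gamma's effect: refit the same radial SVM at a low
# and a high gamma and compare total support-vector counts (tot.nSV). A more
# curved boundary (higher gamma) typically recruits more support vectors;
# the gamma values here are arbitrary choices for the illustration.
svm_lo <- svm(Attrition ~ ., data = train_k, type = 'C-classification',
              kernel = 'radial', cost = 1, gamma = 0.01)
svm_hi <- svm(Attrition ~ ., data = train_k, type = 'C-classification',
              kernel = 'radial', cost = 1, gamma = 5)
c(low_gamma = svm_lo$tot.nSV, high_gamma = svm_hi$tot.nSV)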
# We can use tune() again to cross-validate over the best choice of gamma and cost for a radial-kernel SVM
set.seed(123)
tune.out2 <- tune(svm, Attrition ~ ., data = train_k, kernel = "radial",
                  ranges = list(cost = c(0.01, 0.1, 1, 10, 100), gamma = c(0.5, 1, 2, 3)))
summary(tune.out2)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 1
##
## - best performance: 0.075
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-02 0.5 0.514 0.04351245
## 2 1e-01 0.5 0.514 0.04351245
## 3 1e+00 0.5 0.083 0.03267687
## 4 1e+01 0.5 0.085 0.03407508
## 5 1e+02 0.5 0.085 0.03407508
## 6 1e-02 1.0 0.514 0.04351245
## 7 1e-01 1.0 0.514 0.04351245
## 8 1e+00 1.0 0.077 0.03267687
## 9 1e+01 1.0 0.075 0.03205897
## 10 1e+02 1.0 0.075 0.03205897
## 11 1e-02 2.0 0.514 0.04351245
## 12 1e-01 2.0 0.514 0.04351245
## 13 1e+00 2.0 0.081 0.02960856
## 14 1e+01 2.0 0.079 0.03212822
## 15 1e+02 2.0 0.079 0.03212822
## 16 1e-02 3.0 0.514 0.04351245
## 17 1e-01 3.0 0.514 0.04351245
## 18 1e+00 3.0 0.082 0.03084009
## 19 1e+01 3.0 0.082 0.03084009
## 20 1e+02 3.0 0.082 0.03084009
bestmod2 = tune.out2$best.model
summary(bestmod2)
##
## Call:
## best.tune(method = svm, train.x = Attrition ~ ., data = train_k,
## ranges = list(cost = c(0.01, 0.1, 1, 10, 100), gamma = c(0.5,
## 1, 2, 3)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 671
##
## ( 430 241 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
SVMR_pred2 = predict(bestmod2, newdata = test[-2])
test$Attrition <- as.factor(test$Attrition)
#Check model evaluation
confusionMatrix(data = as.factor(SVMR_pred2), reference = as.factor(test_k2y),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1117 16
## 1 3 187
##
## Accuracy : 0.9856
## 95% CI : (0.9777, 0.9913)
## No Information Rate : 0.8466
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9432
##
## Mcnemar's Test P-Value : 0.005905
##
## Sensitivity : 0.9212
## Specificity : 0.9973
## Pos Pred Value : 0.9842
## Neg Pred Value : 0.9859
## Prevalence : 0.1534
## Detection Rate : 0.1413
## Detection Prevalence : 0.1436
## Balanced Accuracy : 0.9593
##
## 'Positive' Class : 1
##
# Much better than the linear kernel
# Helper: draw an ROC curve with ROCR
rocplot <- function(pred, truth, ...) {
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
svm.opt <- svm(Attrition ~ ., data = train_k, kernel = "radial",
               gamma = 0.5, cost = 1, decision.values = TRUE)
# Flip the sign of the decision values so larger values track the positive ("1") class
fitted <- attributes(predict(svm.opt, train_k, decision.values = TRUE))$decision.values * -1
par(mfrow = c(1, 2))
rocplot(fitted, train_k$Attrition, main = "Training Data")
# The SVM appears to produce accurate predictions on the training data,
# but we are more interested in prediction accuracy on the held-out test data,
# so we draw the ROC curve for the test set as well.
fitted <- attributes(predict(svm.opt, test, decision.values = TRUE))$decision.values * -1
rocplot(fitted, test$Attrition, main = "Test Data")
# Same computation via a pipe; the parentheses make the sign flip explicit
fitted <- (predict(svm.opt, newdata = test_k, decision.values = TRUE) %>%
  attr("decision.values")) * -1
pred <- ROCR::prediction(fitted[,1], test_k[["Attrition"]])
performance(pred, "auc")@y.values
## [[1]]
## [1] 0.9627639
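# Illustrative side-by-side AUC comparison: auc_svm is recomputed from the
# ROCR prediction object `pred` built above; auc_knn comes from the KNN section
auc_svm <- performance(pred, "auc")@y.values[[1]]
c(knn = auc_knn, svm = auc_svm)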
# caret's varImp() does not work with e1071 SVMs, so use the rminer package for variable importance
model <- fit(Attrition ~ ., data = train_k, model = "svm", kpar = "automatic", C = 1)
svm.imp <- Importance(model, data = train_k)
par(mfrow = c(1, 1))
L <- list(runs = 1, sen = t(svm.imp$imp), sresponses = svm.imp$sresponses)
mgraph(L, graph = "IMP", leg = names(train_k), col = "gray", Grid = 10)
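# Illustrative numeric view of the same importances. Assumption (hedged):
# Importance() returns one sensitivity value per column of train_k, with the
# response column itself carrying no importance, matching the
# leg = names(train_k) mapping in the plot above.
round(setNames(as.numeric(svm.imp$imp), names(train_k)), 3)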