library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(class)
library(e1071)
data = read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_Edit1.csv")
summary(data)
## ID AGE ATTRITION BUSINESSTRAVEL
## Min. : 1.0 Min. :18.00 Length:870 Length:870
## 1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
## Median :435.5 Median :35.00 Mode :character Mode :character
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
## DAILYRATE DEPARTMENT DISTANCEFROMHOME EDUCATION
## Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Mode :character Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
## EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER ENVIRONMENTSATISFACTION
## Length:870 Min. :1 Min. : 1.0 Min. :1.000
## Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Mode :character Median :1 Median :1039.0 Median :3.000
## Mean :1 Mean :1029.8 Mean :2.701
## 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Max. :1 Max. :2064.0 Max. :4.000
## GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
## Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Mode :character Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
## JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## Length:870 Min. :1.000 Length:870 Min. : 1081
## Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
## Mode :character Median :3.000 Mode :character Median : 4946
## Mean :2.709 Mean : 6390
## 3rd Qu.:4.000 3rd Qu.: 8182
## Max. :4.000 Max. :19999
## MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME
## Min. : 2094 Min. :0.000 Length:870 Length:870
## 1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
## Median :14074 Median :2.000 Mode :character Mode :character
## Mean :14326 Mean :2.728
## 3rd Qu.:20456 3rd Qu.:4.000
## Max. :26997 Max. :9.000
## PERCENTSALARYHIKE PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS
## Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
## 1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
## Median :14.0 Median :3.000 Median :3.000 Median :80
## Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
## 3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
## Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
## STOCKOPTIONLEVEL TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE
## Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
## Median :1.0000 Median :10.00 Median :3.000 Median :3.000
## Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
## 3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
## YEARSATCOMPANY YEARSINCURRENTROLE YEARSSINCELASTPROMOTION
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 5.000 Median : 3.000 Median : 1.000
## Mean : 6.962 Mean : 4.205 Mean : 2.169
## 3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
## Max. :40.000 Max. :18.000 Max. :15.000
## YEARSWITHCURRMANAGER ATTRITION_BINARY OVERALLSATISFACTION EDU
## Min. : 0.00 Min. :0.0000 Min. : 3.000 Length:870
## 1st Qu.: 2.00 1st Qu.:0.0000 1st Qu.: 7.000 Class :character
## Median : 3.00 Median :0.0000 Median : 8.000 Mode :character
## Mean : 4.14 Mean :0.1609 Mean : 8.117
## 3rd Qu.: 7.00 3rd Qu.:0.0000 3rd Qu.: 9.000
## Max. :17.00 Max. :1.0000 Max. :12.000
## ENV_SAT JOB_INV JOB_SAT PERF_RATE
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## REL_SAT WLB AGE_BINS DAILYRATE_BINS
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## COMMUTE_BINS HOURLYRATE_BINS MONTHLYINCOME_BINS MONTHLYRATE_BINS
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
## Length:870 Length:870 Length:870
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
cat_cols = c("DEPARTMENT","GENDER","JOBLEVEL","JOBROLE","MARITALSTATUS","OVERTIME","ENV_SAT","JOB_INV","WLB","AGE_BINS","COMMUTE_BINS","PERCENTSALARYHIKE_BINS","TOTALWORKINGYEARS_BINS")
target = "ATTRITION_BINARY"
categorical_data = data[, c(cat_cols, target)]
set.seed(44) # chosen because its my age
trainIndex = createDataPartition(data$ATTRITION_BINARY, p = 0.85, list = FALSE)
train_cat = categorical_data[trainIndex, ]
test_cat = categorical_data[-trainIndex, ]
nb_model = naiveBayes(ATTRITION_BINARY ~ ., data = train_cat)
nb_predictions = predict(nb_model, test_cat)
actual_values = factor(test_cat$ATTRITION_BINARY, levels = c(0, 1)) # Ensure same levels as predictions
confusionMatrix(nb_predictions, actual_values)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 104 7
## 1 5 14
##
## Accuracy : 0.9077
## 95% CI : (0.8443, 0.9514)
## No Information Rate : 0.8385
## P-Value [Acc > NIR] : 0.01651
##
## Kappa : 0.6456
##
## Mcnemar's Test P-Value : 0.77283
##
## Sensitivity : 0.9541
## Specificity : 0.6667
## Pos Pred Value : 0.9369
## Neg Pred Value : 0.7368
## Prevalence : 0.8385
## Detection Rate : 0.8000
## Detection Prevalence : 0.8538
## Balanced Accuracy : 0.8104
##
## 'Positive' Class : 0
##
new_data = read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_TEST.csv")
head(new_data)
## ID AGE BUSINESSTRAVEL DAILYRATE DEPARTMENT DISTANCEFROMHOME
## 1 1171 35 Travel_Rarely 750 Research & Development 28
## 2 1172 33 Travel_Rarely 147 Human Resources 2
## 3 1173 26 Travel_Rarely 1330 Research & Development 21
## 4 1174 55 Travel_Rarely 1311 Research & Development 2
## 5 1175 29 Travel_Rarely 1246 Sales 19
## 6 1176 51 Travel_Frequently 1456 Research & Development 1
## EDUCATION EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER
## 1 3 Life Sciences 1 1596
## 2 3 Human Resources 1 1207
## 3 3 Medical 1 1107
## 4 3 Life Sciences 1 505
## 5 3 Life Sciences 1 1497
## 6 4 Medical 1 145
## ENVIRONMENTSATISFACTION GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## 1 2 Male 46 4 2
## 2 2 Male 99 3 1
## 3 1 Male 37 3 1
## 4 3 Female 97 3 4
## 5 3 Male 77 2 2
## 6 1 Female 30 2 3
## JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## 1 Laboratory Technician 3 Married 3407
## 2 Human Resources 3 Married 3600
## 3 Laboratory Technician 3 Divorced 2377
## 4 Manager 4 Single 16659
## 5 Sales Executive 3 Divorced 8620
## 6 Healthcare Representative 1 Single 7484
## MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME PERCENTSALARYHIKE
## 1 25348 1 Y No 17
## 2 8429 1 Y No 13
## 3 19373 1 Y No 20
## 4 23258 2 Y Yes 13
## 5 23757 1 Y No 14
## 6 25796 3 Y No 20
## PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS STOCKOPTIONLEVEL
## 1 3 4 80 2
## 2 3 4 80 1
## 3 4 3 80 1
## 4 3 3 80 0
## 5 3 3 80 2
## 6 4 3 80 0
## TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE YEARSATCOMPANY
## 1 10 3 2 10
## 2 5 2 3 5
## 3 1 0 2 1
## 4 30 2 3 5
## 5 10 3 3 10
## 6 23 1 2 13
## YEARSINCURRENTROLE YEARSSINCELASTPROMOTION YEARSWITHCURRMANAGER
## 1 9 6 8
## 2 4 1 4
## 3 1 0 0
## 4 4 1 2
## 5 7 0 4
## 6 12 12 8
## OVERALLSATISFACTION EDU ENV_SAT JOB_INV JOB_SAT PERF_RATE
## 1 9 Bachelor Medium Very High High Excellent
## 2 9 Bachelor Medium High High Excellent
## 3 7 Bachelor Low High High Outstanding
## 4 10 Bachelor High High Very High Excellent
## 5 9 Bachelor High Medium High Excellent
## 6 5 Master Low Medium Low Outstanding
## REL_SAT WLB AGE_BINS DAILYRATE_BINS COMMUTE_BINS HOURLYRATE_BINS
## 1 Very High Good 35-40 600-800 25-30 40-50
## 2 Very High Better 30-35 Less Than 200 0-5 90 and up
## 3 High Good 25-30 1200-1400 20-25 30-40
## 4 High Better 55-60 1200-1400 0-5 90 and up
## 5 High Better 25-30 1200-1400 15-20 70-80
## 6 High Good 50-55 1400 and up 0-5 30-40
## MONTHLYINCOME_BINS MONTHLYRATE_BINS PERCENTSALARYHIKE_BINS
## 1 3000-4000 25000-27500 15-20%
## 2 3000-4000 7500-10000 10-15%
## 3 2000-3000 17500-20000 20-25%
## 4 16000-17000 22500-25000 10-15%
## 5 8000-9000 22500-25000 10-15%
## 6 7000-8000 25000-27500 20-25%
## TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
## 1 10-15 10-15
## 2 5-10 5-10
## 3 Less Than 5 Less Than 5
## 4 30-35 5-10
## 5 10-15 10-15
## 6 20-25 10-15
cat_new_data = new_data[, c(cat_cols)]
head(cat_new_data)
## DEPARTMENT GENDER JOBLEVEL JOBROLE
## 1 Research & Development Male 2 Laboratory Technician
## 2 Human Resources Male 1 Human Resources
## 3 Research & Development Male 1 Laboratory Technician
## 4 Research & Development Female 4 Manager
## 5 Sales Male 2 Sales Executive
## 6 Research & Development Female 3 Healthcare Representative
## MARITALSTATUS OVERTIME ENV_SAT JOB_INV WLB AGE_BINS COMMUTE_BINS
## 1 Married No Medium Very High Good 35-40 25-30
## 2 Married No Medium High Better 30-35 0-5
## 3 Divorced No Low High Good 25-30 20-25
## 4 Single Yes High High Better 55-60 0-5
## 5 Divorced No High Medium Better 25-30 15-20
## 6 Single No Low Medium Good 50-55 0-5
## PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS
## 1 15-20% 10-15
## 2 10-15% 5-10
## 3 20-25% Less Than 5
## 4 10-15% 30-35
## 5 10-15% 10-15
## 6 20-25% 20-25
final_predictions = predict(nb_model, cat_new_data)
new_data$Predicted_Attrition = final_predictions
head(new_data)
## ID AGE BUSINESSTRAVEL DAILYRATE DEPARTMENT DISTANCEFROMHOME
## 1 1171 35 Travel_Rarely 750 Research & Development 28
## 2 1172 33 Travel_Rarely 147 Human Resources 2
## 3 1173 26 Travel_Rarely 1330 Research & Development 21
## 4 1174 55 Travel_Rarely 1311 Research & Development 2
## 5 1175 29 Travel_Rarely 1246 Sales 19
## 6 1176 51 Travel_Frequently 1456 Research & Development 1
## EDUCATION EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER
## 1 3 Life Sciences 1 1596
## 2 3 Human Resources 1 1207
## 3 3 Medical 1 1107
## 4 3 Life Sciences 1 505
## 5 3 Life Sciences 1 1497
## 6 4 Medical 1 145
## ENVIRONMENTSATISFACTION GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## 1 2 Male 46 4 2
## 2 2 Male 99 3 1
## 3 1 Male 37 3 1
## 4 3 Female 97 3 4
## 5 3 Male 77 2 2
## 6 1 Female 30 2 3
## JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## 1 Laboratory Technician 3 Married 3407
## 2 Human Resources 3 Married 3600
## 3 Laboratory Technician 3 Divorced 2377
## 4 Manager 4 Single 16659
## 5 Sales Executive 3 Divorced 8620
## 6 Healthcare Representative 1 Single 7484
## MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME PERCENTSALARYHIKE
## 1 25348 1 Y No 17
## 2 8429 1 Y No 13
## 3 19373 1 Y No 20
## 4 23258 2 Y Yes 13
## 5 23757 1 Y No 14
## 6 25796 3 Y No 20
## PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS STOCKOPTIONLEVEL
## 1 3 4 80 2
## 2 3 4 80 1
## 3 4 3 80 1
## 4 3 3 80 0
## 5 3 3 80 2
## 6 4 3 80 0
## TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE YEARSATCOMPANY
## 1 10 3 2 10
## 2 5 2 3 5
## 3 1 0 2 1
## 4 30 2 3 5
## 5 10 3 3 10
## 6 23 1 2 13
## YEARSINCURRENTROLE YEARSSINCELASTPROMOTION YEARSWITHCURRMANAGER
## 1 9 6 8
## 2 4 1 4
## 3 1 0 0
## 4 4 1 2
## 5 7 0 4
## 6 12 12 8
## OVERALLSATISFACTION EDU ENV_SAT JOB_INV JOB_SAT PERF_RATE
## 1 9 Bachelor Medium Very High High Excellent
## 2 9 Bachelor Medium High High Excellent
## 3 7 Bachelor Low High High Outstanding
## 4 10 Bachelor High High Very High Excellent
## 5 9 Bachelor High Medium High Excellent
## 6 5 Master Low Medium Low Outstanding
## REL_SAT WLB AGE_BINS DAILYRATE_BINS COMMUTE_BINS HOURLYRATE_BINS
## 1 Very High Good 35-40 600-800 25-30 40-50
## 2 Very High Better 30-35 Less Than 200 0-5 90 and up
## 3 High Good 25-30 1200-1400 20-25 30-40
## 4 High Better 55-60 1200-1400 0-5 90 and up
## 5 High Better 25-30 1200-1400 15-20 70-80
## 6 High Good 50-55 1400 and up 0-5 30-40
## MONTHLYINCOME_BINS MONTHLYRATE_BINS PERCENTSALARYHIKE_BINS
## 1 3000-4000 25000-27500 15-20%
## 2 3000-4000 7500-10000 10-15%
## 3 2000-3000 17500-20000 20-25%
## 4 16000-17000 22500-25000 10-15%
## 5 8000-9000 22500-25000 10-15%
## 6 7000-8000 25000-27500 20-25%
## TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS Predicted_Attrition
## 1 10-15 10-15 0
## 2 5-10 5-10 0
## 3 Less Than 5 Less Than 5 0
## 4 30-35 5-10 0
## 5 10-15 10-15 0
## 6 20-25 10-15 0
write.csv(new_data, "C:/Users/gerde/OneDrive/Desktop/new_data_with_predictions.csv", row.names = FALSE)
# Load necessary libraries
library(caret)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071) # For the Naive Bayes model
# Load and summarize data
data <- read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_Edit1.csv")
summary(data)
## ID AGE ATTRITION BUSINESSTRAVEL
## Min. : 1.0 Min. :18.00 Length:870 Length:870
## 1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
## Median :435.5 Median :35.00 Mode :character Mode :character
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
## DAILYRATE DEPARTMENT DISTANCEFROMHOME EDUCATION
## Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Mode :character Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
## EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER ENVIRONMENTSATISFACTION
## Length:870 Min. :1 Min. : 1.0 Min. :1.000
## Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Mode :character Median :1 Median :1039.0 Median :3.000
## Mean :1 Mean :1029.8 Mean :2.701
## 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Max. :1 Max. :2064.0 Max. :4.000
## GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
## Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Mode :character Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
## JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## Length:870 Min. :1.000 Length:870 Min. : 1081
## Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
## Mode :character Median :3.000 Mode :character Median : 4946
## Mean :2.709 Mean : 6390
## 3rd Qu.:4.000 3rd Qu.: 8182
## Max. :4.000 Max. :19999
## MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME
## Min. : 2094 Min. :0.000 Length:870 Length:870
## 1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
## Median :14074 Median :2.000 Mode :character Mode :character
## Mean :14326 Mean :2.728
## 3rd Qu.:20456 3rd Qu.:4.000
## Max. :26997 Max. :9.000
## PERCENTSALARYHIKE PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS
## Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
## 1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
## Median :14.0 Median :3.000 Median :3.000 Median :80
## Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
## 3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
## Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
## STOCKOPTIONLEVEL TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE
## Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
## Median :1.0000 Median :10.00 Median :3.000 Median :3.000
## Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
## 3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
## YEARSATCOMPANY YEARSINCURRENTROLE YEARSSINCELASTPROMOTION
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 5.000 Median : 3.000 Median : 1.000
## Mean : 6.962 Mean : 4.205 Mean : 2.169
## 3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
## Max. :40.000 Max. :18.000 Max. :15.000
## YEARSWITHCURRMANAGER ATTRITION_BINARY OVERALLSATISFACTION EDU
## Min. : 0.00 Min. :0.0000 Min. : 3.000 Length:870
## 1st Qu.: 2.00 1st Qu.:0.0000 1st Qu.: 7.000 Class :character
## Median : 3.00 Median :0.0000 Median : 8.000 Mode :character
## Mean : 4.14 Mean :0.1609 Mean : 8.117
## 3rd Qu.: 7.00 3rd Qu.:0.0000 3rd Qu.: 9.000
## Max. :17.00 Max. :1.0000 Max. :12.000
## ENV_SAT JOB_INV JOB_SAT PERF_RATE
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## REL_SAT WLB AGE_BINS DAILYRATE_BINS
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## COMMUTE_BINS HOURLYRATE_BINS MONTHLYINCOME_BINS MONTHLYRATE_BINS
## Length:870 Length:870 Length:870 Length:870
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
## Length:870 Length:870 Length:870
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Preprocess the data: Define categorical columns and target
cat_cols <- c("DEPARTMENT", "GENDER", "JOBLEVEL", "JOBROLE", "MARITALSTATUS",
"OVERTIME", "ENV_SAT", "JOB_INV", "WLB", "AGE_BINS",
"COMMUTE_BINS", "PERCENTSALARYHIKE_BINS", "TOTALWORKINGYEARS_BINS")
target <- "ATTRITION_BINARY"
categorical_data <- data[, c(cat_cols, target)]
# Split data into training and testing sets
set.seed(44) # chosen because it's the user's age
trainIndex <- createDataPartition(categorical_data$ATTRITION_BINARY, p = 0.85, list = FALSE)
train_cat <- categorical_data[trainIndex, ]
test_cat <- categorical_data[-trainIndex, ]
# Ensure both the predicted and actual columns are factors with the same levels
train_cat$ATTRITION_BINARY <- factor(train_cat$ATTRITION_BINARY, levels = c(0, 1))
test_cat$ATTRITION_BINARY <- factor(test_cat$ATTRITION_BINARY, levels = c(0, 1))
# Train the Naive Bayes model on categorical data
nb_model <- naiveBayes(ATTRITION_BINARY ~ ., data = train_cat)
nb_predictions <- predict(nb_model, test_cat)
nb_predictions <- factor(nb_predictions, levels = c(0, 1)) # Set levels for predictions
# Generate the confusion matrix and convert it to a data frame for plotting
conf_matrix <- confusionMatrix(nb_predictions, test_cat$ATTRITION_BINARY)
conf_df <- as.data.frame(conf_matrix$table)
# Rename columns for readability in the plot
conf_df <- conf_df %>%
rename(Predicted = Prediction, Actual = Reference) %>%
mutate(
Prediction_Label = ifelse(Predicted == 1, "Predicted Attrition", "Predicted No Attrition"),
Reference_Label = ifelse(Actual == 1, "Actual Attrition", "Actual No Attrition")
)
# Plotting function for the confusion matrix
plot_confusion_matrix <- function(conf_df, title) {
ggplot(conf_df, aes(x = Reference_Label, y = Prediction_Label, fill = Freq)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
geom_text(aes(label = Freq), color = "white", size = 5) +
theme_minimal() +
labs(title = title, x = "Actual", y = "Predicted") +
theme(plot.title = element_text(hjust = 0.5))
}
# Display the confusion matrix plot
plot_confusion_matrix(conf_df, "Confusion Matrix - Full Dataset")
SELECT *, CASE WHEN ATTRITION = ‘Yes’ THEN 1 ELSE 0 END AS ATTRITION_BINARY, ENVIRONMENTSATISFACTION + JOBSATISFACTION + RELATIONSHIPSATISFACTION AS OVERALLSATISFACTION, CASE WHEN EDUCATION = 1 THEN ‘Below College’ WHEN EDUCATION = 2 THEN ‘Some College’ WHEN EDUCATION = 3 THEN ‘Bachelor’ WHEN EDUCATION = 4 THEN ‘Master’ WHEN EDUCATION = 5 THEN ‘Doctor’ ELSE ‘N/A’ END AS EDU, CASE WHEN ENVIRONMENTSATISFACTION = 1 THEN ‘Low’ WHEN ENVIRONMENTSATISFACTION = 2 THEN ‘Medium’ WHEN ENVIRONMENTSATISFACTION = 3 THEN ‘High’ WHEN ENVIRONMENTSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS ENV_SAT, CASE WHEN JOBINVOLVEMENT = 1 THEN ‘Low’ WHEN JOBINVOLVEMENT = 2 THEN ‘Medium’ WHEN JOBINVOLVEMENT = 3 THEN ‘High’ WHEN JOBINVOLVEMENT = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS JOB_INV, CASE WHEN JOBSATISFACTION = 1 THEN ‘Low’ WHEN JOBSATISFACTION = 2 THEN ‘Medium’ WHEN JOBSATISFACTION = 3 THEN ‘High’ WHEN JOBSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS JOB_SAT, CASE WHEN PERFORMANCERATING = 1 THEN ‘Low’ WHEN PERFORMANCERATING = 2 THEN ‘Good’ WHEN PERFORMANCERATING = 3 THEN ‘Excellent’ WHEN PERFORMANCERATING = 4 THEN ‘Outstanding’ ELSE ‘N/A’ END AS PERF_RATE, CASE WHEN RELATIONSHIPSATISFACTION = 1 THEN ‘Low’ WHEN RELATIONSHIPSATISFACTION = 2 THEN ‘Medium’ WHEN RELATIONSHIPSATISFACTION = 3 THEN ‘High’ WHEN RELATIONSHIPSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS REL_SAT, CASE WHEN WORKLIFEBALANCE = 1 THEN ‘Bad’ WHEN WORKLIFEBALANCE = 2 THEN ‘Good’ WHEN WORKLIFEBALANCE = 3 THEN ‘Better’ WHEN WORKLIFEBALANCE = 4 THEN ‘Best’ ELSE ‘N/A’ END AS WLB, CASE WHEN AGE < 20 THEN ‘Less Than 20’ WHEN AGE >= 20 AND AGE < 25 THEN ‘20-25’ WHEN AGE >= 25 AND AGE < 30 THEN ‘25-30’ WHEN AGE >= 30 AND AGE < 35 THEN ‘30-35’ WHEN AGE >= 35 AND AGE < 40 THEN ‘35-40’ WHEN AGE >= 40 AND AGE < 45 THEN ‘40-45’ WHEN AGE >= 45 AND AGE < 50 THEN ‘45-50’ WHEN AGE >= 50 AND AGE < 55 THEN ‘50-55’ WHEN AGE >= 55 AND AGE < 60 THEN ‘55-60’ WHEN AGE >= 60 THEN ‘60 And Up’ END AS AGE_BINS, CASE WHEN DAILYRATE <= 200 THEN ’ Less Than 200’ WHEN DAILYRATE >= 200 AND DAILYRATE <= 400 THEN ‘200-400’ WHEN DAILYRATE > 400 AND DAILYRATE <=600 THEN ‘400-600’ WHEN DAILYRATE > 600 AND DAILYRATE <=800 THEN ‘600-800’ WHEN DAILYRATE > 800 AND DAILYRATE <=1000 THEN ‘800-1000’ WHEN DAILYRATE > 1000 AND DAILYRATE <=1200 THEN ‘1000-1200’ WHEN DAILYRATE > 1200 AND DAILYRATE <=1400 THEN ‘1200-1400’ WHEN DAILYRATE > 1400 THEN ‘1400 and up’ END AS DAILYRATE_BINS, CASE WHEN DISTANCEFROMHOME <= 5 THEN ‘0-5’ WHEN DISTANCEFROMHOME > 5 AND DISTANCEFROMHOME <= 10 THEN ‘5-10’ WHEN DISTANCEFROMHOME > 10 AND DISTANCEFROMHOME <= 15 THEN ‘10-15’ WHEN DISTANCEFROMHOME > 15 AND DISTANCEFROMHOME <= 20 THEN ‘15-20’ WHEN DISTANCEFROMHOME > 20 AND DISTANCEFROMHOME <= 25 THEN ‘20-25’ WHEN DISTANCEFROMHOME > 25 AND DISTANCEFROMHOME <= 30 THEN ‘25-30’ WHEN DISTANCEFROMHOME > 30 THEN ‘30+’ END AS COMMUTE_BINS, CASE WHEN HOURLYRATE < 30 THEN ‘Less Than 30’ WHEN HOURLYRATE >= 30 AND HOURLYRATE < 40 THEN ‘30-40’ WHEN HOURLYRATE >= 40 AND HOURLYRATE < 50 THEN ‘40-50’ WHEN HOURLYRATE >= 50 AND HOURLYRATE < 60 THEN ‘50-60’ WHEN HOURLYRATE >= 60 AND HOURLYRATE < 70 THEN ‘60-70’ WHEN HOURLYRATE >= 70 AND HOURLYRATE < 80 THEN ‘70-80’ WHEN HOURLYRATE >= 80 AND HOURLYRATE < 90 THEN ‘80-90’ WHEN HOURLYRATE >= 90 THEN ‘90 and up’ END AS HOURLYRATE_BINS, CASE WHEN MONTHLYINCOME < 1000 THEN ‘Less Than 2500’ WHEN MONTHLYINCOME >= 1000 AND MONTHLYINCOME < 2000 THEN ‘1000-2000’ WHEN MONTHLYINCOME >= 2000 AND MONTHLYINCOME < 3000 THEN ‘2000-3000’ WHEN MONTHLYINCOME >= 3000 AND MONTHLYINCOME < 4000 THEN ‘3000-4000’ WHEN MONTHLYINCOME >= 4000 AND MONTHLYINCOME < 5000 THEN ‘4000-5000’ WHEN MONTHLYINCOME >= 5000 AND MONTHLYINCOME < 6000 THEN ‘5000-6000’ WHEN MONTHLYINCOME >= 6000 AND MONTHLYINCOME < 7000 THEN ‘6000-7000’ WHEN MONTHLYINCOME >= 7000 AND MONTHLYINCOME < 8000 THEN ‘7000-8000’ WHEN MONTHLYINCOME >= 8000 AND MONTHLYINCOME < 9000 THEN ‘8000-9000’ WHEN MONTHLYINCOME >= 9000 AND MONTHLYINCOME < 10000 THEN ‘9000-10000’ WHEN MONTHLYINCOME >= 10000 AND MONTHLYINCOME < 11000 THEN ‘10000-11000’ WHEN MONTHLYINCOME >= 11000 AND MONTHLYINCOME < 12000 THEN ‘11000-12000’ WHEN MONTHLYINCOME >= 12000 AND MONTHLYINCOME < 13000 THEN ‘12000-13000’ WHEN MONTHLYINCOME >= 13000 AND MONTHLYINCOME < 14000 THEN ‘13000-14000’ WHEN MONTHLYINCOME >= 14000 AND MONTHLYINCOME < 15000 THEN ‘14000-15000’ WHEN MONTHLYINCOME >= 15000 AND MONTHLYINCOME < 16000 THEN ‘15000-16000’ WHEN MONTHLYINCOME >= 16000 AND MONTHLYINCOME < 17000 THEN ‘16000-17000’ WHEN MONTHLYINCOME >= 17000 AND MONTHLYINCOME < 18000 THEN ‘17000-18000’ WHEN MONTHLYINCOME >= 18000 AND MONTHLYINCOME < 19000 THEN ‘18000-19000’ WHEN MONTHLYINCOME >= 19000 AND MONTHLYINCOME < 20000 THEN ‘19000-20000’ WHEN MONTHLYINCOME >= 20000 THEN ‘Greater Than 20000’ END AS MONTHLYINCOME_BINS, CASE WHEN MONTHLYRATE < 2500 THEN ’ Less Than 2500’ WHEN MONTHLYRATE >= 2500 AND MONTHLYRATE < 5000 THEN ‘2500-5000’ WHEN MONTHLYRATE >= 5000 AND MONTHLYRATE < 7500 THEN ‘5000-7500’ WHEN MONTHLYRATE >= 7500 AND MONTHLYRATE < 10000 THEN ‘7500-10000’ WHEN MONTHLYRATE >= 10000 AND MONTHLYRATE < 12500 THEN ‘10000-12500’ WHEN MONTHLYRATE >= 12500 AND MONTHLYRATE < 15000 THEN ‘12500-15000’ WHEN MONTHLYRATE >= 15000 AND MONTHLYRATE < 17500 THEN ‘15000-17500’ WHEN MONTHLYRATE >= 17500 AND MONTHLYRATE < 20000 THEN ‘17500-20000’ WHEN MONTHLYRATE >= 20000 AND MONTHLYRATE < 22500 THEN ‘20000-22500’ WHEN MONTHLYRATE >= 22500 AND MONTHLYRATE < 25000 THEN ‘22500-25000’ WHEN MONTHLYRATE >= 25000 AND MONTHLYRATE < 27500 THEN ‘25000-27500’ WHEN MONTHLYRATE >= 27500 THEN ’ Greater Than 27500’ END AS MONTHLYRATE_BINS, CASE WHEN PERCENTSALARYHIKE < 5 THEN ‘Less Than 5%’ WHEN PERCENTSALARYHIKE >= 5 AND PERCENTSALARYHIKE < 10 THEN ‘5-10%’ WHEN PERCENTSALARYHIKE >= 10 AND PERCENTSALARYHIKE < 15 THEN ‘10-15%’ WHEN PERCENTSALARYHIKE >= 15 AND PERCENTSALARYHIKE < 20 THEN ‘15-20%’ WHEN PERCENTSALARYHIKE >= 20 AND PERCENTSALARYHIKE < 25 THEN ‘20-25%’ WHEN PERCENTSALARYHIKE >= 25 THEN ‘25% and up’ END AS PERCENTSALARYHIKE_BINS, CASE WHEN TOTALWORKINGYEARS < 5 THEN ‘Less Than 5’ WHEN TOTALWORKINGYEARS >= 5 AND TOTALWORKINGYEARS < 10 THEN ‘5-10’ WHEN TOTALWORKINGYEARS >= 10 AND TOTALWORKINGYEARS < 15 THEN ‘10-15’ WHEN TOTALWORKINGYEARS >= 15 AND TOTALWORKINGYEARS < 20 THEN ‘15-20’ WHEN TOTALWORKINGYEARS >= 20 AND TOTALWORKINGYEARS < 25 THEN ‘20-25’ WHEN TOTALWORKINGYEARS >= 25 AND TOTALWORKINGYEARS < 30 THEN ‘25-30’ WHEN TOTALWORKINGYEARS >= 30 AND TOTALWORKINGYEARS < 35 THEN ‘30-35’ WHEN TOTALWORKINGYEARS >= 35 AND TOTALWORKINGYEARS < 40 THEN ‘35-40’ WHEN TOTALWORKINGYEARS >= 40 THEN ‘40 and up’ END AS TOTALWORKINGYEARS_BINS, CASE WHEN YEARSATCOMPANY < 5 THEN ‘Less Than 5’ WHEN YEARSATCOMPANY >= 5 AND YEARSATCOMPANY < 10 THEN ‘5-10’ WHEN YEARSATCOMPANY >= 10 AND YEARSATCOMPANY < 15 THEN ‘10-15’ WHEN YEARSATCOMPANY >= 15 AND YEARSATCOMPANY < 20 THEN ‘15-20’ WHEN YEARSATCOMPANY >= 20 AND YEARSATCOMPANY < 25 THEN ‘20-25’ WHEN YEARSATCOMPANY >= 25 AND YEARSATCOMPANY < 30 THEN ‘25-30’ WHEN YEARSATCOMPANY >= 30 AND YEARSATCOMPANY < 35 THEN ‘30-35’ WHEN YEARSATCOMPANY >= 35 AND YEARSATCOMPANY < 40 THEN ‘35-40’ WHEN YEARSATCOMPANY >= 40 THEN ‘40 and up’ END AS YEARSATCOMPANY_BINS FROM PROD_MKTBCI.MKTBUSDB.GC0965_MSDSCASESTUDY_TEST