Frito-Lay Attrition Model

Load necessary libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(class)
library(e1071)
Load the data
data = read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_Edit1.csv")
summary(data)
##        ID             AGE         ATTRITION         BUSINESSTRAVEL    
##  Min.   :  1.0   Min.   :18.00   Length:870         Length:870        
##  1st Qu.:218.2   1st Qu.:30.00   Class :character   Class :character  
##  Median :435.5   Median :35.00   Mode  :character   Mode  :character  
##  Mean   :435.5   Mean   :36.83                                        
##  3rd Qu.:652.8   3rd Qu.:43.00                                        
##  Max.   :870.0   Max.   :60.00                                        
##    DAILYRATE       DEPARTMENT        DISTANCEFROMHOME   EDUCATION    
##  Min.   : 103.0   Length:870         Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Class :character   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Mode  :character   Median : 7.000   Median :3.000  
##  Mean   : 815.2                      Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                      3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                      Max.   :29.000   Max.   :5.000  
##  EDUCATIONFIELD     EMPLOYEECOUNT EMPLOYEENUMBER   ENVIRONMENTSATISFACTION
##  Length:870         Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Class :character   1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Mode  :character   Median :1     Median :1039.0   Median :3.000          
##                     Mean   :1     Mean   :1029.8   Mean   :2.701          
##                     3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##                     Max.   :1     Max.   :2064.0   Max.   :4.000          
##     GENDER            HOURLYRATE     JOBINVOLVEMENT     JOBLEVEL    
##  Length:870         Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Class :character   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##  Mode  :character   Median : 66.00   Median :3.000   Median :2.000  
##                     Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##                     3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##                     Max.   :100.00   Max.   :4.000   Max.   :5.000  
##    JOBROLE          JOBSATISFACTION MARITALSTATUS      MONTHLYINCOME  
##  Length:870         Min.   :1.000   Length:870         Min.   : 1081  
##  Class :character   1st Qu.:2.000   Class :character   1st Qu.: 2840  
##  Mode  :character   Median :3.000   Mode  :character   Median : 4946  
##                     Mean   :2.709                      Mean   : 6390  
##                     3rd Qu.:4.000                      3rd Qu.: 8182  
##                     Max.   :4.000                      Max.   :19999  
##   MONTHLYRATE    NUMCOMPANIESWORKED    OVER18            OVERTIME        
##  Min.   : 2094   Min.   :0.000      Length:870         Length:870        
##  1st Qu.: 8092   1st Qu.:1.000      Class :character   Class :character  
##  Median :14074   Median :2.000      Mode  :character   Mode  :character  
##  Mean   :14326   Mean   :2.728                                           
##  3rd Qu.:20456   3rd Qu.:4.000                                           
##  Max.   :26997   Max.   :9.000                                           
##  PERCENTSALARYHIKE PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS
##  Min.   :11.0      Min.   :3.000     Min.   :1.000            Min.   :80   
##  1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80   
##  Median :14.0      Median :3.000     Median :3.000            Median :80   
##  Mean   :15.2      Mean   :3.152     Mean   :2.707            Mean   :80   
##  3rd Qu.:18.0      3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80   
##  Max.   :25.0      Max.   :4.000     Max.   :4.000            Max.   :80   
##  STOCKOPTIONLEVEL TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE
##  Min.   :0.0000   Min.   : 0.00     Min.   :0.000         Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000  
##  Median :1.0000   Median :10.00     Median :3.000         Median :3.000  
##  Mean   :0.7839   Mean   :11.05     Mean   :2.832         Mean   :2.782  
##  3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000  
##  Max.   :3.0000   Max.   :40.00     Max.   :6.000         Max.   :4.000  
##  YEARSATCOMPANY   YEARSINCURRENTROLE YEARSSINCELASTPROMOTION
##  Min.   : 0.000   Min.   : 0.000     Min.   : 0.000         
##  1st Qu.: 3.000   1st Qu.: 2.000     1st Qu.: 0.000         
##  Median : 5.000   Median : 3.000     Median : 1.000         
##  Mean   : 6.962   Mean   : 4.205     Mean   : 2.169         
##  3rd Qu.:10.000   3rd Qu.: 7.000     3rd Qu.: 3.000         
##  Max.   :40.000   Max.   :18.000     Max.   :15.000         
##  YEARSWITHCURRMANAGER ATTRITION_BINARY OVERALLSATISFACTION     EDU           
##  Min.   : 0.00        Min.   :0.0000   Min.   : 3.000      Length:870        
##  1st Qu.: 2.00        1st Qu.:0.0000   1st Qu.: 7.000      Class :character  
##  Median : 3.00        Median :0.0000   Median : 8.000      Mode  :character  
##  Mean   : 4.14        Mean   :0.1609   Mean   : 8.117                        
##  3rd Qu.: 7.00        3rd Qu.:0.0000   3rd Qu.: 9.000                        
##  Max.   :17.00        Max.   :1.0000   Max.   :12.000                        
##    ENV_SAT            JOB_INV            JOB_SAT           PERF_RATE        
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    REL_SAT              WLB              AGE_BINS         DAILYRATE_BINS    
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  COMMUTE_BINS       HOURLYRATE_BINS    MONTHLYINCOME_BINS MONTHLYRATE_BINS  
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
##  Length:870             Length:870             Length:870         
##  Class :character       Class :character       Class :character   
##  Mode  :character       Mode  :character       Mode  :character   
##                                                                   
##                                                                   
## 
Preprocess the data
Split data into categorical (for Naive Bayes) with the binary attrition column as the target
cat_cols = c("DEPARTMENT","GENDER","JOBLEVEL","JOBROLE","MARITALSTATUS","OVERTIME","ENV_SAT","JOB_INV","WLB","AGE_BINS","COMMUTE_BINS","PERCENTSALARYHIKE_BINS","TOTALWORKINGYEARS_BINS")
target = "ATTRITION_BINARY"
categorical_data = data[, c(cat_cols, target)]
Step 3: Split data into training and testing sets
set.seed(44) # chosen because its my age
trainIndex = createDataPartition(data$ATTRITION_BINARY, p = 0.85, list = FALSE)
train_cat = categorical_data[trainIndex, ]
test_cat = categorical_data[-trainIndex, ]
Step 4: Train the Naive Bayes model on categorical data
nb_model = naiveBayes(ATTRITION_BINARY ~ ., data = train_cat)
nb_predictions = predict(nb_model, test_cat)

actual_values = factor(test_cat$ATTRITION_BINARY, levels = c(0, 1))  # Ensure same levels as predictions
Confusion Matrix
confusionMatrix(nb_predictions, actual_values)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 104   7
##          1   5  14
##                                           
##                Accuracy : 0.9077          
##                  95% CI : (0.8443, 0.9514)
##     No Information Rate : 0.8385          
##     P-Value [Acc > NIR] : 0.01651         
##                                           
##                   Kappa : 0.6456          
##                                           
##  Mcnemar's Test P-Value : 0.77283         
##                                           
##             Sensitivity : 0.9541          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.9369          
##          Neg Pred Value : 0.7368          
##              Prevalence : 0.8385          
##          Detection Rate : 0.8000          
##    Detection Prevalence : 0.8538          
##       Balanced Accuracy : 0.8104          
##                                           
##        'Positive' Class : 0               
## 
Bring in the no attrition value data set
new_data = read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_TEST.csv")
head(new_data)
##     ID AGE    BUSINESSTRAVEL DAILYRATE             DEPARTMENT DISTANCEFROMHOME
## 1 1171  35     Travel_Rarely       750 Research & Development               28
## 2 1172  33     Travel_Rarely       147        Human Resources                2
## 3 1173  26     Travel_Rarely      1330 Research & Development               21
## 4 1174  55     Travel_Rarely      1311 Research & Development                2
## 5 1175  29     Travel_Rarely      1246                  Sales               19
## 6 1176  51 Travel_Frequently      1456 Research & Development                1
##   EDUCATION  EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER
## 1         3   Life Sciences             1           1596
## 2         3 Human Resources             1           1207
## 3         3         Medical             1           1107
## 4         3   Life Sciences             1            505
## 5         3   Life Sciences             1           1497
## 6         4         Medical             1            145
##   ENVIRONMENTSATISFACTION GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## 1                       2   Male         46              4        2
## 2                       2   Male         99              3        1
## 3                       1   Male         37              3        1
## 4                       3 Female         97              3        4
## 5                       3   Male         77              2        2
## 6                       1 Female         30              2        3
##                     JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## 1     Laboratory Technician               3       Married          3407
## 2           Human Resources               3       Married          3600
## 3     Laboratory Technician               3      Divorced          2377
## 4                   Manager               4        Single         16659
## 5           Sales Executive               3      Divorced          8620
## 6 Healthcare Representative               1        Single          7484
##   MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME PERCENTSALARYHIKE
## 1       25348                  1      Y       No                17
## 2        8429                  1      Y       No                13
## 3       19373                  1      Y       No                20
## 4       23258                  2      Y      Yes                13
## 5       23757                  1      Y       No                14
## 6       25796                  3      Y       No                20
##   PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS STOCKOPTIONLEVEL
## 1                 3                        4            80                2
## 2                 3                        4            80                1
## 3                 4                        3            80                1
## 4                 3                        3            80                0
## 5                 3                        3            80                2
## 6                 4                        3            80                0
##   TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE YEARSATCOMPANY
## 1                10                     3               2             10
## 2                 5                     2               3              5
## 3                 1                     0               2              1
## 4                30                     2               3              5
## 5                10                     3               3             10
## 6                23                     1               2             13
##   YEARSINCURRENTROLE YEARSSINCELASTPROMOTION YEARSWITHCURRMANAGER
## 1                  9                       6                    8
## 2                  4                       1                    4
## 3                  1                       0                    0
## 4                  4                       1                    2
## 5                  7                       0                    4
## 6                 12                      12                    8
##   OVERALLSATISFACTION      EDU ENV_SAT   JOB_INV   JOB_SAT   PERF_RATE
## 1                   9 Bachelor  Medium Very High      High   Excellent
## 2                   9 Bachelor  Medium      High      High   Excellent
## 3                   7 Bachelor     Low      High      High Outstanding
## 4                  10 Bachelor    High      High Very High   Excellent
## 5                   9 Bachelor    High    Medium      High   Excellent
## 6                   5   Master     Low    Medium       Low Outstanding
##     REL_SAT    WLB AGE_BINS DAILYRATE_BINS COMMUTE_BINS HOURLYRATE_BINS
## 1 Very High   Good    35-40        600-800        25-30           40-50
## 2 Very High Better    30-35  Less Than 200          0-5       90 and up
## 3      High   Good    25-30      1200-1400        20-25           30-40
## 4      High Better    55-60      1200-1400          0-5       90 and up
## 5      High Better    25-30      1200-1400        15-20           70-80
## 6      High   Good    50-55    1400 and up          0-5           30-40
##   MONTHLYINCOME_BINS MONTHLYRATE_BINS PERCENTSALARYHIKE_BINS
## 1          3000-4000      25000-27500                 15-20%
## 2          3000-4000       7500-10000                 10-15%
## 3          2000-3000      17500-20000                 20-25%
## 4        16000-17000      22500-25000                 10-15%
## 5          8000-9000      22500-25000                 10-15%
## 6          7000-8000      25000-27500                 20-25%
##   TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
## 1                  10-15               10-15
## 2                   5-10                5-10
## 3            Less Than 5         Less Than 5
## 4                  30-35                5-10
## 5                  10-15               10-15
## 6                  20-25               10-15
Split out the categorical data for the nb_model
cat_new_data = new_data[, c(cat_cols)]
head(cat_new_data)
##               DEPARTMENT GENDER JOBLEVEL                   JOBROLE
## 1 Research & Development   Male        2     Laboratory Technician
## 2        Human Resources   Male        1           Human Resources
## 3 Research & Development   Male        1     Laboratory Technician
## 4 Research & Development Female        4                   Manager
## 5                  Sales   Male        2           Sales Executive
## 6 Research & Development Female        3 Healthcare Representative
##   MARITALSTATUS OVERTIME ENV_SAT   JOB_INV    WLB AGE_BINS COMMUTE_BINS
## 1       Married       No  Medium Very High   Good    35-40        25-30
## 2       Married       No  Medium      High Better    30-35          0-5
## 3      Divorced       No     Low      High   Good    25-30        20-25
## 4        Single      Yes    High      High Better    55-60          0-5
## 5      Divorced       No    High    Medium Better    25-30        15-20
## 6        Single       No     Low    Medium   Good    50-55          0-5
##   PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS
## 1                 15-20%                  10-15
## 2                 10-15%                   5-10
## 3                 20-25%            Less Than 5
## 4                 10-15%                  30-35
## 5                 10-15%                  10-15
## 6                 20-25%                  20-25
Run the nb_model on the categorized test data set
final_predictions = predict(nb_model, cat_new_data)
Add the predictions as a new column in the no attrition column data set
new_data$Predicted_Attrition = final_predictions
head(new_data)
##     ID AGE    BUSINESSTRAVEL DAILYRATE             DEPARTMENT DISTANCEFROMHOME
## 1 1171  35     Travel_Rarely       750 Research & Development               28
## 2 1172  33     Travel_Rarely       147        Human Resources                2
## 3 1173  26     Travel_Rarely      1330 Research & Development               21
## 4 1174  55     Travel_Rarely      1311 Research & Development                2
## 5 1175  29     Travel_Rarely      1246                  Sales               19
## 6 1176  51 Travel_Frequently      1456 Research & Development                1
##   EDUCATION  EDUCATIONFIELD EMPLOYEECOUNT EMPLOYEENUMBER
## 1         3   Life Sciences             1           1596
## 2         3 Human Resources             1           1207
## 3         3         Medical             1           1107
## 4         3   Life Sciences             1            505
## 5         3   Life Sciences             1           1497
## 6         4         Medical             1            145
##   ENVIRONMENTSATISFACTION GENDER HOURLYRATE JOBINVOLVEMENT JOBLEVEL
## 1                       2   Male         46              4        2
## 2                       2   Male         99              3        1
## 3                       1   Male         37              3        1
## 4                       3 Female         97              3        4
## 5                       3   Male         77              2        2
## 6                       1 Female         30              2        3
##                     JOBROLE JOBSATISFACTION MARITALSTATUS MONTHLYINCOME
## 1     Laboratory Technician               3       Married          3407
## 2           Human Resources               3       Married          3600
## 3     Laboratory Technician               3      Divorced          2377
## 4                   Manager               4        Single         16659
## 5           Sales Executive               3      Divorced          8620
## 6 Healthcare Representative               1        Single          7484
##   MONTHLYRATE NUMCOMPANIESWORKED OVER18 OVERTIME PERCENTSALARYHIKE
## 1       25348                  1      Y       No                17
## 2        8429                  1      Y       No                13
## 3       19373                  1      Y       No                20
## 4       23258                  2      Y      Yes                13
## 5       23757                  1      Y       No                14
## 6       25796                  3      Y       No                20
##   PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS STOCKOPTIONLEVEL
## 1                 3                        4            80                2
## 2                 3                        4            80                1
## 3                 4                        3            80                1
## 4                 3                        3            80                0
## 5                 3                        3            80                2
## 6                 4                        3            80                0
##   TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE YEARSATCOMPANY
## 1                10                     3               2             10
## 2                 5                     2               3              5
## 3                 1                     0               2              1
## 4                30                     2               3              5
## 5                10                     3               3             10
## 6                23                     1               2             13
##   YEARSINCURRENTROLE YEARSSINCELASTPROMOTION YEARSWITHCURRMANAGER
## 1                  9                       6                    8
## 2                  4                       1                    4
## 3                  1                       0                    0
## 4                  4                       1                    2
## 5                  7                       0                    4
## 6                 12                      12                    8
##   OVERALLSATISFACTION      EDU ENV_SAT   JOB_INV   JOB_SAT   PERF_RATE
## 1                   9 Bachelor  Medium Very High      High   Excellent
## 2                   9 Bachelor  Medium      High      High   Excellent
## 3                   7 Bachelor     Low      High      High Outstanding
## 4                  10 Bachelor    High      High Very High   Excellent
## 5                   9 Bachelor    High    Medium      High   Excellent
## 6                   5   Master     Low    Medium       Low Outstanding
##     REL_SAT    WLB AGE_BINS DAILYRATE_BINS COMMUTE_BINS HOURLYRATE_BINS
## 1 Very High   Good    35-40        600-800        25-30           40-50
## 2 Very High Better    30-35  Less Than 200          0-5       90 and up
## 3      High   Good    25-30      1200-1400        20-25           30-40
## 4      High Better    55-60      1200-1400          0-5       90 and up
## 5      High Better    25-30      1200-1400        15-20           70-80
## 6      High   Good    50-55    1400 and up          0-5           30-40
##   MONTHLYINCOME_BINS MONTHLYRATE_BINS PERCENTSALARYHIKE_BINS
## 1          3000-4000      25000-27500                 15-20%
## 2          3000-4000       7500-10000                 10-15%
## 3          2000-3000      17500-20000                 20-25%
## 4        16000-17000      22500-25000                 10-15%
## 5          8000-9000      22500-25000                 10-15%
## 6          7000-8000      25000-27500                 20-25%
##   TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS Predicted_Attrition
## 1                  10-15               10-15                   0
## 2                   5-10                5-10                   0
## 3            Less Than 5         Less Than 5                   0
## 4                  30-35                5-10                   0
## 5                  10-15               10-15                   0
## 6                  20-25               10-15                   0
Write the updated data frame to a new CSV file
write.csv(new_data, "C:/Users/gerde/OneDrive/Desktop/new_data_with_predictions.csv", row.names = FALSE)
Plot Confusion Matrix
# Load necessary libraries
library(caret)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(e1071)  # For the Naive Bayes model

# Load and summarize data
data <- read.csv("C:/Users/gerde/OneDrive/Desktop/MSDSCaseStudy_Edit1.csv")
summary(data)
##        ID             AGE         ATTRITION         BUSINESSTRAVEL    
##  Min.   :  1.0   Min.   :18.00   Length:870         Length:870        
##  1st Qu.:218.2   1st Qu.:30.00   Class :character   Class :character  
##  Median :435.5   Median :35.00   Mode  :character   Mode  :character  
##  Mean   :435.5   Mean   :36.83                                        
##  3rd Qu.:652.8   3rd Qu.:43.00                                        
##  Max.   :870.0   Max.   :60.00                                        
##    DAILYRATE       DEPARTMENT        DISTANCEFROMHOME   EDUCATION    
##  Min.   : 103.0   Length:870         Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Class :character   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Mode  :character   Median : 7.000   Median :3.000  
##  Mean   : 815.2                      Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                      3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                      Max.   :29.000   Max.   :5.000  
##  EDUCATIONFIELD     EMPLOYEECOUNT EMPLOYEENUMBER   ENVIRONMENTSATISFACTION
##  Length:870         Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Class :character   1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Mode  :character   Median :1     Median :1039.0   Median :3.000          
##                     Mean   :1     Mean   :1029.8   Mean   :2.701          
##                     3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##                     Max.   :1     Max.   :2064.0   Max.   :4.000          
##     GENDER            HOURLYRATE     JOBINVOLVEMENT     JOBLEVEL    
##  Length:870         Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Class :character   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##  Mode  :character   Median : 66.00   Median :3.000   Median :2.000  
##                     Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##                     3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##                     Max.   :100.00   Max.   :4.000   Max.   :5.000  
##    JOBROLE          JOBSATISFACTION MARITALSTATUS      MONTHLYINCOME  
##  Length:870         Min.   :1.000   Length:870         Min.   : 1081  
##  Class :character   1st Qu.:2.000   Class :character   1st Qu.: 2840  
##  Mode  :character   Median :3.000   Mode  :character   Median : 4946  
##                     Mean   :2.709                      Mean   : 6390  
##                     3rd Qu.:4.000                      3rd Qu.: 8182  
##                     Max.   :4.000                      Max.   :19999  
##   MONTHLYRATE    NUMCOMPANIESWORKED    OVER18            OVERTIME        
##  Min.   : 2094   Min.   :0.000      Length:870         Length:870        
##  1st Qu.: 8092   1st Qu.:1.000      Class :character   Class :character  
##  Median :14074   Median :2.000      Mode  :character   Mode  :character  
##  Mean   :14326   Mean   :2.728                                           
##  3rd Qu.:20456   3rd Qu.:4.000                                           
##  Max.   :26997   Max.   :9.000                                           
##  PERCENTSALARYHIKE PERFORMANCERATING RELATIONSHIPSATISFACTION STANDARDHOURS
##  Min.   :11.0      Min.   :3.000     Min.   :1.000            Min.   :80   
##  1st Qu.:12.0      1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80   
##  Median :14.0      Median :3.000     Median :3.000            Median :80   
##  Mean   :15.2      Mean   :3.152     Mean   :2.707            Mean   :80   
##  3rd Qu.:18.0      3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80   
##  Max.   :25.0      Max.   :4.000     Max.   :4.000            Max.   :80   
##  STOCKOPTIONLEVEL TOTALWORKINGYEARS TRAININGTIMESLASTYEAR WORKLIFEBALANCE
##  Min.   :0.0000   Min.   : 0.00     Min.   :0.000         Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000  
##  Median :1.0000   Median :10.00     Median :3.000         Median :3.000  
##  Mean   :0.7839   Mean   :11.05     Mean   :2.832         Mean   :2.782  
##  3rd Qu.:1.0000   3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000  
##  Max.   :3.0000   Max.   :40.00     Max.   :6.000         Max.   :4.000  
##  YEARSATCOMPANY   YEARSINCURRENTROLE YEARSSINCELASTPROMOTION
##  Min.   : 0.000   Min.   : 0.000     Min.   : 0.000         
##  1st Qu.: 3.000   1st Qu.: 2.000     1st Qu.: 0.000         
##  Median : 5.000   Median : 3.000     Median : 1.000         
##  Mean   : 6.962   Mean   : 4.205     Mean   : 2.169         
##  3rd Qu.:10.000   3rd Qu.: 7.000     3rd Qu.: 3.000         
##  Max.   :40.000   Max.   :18.000     Max.   :15.000         
##  YEARSWITHCURRMANAGER ATTRITION_BINARY OVERALLSATISFACTION     EDU           
##  Min.   : 0.00        Min.   :0.0000   Min.   : 3.000      Length:870        
##  1st Qu.: 2.00        1st Qu.:0.0000   1st Qu.: 7.000      Class :character  
##  Median : 3.00        Median :0.0000   Median : 8.000      Mode  :character  
##  Mean   : 4.14        Mean   :0.1609   Mean   : 8.117                        
##  3rd Qu.: 7.00        3rd Qu.:0.0000   3rd Qu.: 9.000                        
##  Max.   :17.00        Max.   :1.0000   Max.   :12.000                        
##    ENV_SAT            JOB_INV            JOB_SAT           PERF_RATE        
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    REL_SAT              WLB              AGE_BINS         DAILYRATE_BINS    
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  COMMUTE_BINS       HOURLYRATE_BINS    MONTHLYINCOME_BINS MONTHLYRATE_BINS  
##  Length:870         Length:870         Length:870         Length:870        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  PERCENTSALARYHIKE_BINS TOTALWORKINGYEARS_BINS YEARSATCOMPANY_BINS
##  Length:870             Length:870             Length:870         
##  Class :character       Class :character       Class :character   
##  Mode  :character       Mode  :character       Mode  :character   
##                                                                   
##                                                                   
## 
# Preprocess the data: Define categorical columns and target
cat_cols <- c("DEPARTMENT", "GENDER", "JOBLEVEL", "JOBROLE", "MARITALSTATUS", 
              "OVERTIME", "ENV_SAT", "JOB_INV", "WLB", "AGE_BINS", 
              "COMMUTE_BINS", "PERCENTSALARYHIKE_BINS", "TOTALWORKINGYEARS_BINS")
target <- "ATTRITION_BINARY"

categorical_data <- data[, c(cat_cols, target)]

# Split data into training and testing sets
set.seed(44)  # chosen because it's the user's age
trainIndex <- createDataPartition(categorical_data$ATTRITION_BINARY, p = 0.85, list = FALSE)
train_cat <- categorical_data[trainIndex, ]
test_cat <- categorical_data[-trainIndex, ]

# Ensure both the predicted and actual columns are factors with the same levels
train_cat$ATTRITION_BINARY <- factor(train_cat$ATTRITION_BINARY, levels = c(0, 1))
test_cat$ATTRITION_BINARY <- factor(test_cat$ATTRITION_BINARY, levels = c(0, 1))

# Train the Naive Bayes model on categorical data
nb_model <- naiveBayes(ATTRITION_BINARY ~ ., data = train_cat)
nb_predictions <- predict(nb_model, test_cat)
nb_predictions <- factor(nb_predictions, levels = c(0, 1))  # Set levels for predictions

# Generate the confusion matrix and convert it to a data frame for plotting
conf_matrix <- confusionMatrix(nb_predictions, test_cat$ATTRITION_BINARY)
conf_df <- as.data.frame(conf_matrix$table)

# Rename columns for readability in the plot
conf_df <- conf_df %>%
  rename(Predicted = Prediction, Actual = Reference) %>%
  mutate(
    Prediction_Label = ifelse(Predicted == 1, "Predicted Attrition", "Predicted No Attrition"),
    Reference_Label = ifelse(Actual == 1, "Actual Attrition", "Actual No Attrition")
  )

# Plotting function for the confusion matrix
plot_confusion_matrix <- function(conf_df, title) {
  ggplot(conf_df, aes(x = Reference_Label, y = Prediction_Label, fill = Freq)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    geom_text(aes(label = Freq), color = "white", size = 5) +
    theme_minimal() +
    labs(title = title, x = "Actual", y = "Predicted") +
    theme(plot.title = element_text(hjust = 0.5))
}

# Display the confusion matrix plot
plot_confusion_matrix(conf_df, "Confusion Matrix - Full Dataset")

SQL Code for feature engineering

SELECT *, CASE WHEN ATTRITION = ‘Yes’ THEN 1 ELSE 0 END AS ATTRITION_BINARY, ENVIRONMENTSATISFACTION + JOBSATISFACTION + RELATIONSHIPSATISFACTION AS OVERALLSATISFACTION, CASE WHEN EDUCATION = 1 THEN ‘Below College’ WHEN EDUCATION = 2 THEN ‘Some College’ WHEN EDUCATION = 3 THEN ‘Bachelor’ WHEN EDUCATION = 4 THEN ‘Master’ WHEN EDUCATION = 5 THEN ‘Doctor’ ELSE ‘N/A’ END AS EDU, CASE WHEN ENVIRONMENTSATISFACTION = 1 THEN ‘Low’ WHEN ENVIRONMENTSATISFACTION = 2 THEN ‘Medium’ WHEN ENVIRONMENTSATISFACTION = 3 THEN ‘High’ WHEN ENVIRONMENTSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS ENV_SAT, CASE WHEN JOBINVOLVEMENT = 1 THEN ‘Low’ WHEN JOBINVOLVEMENT = 2 THEN ‘Medium’ WHEN JOBINVOLVEMENT = 3 THEN ‘High’ WHEN JOBINVOLVEMENT = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS JOB_INV, CASE WHEN JOBSATISFACTION = 1 THEN ‘Low’ WHEN JOBSATISFACTION = 2 THEN ‘Medium’ WHEN JOBSATISFACTION = 3 THEN ‘High’ WHEN JOBSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS JOB_SAT, CASE WHEN PERFORMANCERATING = 1 THEN ‘Low’ WHEN PERFORMANCERATING = 2 THEN ‘Good’ WHEN PERFORMANCERATING = 3 THEN ‘Excellent’ WHEN PERFORMANCERATING = 4 THEN ‘Outstanding’ ELSE ‘N/A’ END AS PERF_RATE, CASE WHEN RELATIONSHIPSATISFACTION = 1 THEN ‘Low’ WHEN RELATIONSHIPSATISFACTION = 2 THEN ‘Medium’ WHEN RELATIONSHIPSATISFACTION = 3 THEN ‘High’ WHEN RELATIONSHIPSATISFACTION = 4 THEN ‘Very High’ ELSE ‘N/A’ END AS REL_SAT, CASE WHEN WORKLIFEBALANCE = 1 THEN ‘Bad’ WHEN WORKLIFEBALANCE = 2 THEN ‘Good’ WHEN WORKLIFEBALANCE = 3 THEN ‘Better’ WHEN WORKLIFEBALANCE = 4 THEN ‘Best’ ELSE ‘N/A’ END AS WLB, CASE WHEN AGE < 20 THEN ‘Less Than 20’ WHEN AGE >= 20 AND AGE < 25 THEN ‘20-25’ WHEN AGE >= 25 AND AGE < 30 THEN ‘25-30’ WHEN AGE >= 30 AND AGE < 35 THEN ‘30-35’ WHEN AGE >= 35 AND AGE < 40 THEN ‘35-40’ WHEN AGE >= 40 AND AGE < 45 THEN ‘40-45’ WHEN AGE >= 45 AND AGE < 50 THEN ‘45-50’ WHEN AGE >= 50 AND AGE < 55 THEN ‘50-55’ WHEN AGE >= 55 AND AGE < 60 THEN ‘55-60’ WHEN AGE >= 60 THEN ‘60 And Up’ END AS AGE_BINS, CASE WHEN DAILYRATE <= 200 THEN ’ Less Than 200’ WHEN DAILYRATE >= 200 AND DAILYRATE <= 400 THEN ‘200-400’ WHEN DAILYRATE > 400 AND DAILYRATE <=600 THEN ‘400-600’ WHEN DAILYRATE > 600 AND DAILYRATE <=800 THEN ‘600-800’ WHEN DAILYRATE > 800 AND DAILYRATE <=1000 THEN ‘800-1000’ WHEN DAILYRATE > 1000 AND DAILYRATE <=1200 THEN ‘1000-1200’ WHEN DAILYRATE > 1200 AND DAILYRATE <=1400 THEN ‘1200-1400’ WHEN DAILYRATE > 1400 THEN ‘1400 and up’ END AS DAILYRATE_BINS, CASE WHEN DISTANCEFROMHOME <= 5 THEN ‘0-5’ WHEN DISTANCEFROMHOME > 5 AND DISTANCEFROMHOME <= 10 THEN ‘5-10’ WHEN DISTANCEFROMHOME > 10 AND DISTANCEFROMHOME <= 15 THEN ‘10-15’ WHEN DISTANCEFROMHOME > 15 AND DISTANCEFROMHOME <= 20 THEN ‘15-20’ WHEN DISTANCEFROMHOME > 20 AND DISTANCEFROMHOME <= 25 THEN ‘20-25’ WHEN DISTANCEFROMHOME > 25 AND DISTANCEFROMHOME <= 30 THEN ‘25-30’ WHEN DISTANCEFROMHOME > 30 THEN ‘30+’ END AS COMMUTE_BINS, CASE WHEN HOURLYRATE < 30 THEN ‘Less Than 30’ WHEN HOURLYRATE >= 30 AND HOURLYRATE < 40 THEN ‘30-40’ WHEN HOURLYRATE >= 40 AND HOURLYRATE < 50 THEN ‘40-50’ WHEN HOURLYRATE >= 50 AND HOURLYRATE < 60 THEN ‘50-60’ WHEN HOURLYRATE >= 60 AND HOURLYRATE < 70 THEN ‘60-70’ WHEN HOURLYRATE >= 70 AND HOURLYRATE < 80 THEN ‘70-80’ WHEN HOURLYRATE >= 80 AND HOURLYRATE < 90 THEN ‘80-90’ WHEN HOURLYRATE >= 90 THEN ‘90 and up’ END AS HOURLYRATE_BINS, CASE WHEN MONTHLYINCOME < 1000 THEN ‘Less Than 2500’ WHEN MONTHLYINCOME >= 1000 AND MONTHLYINCOME < 2000 THEN ‘1000-2000’ WHEN MONTHLYINCOME >= 2000 AND MONTHLYINCOME < 3000 THEN ‘2000-3000’ WHEN MONTHLYINCOME >= 3000 AND MONTHLYINCOME < 4000 THEN ‘3000-4000’ WHEN MONTHLYINCOME >= 4000 AND MONTHLYINCOME < 5000 THEN ‘4000-5000’ WHEN MONTHLYINCOME >= 5000 AND MONTHLYINCOME < 6000 THEN ‘5000-6000’ WHEN MONTHLYINCOME >= 6000 AND MONTHLYINCOME < 7000 THEN ‘6000-7000’ WHEN MONTHLYINCOME >= 7000 AND MONTHLYINCOME < 8000 THEN ‘7000-8000’ WHEN MONTHLYINCOME >= 8000 AND MONTHLYINCOME < 9000 THEN ‘8000-9000’ WHEN MONTHLYINCOME >= 9000 AND MONTHLYINCOME < 10000 THEN ‘9000-10000’ WHEN MONTHLYINCOME >= 10000 AND MONTHLYINCOME < 11000 THEN ‘10000-11000’ WHEN MONTHLYINCOME >= 11000 AND MONTHLYINCOME < 12000 THEN ‘11000-12000’ WHEN MONTHLYINCOME >= 12000 AND MONTHLYINCOME < 13000 THEN ‘12000-13000’ WHEN MONTHLYINCOME >= 13000 AND MONTHLYINCOME < 14000 THEN ‘13000-14000’ WHEN MONTHLYINCOME >= 14000 AND MONTHLYINCOME < 15000 THEN ‘14000-15000’ WHEN MONTHLYINCOME >= 15000 AND MONTHLYINCOME < 16000 THEN ‘15000-16000’ WHEN MONTHLYINCOME >= 16000 AND MONTHLYINCOME < 17000 THEN ‘16000-17000’ WHEN MONTHLYINCOME >= 17000 AND MONTHLYINCOME < 18000 THEN ‘17000-18000’ WHEN MONTHLYINCOME >= 18000 AND MONTHLYINCOME < 19000 THEN ‘18000-19000’ WHEN MONTHLYINCOME >= 19000 AND MONTHLYINCOME < 20000 THEN ‘19000-20000’ WHEN MONTHLYINCOME >= 20000 THEN ‘Greater Than 20000’ END AS MONTHLYINCOME_BINS, CASE WHEN MONTHLYRATE < 2500 THEN ’ Less Than 2500’ WHEN MONTHLYRATE >= 2500 AND MONTHLYRATE < 5000 THEN ‘2500-5000’ WHEN MONTHLYRATE >= 5000 AND MONTHLYRATE < 7500 THEN ‘5000-7500’ WHEN MONTHLYRATE >= 7500 AND MONTHLYRATE < 10000 THEN ‘7500-10000’ WHEN MONTHLYRATE >= 10000 AND MONTHLYRATE < 12500 THEN ‘10000-12500’ WHEN MONTHLYRATE >= 12500 AND MONTHLYRATE < 15000 THEN ‘12500-15000’ WHEN MONTHLYRATE >= 15000 AND MONTHLYRATE < 17500 THEN ‘15000-17500’ WHEN MONTHLYRATE >= 17500 AND MONTHLYRATE < 20000 THEN ‘17500-20000’ WHEN MONTHLYRATE >= 20000 AND MONTHLYRATE < 22500 THEN ‘20000-22500’ WHEN MONTHLYRATE >= 22500 AND MONTHLYRATE < 25000 THEN ‘22500-25000’ WHEN MONTHLYRATE >= 25000 AND MONTHLYRATE < 27500 THEN ‘25000-27500’ WHEN MONTHLYRATE >= 27500 THEN ’ Greater Than 27500’ END AS MONTHLYRATE_BINS, CASE WHEN PERCENTSALARYHIKE < 5 THEN ‘Less Than 5%’ WHEN PERCENTSALARYHIKE >= 5 AND PERCENTSALARYHIKE < 10 THEN ‘5-10%’ WHEN PERCENTSALARYHIKE >= 10 AND PERCENTSALARYHIKE < 15 THEN ‘10-15%’ WHEN PERCENTSALARYHIKE >= 15 AND PERCENTSALARYHIKE < 20 THEN ‘15-20%’ WHEN PERCENTSALARYHIKE >= 20 AND PERCENTSALARYHIKE < 25 THEN ‘20-25%’ WHEN PERCENTSALARYHIKE >= 25 THEN ‘25% and up’ END AS PERCENTSALARYHIKE_BINS, CASE WHEN TOTALWORKINGYEARS < 5 THEN ‘Less Than 5’ WHEN TOTALWORKINGYEARS >= 5 AND TOTALWORKINGYEARS < 10 THEN ‘5-10’ WHEN TOTALWORKINGYEARS >= 10 AND TOTALWORKINGYEARS < 15 THEN ‘10-15’ WHEN TOTALWORKINGYEARS >= 15 AND TOTALWORKINGYEARS < 20 THEN ‘15-20’ WHEN TOTALWORKINGYEARS >= 20 AND TOTALWORKINGYEARS < 25 THEN ‘20-25’ WHEN TOTALWORKINGYEARS >= 25 AND TOTALWORKINGYEARS < 30 THEN ‘25-30’ WHEN TOTALWORKINGYEARS >= 30 AND TOTALWORKINGYEARS < 35 THEN ‘30-35’ WHEN TOTALWORKINGYEARS >= 35 AND TOTALWORKINGYEARS < 40 THEN ‘35-40’ WHEN TOTALWORKINGYEARS >= 40 THEN ‘40 and up’ END AS TOTALWORKINGYEARS_BINS, CASE WHEN YEARSATCOMPANY < 5 THEN ‘Less Than 5’ WHEN YEARSATCOMPANY >= 5 AND YEARSATCOMPANY < 10 THEN ‘5-10’ WHEN YEARSATCOMPANY >= 10 AND YEARSATCOMPANY < 15 THEN ‘10-15’ WHEN YEARSATCOMPANY >= 15 AND YEARSATCOMPANY < 20 THEN ‘15-20’ WHEN YEARSATCOMPANY >= 20 AND YEARSATCOMPANY < 25 THEN ‘20-25’ WHEN YEARSATCOMPANY >= 25 AND YEARSATCOMPANY < 30 THEN ‘25-30’ WHEN YEARSATCOMPANY >= 30 AND YEARSATCOMPANY < 35 THEN ‘30-35’ WHEN YEARSATCOMPANY >= 35 AND YEARSATCOMPANY < 40 THEN ‘35-40’ WHEN YEARSATCOMPANY >= 40 THEN ‘40 and up’ END AS YEARSATCOMPANY_BINS FROM PROD_MKTBCI.MKTBUSDB.GC0965_MSDSCASESTUDY_TEST