library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(stringr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(gtools)
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
library(performance)
library(ggpubr)


library(logistf)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ forcats 0.5.2
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ purrr::discard()     masks scales::discard()
## ✖ plotly::filter()     masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ purrr::lift()        masks caret::lift()
## ✖ car::recode()        masks dplyr::recode()
## ✖ purrr::some()        masks car::some()
library(corrplot)
## corrplot 0.92 loaded
library(lattice)
library(caret)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:plotly':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(caTools)



library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loaded gam 1.20.2
library(tidyverse)
library(car)
library(broom)
library(DescTools)
## 
## Attaching package: 'DescTools'
## 
## The following object is masked from 'package:foreach':
## 
##     %:%
## 
## The following objects are masked from 'package:MLmetrics':
## 
##     AUC, Gini, MAE, MAPE, MSE, RMSE
## 
## The following object is masked from 'package:car':
## 
##     Recode
## 
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
library(ROCR)
## 
## Attaching package: 'ROCR'
## 
## The following object is masked from 'package:performance':
## 
##     performance
library(lmtest)
library(readr)
# Load the raw employee-attrition data set from the working directory.
healthcare <- readr::read_csv(file = "watson_healthcare_modified.csv")
## Rows: 1676 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): EmployeeID, Age, DailyRate, DistanceFromHome, Education, EmployeeC...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the type and first values of every column.
str(object = healthcare)
## spec_tbl_df [1,676 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ EmployeeID              : num [1:1676] 1313919 1200302 1060315 1272912 1414939 ...
##  $ Age                     : num [1:1676] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr [1:1676] "No" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr [1:1676] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ DailyRate               : num [1:1676] 1102 279 1373 1392 591 ...
##  $ Department              : chr [1:1676] "Cardiology" "Maternity" "Maternity" "Maternity" ...
##  $ DistanceFromHome        : num [1:1676] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1676] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr [1:1676] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : num [1:1676] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EnvironmentSatisfaction : num [1:1676] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr [1:1676] "Female" "Male" "Male" "Female" ...
##  $ HourlyRate              : num [1:1676] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1676] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1676] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr [1:1676] "Nurse" "Other" "Nurse" "Other" ...
##  $ JobSatisfaction         : num [1:1676] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr [1:1676] "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : num [1:1676] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1676] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1676] 8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr [1:1676] "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr [1:1676] "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : num [1:1676] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1676] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1676] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1676] 80 80 80 80 80 80 80 80 80 80 ...
##  $ Shift                   : num [1:1676] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1676] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1676] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1676] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1676] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1676] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1676] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1676] 5 7 0 0 2 6 0 0 8 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   EmployeeID = col_double(),
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   Department = col_character(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   EducationField = col_character(),
##   ..   EmployeeCount = col_double(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   JobLevel = col_double(),
##   ..   JobRole = col_character(),
##   ..   JobSatisfaction = col_double(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   Over18 = col_character(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   StandardHours = col_double(),
##   ..   Shift = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Check whether there are any missing values

# TRUE if at least one cell in the data set is NA.
any(is.na(healthcare))
## [1] FALSE

Check whether there are any duplicate rows

# List every row that is an exact repeat of an earlier row.
healthcare[which(duplicated(healthcare)), ]
## # A tibble: 0 × 35
## # … with 35 variables: EmployeeID <dbl>, Age <dbl>, Attrition <chr>,
## #   BusinessTravel <chr>, DailyRate <dbl>, Department <chr>,
## #   DistanceFromHome <dbl>, Education <dbl>, EducationField <chr>,
## #   EmployeeCount <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, …
# no duplicated data
# Compact column-wise preview of the data set.
healthcare %>% glimpse()
## Rows: 1,676
## Columns: 35
## $ EmployeeID               <dbl> 1313919, 1200302, 1060315, 1272912, 1414939, …
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition                <chr> "No", "No", "Yes", "No", "No", "No", "No", "N…
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department               <chr> "Cardiology", "Maternity", "Maternity", "Mate…
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole                  <chr> "Nurse", "Other", "Nurse", "Other", "Nurse", …
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ Shift                    <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# This dataset consists of 35 features (variables) and 1,676 observations (rows). There are 9 categorical columns and 26 numerical columns.
# Make age a categorical variable 

Here is the summary of the numeric columns of the raw dataset:

# Five-number summary (plus mean) of every numeric column.
# `select_if()` is superseded; `select(where(...))` is the current form. The
# call is namespaced because MASS::select() masks dplyr::select() at this point.
healthcare %>%
  dplyr::select(where(is.numeric)) %>%
  summary()
##    EmployeeID           Age          DailyRate      DistanceFromHome
##  Min.   :1025177   Min.   :18.00   Min.   : 102.0   Min.   : 1.000  
##  1st Qu.:1235832   1st Qu.:30.00   1st Qu.: 465.0   1st Qu.: 2.000  
##  Median :1464606   Median :36.00   Median : 796.5   Median : 7.000  
##  Mean   :1456796   Mean   :36.87   Mean   : 800.6   Mean   : 9.222  
##  3rd Qu.:1667992   3rd Qu.:43.00   3rd Qu.:1157.0   3rd Qu.:14.000  
##  Max.   :1886378   Max.   :60.00   Max.   :1499.0   Max.   :29.000  
##    Education     EmployeeCount EnvironmentSatisfaction   HourlyRate    
##  Min.   :1.000   Min.   :1     Min.   :1.000           Min.   : 30.00  
##  1st Qu.:2.000   1st Qu.:1     1st Qu.:2.000           1st Qu.: 48.00  
##  Median :3.000   Median :1     Median :3.000           Median : 65.50  
##  Mean   :2.908   Mean   :1     Mean   :2.715           Mean   : 65.47  
##  3rd Qu.:4.000   3rd Qu.:1     3rd Qu.:4.000           3rd Qu.: 83.00  
##  Max.   :5.000   Max.   :1     Max.   :4.000           Max.   :100.00  
##  JobInvolvement     JobLevel     JobSatisfaction MonthlyIncome  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 1009  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.: 2928  
##  Median :3.000   Median :2.000   Median :3.000   Median : 4899  
##  Mean   :2.725   Mean   :2.067   Mean   :2.739   Mean   : 6517  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.: 8380  
##  Max.   :4.000   Max.   :5.000   Max.   :4.000   Max.   :19999  
##   MonthlyRate    NumCompaniesWorked PercentSalaryHike PerformanceRating
##  Min.   : 2094   Min.   :0.000      Min.   :11.0      Min.   :3.00     
##  1st Qu.: 7993   1st Qu.:1.000      1st Qu.:12.0      1st Qu.:3.00     
##  Median :14270   Median :2.000      Median :14.0      Median :3.00     
##  Mean   :14287   Mean   :2.662      Mean   :15.2      Mean   :3.15     
##  3rd Qu.:20462   3rd Qu.:4.000      3rd Qu.:18.0      3rd Qu.:3.00     
##  Max.   :26999   Max.   :9.000      Max.   :25.0      Max.   :4.00     
##  RelationshipSatisfaction StandardHours     Shift        TotalWorkingYears
##  Min.   :1.000            Min.   :80    Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000            Median :80    Median :1.0000   Median :10.00    
##  Mean   :2.718            Mean   :80    Mean   :0.8061   Mean   :11.34    
##  3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:15.00    
##  Max.   :4.000            Max.   :80    Max.   :3.0000   Max.   :40.00    
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.805         Mean   :2.766   Mean   : 7.033   Mean   : 4.265    
##  3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000   3rd Qu.: 7.000    
##  Max.   :6.000         Max.   :4.000   Max.   :40.000   Max.   :18.000    
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.0            Min.   : 0.000      
##  1st Qu.: 0.0            1st Qu.: 2.000      
##  Median : 1.0            Median : 3.000      
##  Mean   : 2.2            Mean   : 4.135      
##  3rd Qu.: 3.0            3rd Qu.: 7.000      
##  Max.   :15.0            Max.   :17.000

Some variables can be removed because they are neither informative nor relevant to the dependent variable.

# Drop the columns the analysis does not use. They are removed by name rather
# than by position so the right columns are dropped even if the CSV column
# order changes; `all_of()` raises an error if any listed column is missing.
# `dplyr::select` is namespaced because MASS::select() masks it at this point.
cols_to_drop <- c("Over18", "EmployeeCount", "EmployeeID", "StandardHours",
                  "HourlyRate", "MonthlyRate", "DailyRate")
healthcare <- healthcare %>%
  dplyr::select(-dplyr::all_of(cols_to_drop))
str(healthcare)
## tibble [1,676 × 28] (S3: tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1676] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr [1:1676] "No" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr [1:1676] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ Department              : chr [1:1676] "Cardiology" "Maternity" "Maternity" "Maternity" ...
##  $ DistanceFromHome        : num [1:1676] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1676] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr [1:1676] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EnvironmentSatisfaction : num [1:1676] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr [1:1676] "Female" "Male" "Male" "Female" ...
##  $ JobInvolvement          : num [1:1676] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1676] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr [1:1676] "Nurse" "Other" "Nurse" "Other" ...
##  $ JobSatisfaction         : num [1:1676] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr [1:1676] "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : num [1:1676] 5993 5130 2090 2909 3468 ...
##  $ NumCompaniesWorked      : num [1:1676] 8 1 6 1 9 0 4 1 0 6 ...
##  $ OverTime                : chr [1:1676] "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : num [1:1676] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1676] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1676] 1 4 2 3 4 3 1 2 2 2 ...
##  $ Shift                   : num [1:1676] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1676] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1676] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1676] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1676] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1676] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1676] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1676] 5 7 0 0 2 6 0 0 8 7 ...
# The number of columns is now reduced from 35 to 28.

Here are the first 6 rows of the data.

# Preview the first six rows of the reduced data set.
head(healthcare, n = 6)
## # A tibble: 6 × 28
##     Age Attrition Busin…¹ Depar…² Dista…³ Educa…⁴ Educa…⁵ Envir…⁶ Gender JobIn…⁷
##   <dbl> <chr>     <chr>   <chr>     <dbl>   <dbl> <chr>     <dbl> <chr>    <dbl>
## 1    41 No        Travel… Cardio…       1       2 Life S…       2 Female       3
## 2    49 No        Travel… Matern…       8       1 Life S…       3 Male         2
## 3    37 Yes       Travel… Matern…       2       2 Other         4 Male         2
## 4    33 No        Travel… Matern…       3       4 Life S…       4 Female       3
## 5    27 No        Travel… Matern…       2       1 Medical       1 Male         3
## 6    32 No        Travel… Matern…       2       2 Life S…       4 Male         3
## # … with 18 more variables: JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   NumCompaniesWorked <dbl>, OverTime <chr>, PercentSalaryHike <dbl>,
## #   PerformanceRating <dbl>, RelationshipSatisfaction <dbl>, Shift <dbl>,
## #   TotalWorkingYears <dbl>, TrainingTimesLastYear <dbl>,
## #   WorkLifeBalance <dbl>, YearsAtCompany <dbl>, YearsInCurrentRole <dbl>,
## #   YearsSinceLastPromotion <dbl>, YearsWithCurrManager <dbl>, and …

JobRole contained both an "Admin" and an "Administrative" level, so I merged them into a single level.

#table(healthcare$JobRole)

# Fold the "Admin" spelling into "Administrative" so both are treated as one
# job role; `as.factor = TRUE` stores the result as a factor.
# `car::` is spelled out because DescTools also exports a `Recode()`.
healthcare$JobRole <- car::Recode(
  healthcare$JobRole,
  recodes = "'Admin'='Administrative'",
  as.factor = TRUE
)

Socio-demographic variables: gender, education background, department, job role, job level, and age

# MASS::select() masks dplyr::select(). Remove MASS from the search path so
# the unqualified select() calls resolve to the dplyr verb again.
# `unload = TRUE` is deliberately not requested: the MASS namespace is imported
# by other loaded packages and cannot be unloaded, so asking for it only
# produces a warning.
detach("package:MASS", character.only = TRUE)
#library(conflicted)
#conflict_prefer("select", "dplyr")

# Stacked bar chart of attrition counts by gender, with the count printed on
# each bar segment.
d1_plot <- healthcare %>%
  count(Gender, Attrition) %>%
  mutate(Attrition = factor(Attrition, levels = c("Yes", "No"))) %>%
  ggplot(aes(x = Gender, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # geom_text() has no `fill` aesthetic; `group` is what position_stack() needs
  # to place each label on its own segment (and it avoids the
  # "Ignoring unknown aesthetics: fill" warning).
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.2, reverse = FALSE),
            size = 4) +
  labs(fill = "Attrition", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank())
# Stacked bar chart of attrition counts by education field, with the count
# printed on each bar segment.
d2_plot <- healthcare %>%
  count(EducationField, Attrition) %>%
  mutate(Attrition = factor(Attrition, levels = c("Yes", "No"))) %>%
  ggplot(aes(x = EducationField, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.5, reverse = FALSE),
            size = 4) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(fill = "Attrition", y = "")
# Stacked bar chart of attrition counts by department, with the count printed
# on each bar segment.
d3_plot <- healthcare %>%
  count(Department, Attrition) %>%
  mutate(Attrition = factor(Attrition, levels = c("Yes", "No"))) %>%
  ggplot(aes(x = Department, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  # Axis labels come straight from the Department values. Hard-coded positional
  # labels would silently mislabel bars if the set of departments changed.
  labs(fill = "Attrition", y = "Count")
# Horizontal stacked bar chart of attrition counts by job role, with the count
# printed on each bar segment.
d4_plot <- healthcare %>%
  count(JobRole, Attrition) %>%
  mutate(Attrition = factor(Attrition, levels = c("Yes", "No"))) %>%
  ggplot(aes(x = JobRole, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.3, reverse = FALSE),
            size = 3.5) +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(fill = "Attrition",
       x = "Job Role", y = "Count")
# Stacked bar chart of attrition counts by job level, with the count printed
# on each bar segment.
d5_plot <- healthcare %>%
  count(JobLevel, Attrition) %>%
  mutate(Attrition = factor(Attrition, levels = c("Yes", "No"))) %>%
  ggplot(aes(x = JobLevel, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.8, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(fill = "Attrition",
       x = "Job Level", y = "")
# Stacked bar chart of attrition counts by age band, with the count printed on
# each bar segment.
d6_plot <- healthcare %>%
  mutate(
    # cut() replaces the nested ifelse() ladder. Each break is the inclusive
    # upper bound of its band, and the labels are already in display order, so
    # no re-levelling is needed when plotting.
    Age = cut(
      Age,
      breaks = c(-Inf, 19, 25, 30, 35, 40, 45, 50, 55, Inf),
      labels = c("18-19", "20-25", "26-30", "31-35", "36-40",
                 "41-45", "46-50", "51-55", ">55")
    ),
    Attrition = factor(Attrition, levels = c("Yes", "No"))
  ) %>%
  # count() groups by its arguments itself; a preceding group_by() is redundant.
  count(Age, Attrition) %>%
  ggplot(aes(x = Age, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.3, reverse = FALSE),
            size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(fill = "Attrition", x = "Age", y = "")
# 2 x 2 panel of the gender, education-field, department and job-level charts
# sharing one legend at the bottom.
demography_plot <- ggarrange(
  d1_plot, d2_plot, d3_plot, d5_plot,
  ncol = 2, nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)
demography_plot

# Side-by-side panel of the job-role and age charts sharing one legend.
demography_plot2 <- ggarrange(
  d4_plot, d6_plot,
  ncol = 2, nrow = 1,
  common.legend = TRUE,
  legend = "bottom"
)
demography_plot2

Worker experience variables

# 1. TotalWorkingYears
# Stacked bar chart of attrition counts by total working years (5-year bands),
# with the count printed on each bar segment.
plot_exp1 <- healthcare %>%
  mutate(
    # cut() replaces the nested ifelse() ladder. Each break is the inclusive
    # upper bound of its band. The former "26-29" band could never be reached
    # (it was shadowed by "25-29") and has been removed.
    binning_workingyears = cut(
      TotalWorkingYears,
      breaks = c(-Inf, 4, 9, 14, 19, 24, 29, 34, 39, Inf),
      labels = c("0-4", "5-9", "10-14", "15-19", "20-24",
                 "25-29", "30-34", "35-39", ">=40")
    ),
    Attrition = factor(Attrition, levels = c("Yes", "No"))
  ) %>%
  count(binning_workingyears, Attrition) %>%
  ggplot(aes(x = binning_workingyears, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.3, reverse = FALSE),
            size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Total Working Experience",
    fill = "Attrition", x = "Total Working (Years)", y = ""
  )
# 2. NumCompaniesWorked
# Stacked bar chart of attrition counts by number of companies worked at, with
# the count printed on each bar segment.
plot_exp2 <- healthcare %>%
  mutate(
    # cut() replaces the nested ifelse() ladder. Each break is the inclusive
    # upper bound of its band, and labels are listed in display order.
    binning = cut(
      NumCompaniesWorked,
      breaks = c(-Inf, 1, 3, 5, 7, Inf),
      labels = c("0-1", "2-3", "4-5", "6-7", ">7")
    ),
    Attrition = factor(Attrition, levels = c("Yes", "No"))
  ) %>%
  count(binning, Attrition) %>%
  ggplot(aes(x = binning, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Total Company Worked",
    fill = "Attrition", x = "Total Company Worked", y = ""
  )
# 3. YearsAtCompany
# Stacked bar chart of attrition counts by years at the company (5-year bands),
# with the count printed on each bar segment.
plot_exp3 <- healthcare %>%
  mutate(
    # cut() replaces the nested ifelse() ladder. Each break is the inclusive
    # upper bound of its band. The former "26-29" band could never be reached
    # (it was shadowed by "25-29") and has been removed.
    binning_years = cut(
      YearsAtCompany,
      breaks = c(-Inf, 4, 9, 14, 19, 24, 29, 34, 39, Inf),
      labels = c("0-4", "5-9", "10-14", "15-19", "20-24",
                 "25-29", "30-34", "35-39", ">=40")
    ),
    Attrition = factor(Attrition, levels = c("Yes", "No"))
  ) %>%
  count(binning_years, Attrition) %>%
  ggplot(aes(x = binning_years, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.2, reverse = FALSE),
            size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Years at Company",
    fill = "Attrition", x = "Years at Company", y = ""
  )
# 4. TrainingTimesLastYear
# Stacked bar chart of attrition counts by number of trainings completed last
# year, with the count printed on each bar segment.
plot_exp4 <- healthcare %>%
  mutate(
    # cut() replaces the nested ifelse() ladder. Each break is the inclusive
    # upper bound of its band, and labels are listed in display order.
    binning = cut(
      TrainingTimesLastYear,
      breaks = c(-Inf, 1, 3, 5, Inf),
      labels = c("0-1", "2-3", "4-5", ">5")
    ),
    Attrition = factor(Attrition, levels = c("Yes", "No"))
  ) %>%
  count(binning, Attrition) %>%
  ggplot(aes(x = binning, y = n)) +
  geom_col(aes(fill = Attrition)) +
  # `group` (not `fill`, which geom_text() does not understand) tells
  # position_stack() which segment each label belongs to.
  geom_text(aes(label = n, group = Attrition),
            position = position_stack(vjust = 0.8, reverse = FALSE),
            size = 4) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 40),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Total Completed Training",
    fill = "Attrition", x = "Total Training Completed", y = ""
  )
# 2 x 2 panel of the four work-experience charts sharing one legend at the
# bottom.
experience_plot <- ggarrange(
  plot_exp1, plot_exp2, plot_exp3, plot_exp4,
  ncol = 2, nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)
experience_plot

Company Survey

# 1. EnvironmentSatisfaction
# Stacked employee counts per environment-satisfaction score, split by
# attrition. `fill` is mapped in ggplot() so geom_text() inherits it for
# stacking instead of receiving it as an unknown layer aesthetic (which raised
# a warning). The title now names the plotted variable.
plot_sv1 <- healthcare %>%
  select(EnvironmentSatisfaction, Attrition) %>%
  count(EnvironmentSatisfaction, Attrition) %>%
  ggplot(aes(x = factor(EnvironmentSatisfaction), y = n,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = n),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Environment Satisfaction",
    fill = "Attrition", x = "", y = ""
  )
# 2. JobSatisfaction
# Stacked employee counts per job-satisfaction score, split by attrition.
# `fill` lives in ggplot() so the count labels stack with the bars without
# passing an unknown aesthetic to geom_text().
plot_sv2 <- healthcare %>%
  select(JobSatisfaction, Attrition) %>%
  count(JobSatisfaction, Attrition) %>%
  ggplot(aes(x = factor(JobSatisfaction), y = n,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = n),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Job Satisfaction",
    fill = "Attrition", x = "", y = ""
  )
# 3. JobInvolvement
# Stacked employee counts per job-involvement score, split by attrition.
# `fill` lives in ggplot() so the count labels stack with the bars without
# passing an unknown aesthetic to geom_text().
plot_sv3 <- healthcare %>%
  select(JobInvolvement, Attrition) %>%
  count(JobInvolvement, Attrition) %>%
  ggplot(aes(x = factor(JobInvolvement), y = n,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = n),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Job Involvement",
    fill = "Attrition", x = "", y = ""
  )
# 4. WorkLifeBalance
# Stacked employee counts per work-life-balance score, split by attrition.
# `fill` lives in ggplot() so the count labels stack with the bars without
# passing an unknown aesthetic to geom_text().
plot_sv4 <- healthcare %>%
  select(WorkLifeBalance, Attrition) %>%
  count(WorkLifeBalance, Attrition) %>%
  ggplot(aes(x = factor(WorkLifeBalance), y = n,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = n),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(
    title = "Work Life Balance",
    fill = "Attrition", x = "", y = ""
  )
# Combine the four survey plots into a 2 x 2 grid with one shared legend.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
csurvey_plot <- ggarrange(plot_sv1, plot_sv2, plot_sv3, plot_sv4,
                          ncol = 2, nrow = 2,
                          common.legend = TRUE,
                          legend = "bottom")
csurvey_plot

# Donut chart of the overall attrition rate.
# A single stacked bar at x = 2 is wrapped into polar coordinates; the x limits
# starting at 0.5 leave the hole in the middle. `start` is an argument of
# coord_polar() only, so it is no longer passed to the bar layer (where it was
# ignored with a warning).
healthcare %>%
  select(Attrition) %>%
  count(Attrition) %>%
  mutate(percent = round((n / sum(n)) * 100, 2),
         # Label position inside each slice, measured along the stacked axis.
         lab_ypos = cumsum(percent) - 0.7 * percent) %>%
  ggplot(aes(x = 2, y = percent,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  coord_polar(theta = "y", start = 0) +
  geom_text(aes(y = lab_ypos,
                label = paste0(percent,' ','%')), color = "white") +
  theme_void() +
  theme(legend.position = "bottom") +
  xlim(0.5, 2.5) +
  labs(title = "Attrition Rate in Health Care ", fill = "Attrition")

Department

# Attrition counts per department, labelled with the within-department
# percentage; `text` feeds the plotly tooltip.
# `fill` is mapped in ggplot() so both layers share the same grouping for
# stacking; mapping it inside geom_text() raised an unknown-aesthetic warning.
temp_plot1 <- healthcare %>%
  select(Department, Attrition) %>%
  count(Department, Attrition) %>%
  group_by(Department) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ungroup() %>%
  ggplot(aes(x = Department, y = n,
             text = paste0('</br>Department: ', Department,
                           '</br>Attrition Status: ', Attrition,
                           '</br>Count: ', n,
                           '</br>Percentage: ', percent, ' ', '%'),
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = paste0(percent, ' ', '%')),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "Attrition Rate per Department", fill = "Attrition", y = "")
ggplotly(temp_plot1, tooltip = "text")
# Attrition counts per gender, labelled with the within-gender percentage;
# `text` feeds the plotly tooltip.
# `fill` is mapped in ggplot() so both layers share the same grouping for
# stacking; mapping it inside geom_text() raised an unknown-aesthetic warning.
healthcare$Gender <- as.character(healthcare$Gender)
temp_plot22 <- healthcare %>%
  select(Gender, Attrition) %>%
  count(Gender, Attrition) %>%
  group_by(Gender) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ungroup() %>%
  ggplot(aes(x = factor(Gender), y = n,
             text = paste0('</br>Gender: ', Gender,
                           '</br>Attrition Status: ', Attrition,
                           '</br>Count: ', n,
                           '</br>Percentage: ', percent, ' ', '%'),
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = paste0(percent, ' ', '%')),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "Attrition Rate per Gender", fill = "Attrition",
       x = "Gender", y = "")
ggplotly(temp_plot22, tooltip = "text")

#Job Level

# Attrition counts per job level, labelled with the within-level percentage;
# `text` feeds the plotly tooltip.
# `fill` is mapped in ggplot() so both layers share the same grouping for
# stacking; mapping it inside geom_text() raised an unknown-aesthetic warning.
temp_plot2 <- healthcare %>%
  select(JobLevel, Attrition) %>%
  count(JobLevel, Attrition) %>%
  group_by(JobLevel) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ungroup() %>%
  ggplot(aes(x = factor(JobLevel), y = n,
             text = paste0('</br>Job Level: ', JobLevel,
                           '</br>Attrition Status: ', Attrition,
                           '</br>Count: ', n,
                           '</br>Percentage: ', percent, ' ', '%'),
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = paste0(percent, ' ', '%')),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "Attrition Rate per Job Level", fill = "Attrition",
       x = "Job Level", y = "")
ggplotly(temp_plot2, tooltip = "text")

Around 63% of employees are in Job Levels 1 and 2. The highest attrition rate is in Job Level 1 (26.34%), followed by Job Level 3 (14.68%), Job Level 2 (9.74%), Job Level 5 (7.25%), and Job Level 4 (4.72%).

#Salary & Satisfaction Score

# Mean monthly income per job role, one facet per attrition status.
# `.groups = "drop"` returns an ungrouped summary and silences the
# "summarise() has grouped output" message.
avgincome_plot1 <- healthcare %>%
  select(JobRole, MonthlyIncome, Attrition) %>%
  group_by(JobRole, Attrition) %>%
  summarise(avg_monthly_income = round(mean(MonthlyIncome), 2),
            .groups = "drop") %>%
  ggplot(aes(x = JobRole, y = avg_monthly_income,
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col(position = "dodge") +
  # Labels are anchored at a fixed height (y = 1000 plus the nudge) so they sit
  # inside the bars near the axis rather than at the bar ends.
  geom_text(aes(x = JobRole, y = 1000,
                label = paste0("$"," ", avg_monthly_income)),
            fontface = "bold", color = "white", size = 2.5, nudge_y = 1500) +
  facet_wrap(~Attrition) +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Job Role", y = "Average Income (dollar)",
    fill = "Attrition",
    title = "Average Income\nby Job Role and Attrition Status"
  ) +
  theme(axis.text.x = element_text(angle = 90),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank())
avgincome_plot1

# Lollipop chart of median monthly income per job-satisfaction score, one facet
# per attrition status.
# `.groups = "drop"` returns an ungrouped summary and silences the
# "summarise() has grouped output" message.
median_incomevsjobsscore <- healthcare %>%
  select(MonthlyIncome, JobSatisfaction, Attrition) %>%
  group_by(JobSatisfaction, Attrition) %>%
  summarise(median_income = median(MonthlyIncome), .groups = "drop") %>%
  ggplot(aes(x = median_income, y = JobSatisfaction,
             color = factor(Attrition, levels = c("Yes", "No")))) +
  geom_point(size = 4) +
  # Stem of the lollipop: from the axis to the median value.
  geom_segment(aes(x = 0, xend = median_income,
                   y = JobSatisfaction, yend = JobSatisfaction), size = 2) +
  # Value labels at a fixed x, nudged just above each stem.
  geom_text(aes(x = 1000, y = JobSatisfaction,
                label = paste0("$ ", " ", median_income)),
            nudge_y = 0.2) +
  facet_wrap(~Attrition) +
  theme_minimal() +
  theme(legend.position = "bottom",
        panel.grid.major.x = element_line(linetype = "dashed", colour = "grey"),
        panel.grid.minor.x = element_line(linetype = "dashed", colour = "grey"),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()) +
  labs(
    x = "Median Income (dollar)", y = "Job Satisfaction Score",
    color = "Attrition",
    title = "Does Income affect on Job Satisfaction?\nby Attrition Status"
  )
median_incomevsjobsscore

#summary(healthcare$PercentSalaryHike)

# Attrition counts per salary-hike percentage, labelled with the share within
# each hike value; `text` feeds the plotly tooltip.
# `fill` is mapped in ggplot() so both layers share the same grouping for
# stacking; mapping it inside geom_text() raised an unknown-aesthetic warning.
plot_subsalary1 <- healthcare %>%
  select(PercentSalaryHike, Attrition) %>%
  count(PercentSalaryHike, Attrition) %>%
  group_by(PercentSalaryHike) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ungroup() %>%
  ggplot(aes(x = factor(PercentSalaryHike), y = n,
             text = paste0('</br>Salary Hike Last Year (in percent): ', PercentSalaryHike,
                           '</br>Attrition Status: ', Attrition,
                           '</br>Count: ', n,
                           '</br>Percentage: ', percent, ' ', '%'),
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  geom_text(aes(label = paste0(percent, ' ', '%')),
            angle = 90, fontface = "bold", color = "white", size = 2,
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "Attrition Rate per Salary Hike Last Year\n in percent",
       fill = "Attrition", x = "Salary Hike (%)", y = "")
ggplotly(plot_subsalary1, tooltip = "text")
# Mean environment-satisfaction score per job role, one line per attrition
# status.
# Fixes: the legend label is set on `color` (the aesthetic actually mapped;
# `fill` is not used in this plot), the title typo is corrected, and
# `.groups = "drop"` silences the grouped-output message from summarise().
healthcare %>%
  select(EnvironmentSatisfaction, JobRole, Attrition) %>%
  group_by(JobRole, Attrition) %>%
  summarise(avg_env_score = round(mean(EnvironmentSatisfaction), 2),
            .groups = "drop") %>%
  ggplot(aes(x = JobRole, y = avg_env_score)) +
  geom_line(aes(group = Attrition), linetype = "twodash", size = 1) +
  geom_point(aes(color = Attrition), size = 3) +
  theme_minimal() +
  theme(legend.position = "top", axis.text.x = element_text(angle = 90),
        axis.line = element_line(colour = "grey",
                                 size = 0.7, linetype = "solid"),
        panel.grid.major.x = element_line(size = 0.5, linetype = "dashed", colour = "lightgray"),
        panel.grid.minor.x = element_line(size = 0.5, linetype = "dashed", colour = "lightgray"),
        panel.grid.major.y = element_line(size = 0.5, linetype = "dashed", colour = "lightgray")) +
  labs(
    x = "", y = "Average Score",
    color = "Attrition",
    title = "Average Environment Satisfaction Score\nper Job Role"
  )

# Attrition counts per marital status, faceted by gender and labelled with the
# percentage within each gender x marital-status cell; `text` feeds the plotly
# tooltip.
# Fixes: the title was copied from the job-level plot and did not describe this
# chart; `fill` is mapped in ggplot() instead of being passed to geom_text() as
# an unknown aesthetic.
temp_plot4 <- healthcare %>%
  select(Gender, MaritalStatus, Attrition) %>%
  count(Gender, MaritalStatus, Attrition) %>%
  group_by(Gender, MaritalStatus) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ungroup() %>%
  ggplot(aes(x = factor(MaritalStatus), y = n,
             text = paste0('</br>Gender: ', Gender,
                           '</br>Marital Status: ', MaritalStatus,
                           '</br>Attrition Status: ', Attrition,
                           '</br>Count: ', n,
                           '</br>Percentage: ', percent, ' ', '%'),
             fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_col() +
  facet_wrap(~Gender) +
  geom_text(aes(label = paste0(percent, ' ', '%')),
            position = position_stack(vjust = 0.5, reverse = FALSE)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(title = "Attrition Rate per Gender and Marital Status",
       fill = "Attrition", x = "Marital Status", y = "")
ggplotly(temp_plot4, tooltip = "text")

Chi-Square Test for Feature Selection

# Pearson chi-square tests of independence between each candidate predictor
# and Attrition, one call per predictor. The `##` lines are the recorded
# console output of each call. For the 2 x 2 tables (df = 1) the output shows
# that Yates' continuity correction was applied.
chisq.test(healthcare$BusinessTravel, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$BusinessTravel and healthcare$Attrition
## X-squared = 13.59, df = 2, p-value = 0.001119
chisq.test(healthcare$Department, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$Department and healthcare$Attrition
## X-squared = 8.0133, df = 2, p-value = 0.01819
chisq.test(healthcare$Education, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$Education and healthcare$Attrition
## X-squared = 9.0625, df = 4, p-value = 0.05956
# NOTE(review): this is the only test that warns about the chi-squared
# approximation, presumably because some EducationField x Attrition cells have
# small expected counts -- confirm, and treat this p-value with caution.
chisq.test(healthcare$EducationField, healthcare$Attrition)
## Warning in chisq.test(healthcare$EducationField, healthcare$Attrition): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$EducationField and healthcare$Attrition
## X-squared = 7.8745, df = 5, p-value = 0.1633
chisq.test(healthcare$EnvironmentSatisfaction, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$EnvironmentSatisfaction and healthcare$Attrition
## X-squared = 23.315, df = 3, p-value = 3.471e-05
chisq.test(healthcare$Gender, healthcare$Attrition)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  healthcare$Gender and healthcare$Attrition
## X-squared = 0.59123, df = 1, p-value = 0.4419
chisq.test(healthcare$JobInvolvement, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$JobInvolvement and healthcare$Attrition
## X-squared = 52.007, df = 3, p-value = 2.984e-11
chisq.test(healthcare$JobRole, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$JobRole and healthcare$Attrition
## X-squared = 43.64, df = 3, p-value = 1.799e-09
chisq.test(healthcare$JobSatisfaction, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$JobSatisfaction and healthcare$Attrition
## X-squared = 11.49, df = 3, p-value = 0.009353
chisq.test(healthcare$MaritalStatus, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$MaritalStatus and healthcare$Attrition
## X-squared = 72.489, df = 2, p-value < 2.2e-16
# Note the call order here: PerformanceRating is tested before OverTime.
chisq.test(healthcare$PerformanceRating, healthcare$Attrition)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  healthcare$PerformanceRating and healthcare$Attrition
## X-squared = 0.11125, df = 1, p-value = 0.7387
chisq.test(healthcare$OverTime, healthcare$Attrition)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  healthcare$OverTime and healthcare$Attrition
## X-squared = 188.47, df = 1, p-value < 2.2e-16
chisq.test(healthcare$RelationshipSatisfaction, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$RelationshipSatisfaction and healthcare$Attrition
## X-squared = 1.5787, df = 3, p-value = 0.6642
chisq.test(healthcare$WorkLifeBalance, healthcare$Attrition)
## 
##  Pearson's Chi-squared test
## 
## data:  healthcare$WorkLifeBalance and healthcare$Attrition
## X-squared = 25.063, df = 3, p-value = 1.498e-05
library(knitr)
# Summary table of the chi-square tests of each predictor against Attrition.
# The numbers are transcribed from the chisq.test() output and rounded to two
# decimals; each value carries a trailing comment naming its variable so the
# four columns stay aligned row by row.
# Fixes: the Over Time and Performance Rating rows previously carried each
# other's statistics (the tests were run in the opposite order to the names
# listed here), and several values were truncated instead of rounded.
# Environment Satisfaction stays at 23.31: the printed 23.315 does not
# determine the second decimal.
chisq_results <-
  data.frame(Variable = c("Business Travel",
                          "Department",
                          "Education",
                          "Education Field",
                          "Environment Satisfaction",
                          "Gender",
                          "Job Involvement",
                          "Job Role",
                          "Job Satisfaction",
                          "Marital Status",
                          "Over Time",
                          "Performance Rating",
                          "Relationship Satisfaction",
                          #"Stock Option Level",
                          "Work Life Balance"),
             Chi_Sq_Stat = c(13.59,   # Business Travel
                             8.01,    # Department
                             9.06,    # Education
                             7.87,    # Education Field
                             23.31,   # Environment Satisfaction
                             0.59,    # Gender
                             52.01,   # Job Involvement
                             43.64,   # Job Role
                             11.49,   # Job Satisfaction
                             72.49,   # Marital Status
                             188.47,  # Over Time
                             0.11,    # Performance Rating
                             1.58,    # Relationship Satisfaction
                             25.06),  # Work Life Balance
             P_value = c(0.00,   # Business Travel
                         0.02,   # Department
                         0.06,   # Education
                         0.16,   # Education Field
                         0.00,   # Environment Satisfaction
                         0.44,   # Gender
                         0.00,   # Job Involvement
                         0.00,   # Job Role
                         0.01,   # Job Satisfaction
                         0.00,   # Marital Status
                         0.00,   # Over Time
                         0.74,   # Performance Rating
                         0.66,   # Relationship Satisfaction
                         0.00),  # Work Life Balance
             # Significance is judged at the 0.05 level on the unrounded
             # p-values reported by the tests.
             Stat_Sig = c("Yes",   # Business Travel
                          "Yes",   # Department
                          "No",    # Education
                          "No",    # Education Field
                          "Yes",   # Environment Satisfaction
                          "No",    # Gender
                          "Yes",   # Job Involvement
                          "Yes",   # Job Role
                          "Yes",   # Job Satisfaction
                          "Yes",   # Marital Status
                          "Yes",   # Over Time
                          "No",    # Performance Rating
                          "No",    # Relationship Satisfaction
                          "Yes"))  # Work Life Balance

# Render the chi-square summary with reader-friendly column headers.
kable(
  chisq_results,
  col.names = c(
    "Variable",
    "Chi-Square Statistic",
    "p-value",
    "Statistically Significant"
  )
)
Variable Chi-Square Statistic p-value Statistically Significant
Business Travel 13.59 0.00 Yes
Department 8.01 0.02 Yes
Education 9.06 0.06 No
Education Field 7.87 0.16 No
Environment Satisfaction 23.31 0.00 Yes
Gender 0.59 0.44 No
Job Involvement 52.01 0.00 Yes
Job Role 43.64 0.00 Yes
Job Satisfaction 11.49 0.01 Yes
Marital Status 72.49 0.00 Yes
Over Time 188.47 0.00 Yes
Performance Rating 0.11 0.74 No
Relationship Satisfaction 1.58 0.66 No
Work Life Balance 25.06 0.00 Yes

ANOVA for Feature Selection

library(stats)

# One-way ANOVA of each numeric predictor against Attrition, followed by
# Levene's test for homogeneity of variance on the fitted model. The `##`
# lines are the recorded console output. `aov.res` is reused: the second fit
# overwrites the first.

# ANOVA: Distance From Home and Attrition 
aov.res = aov(DistanceFromHome~Attrition, data = healthcare)
summary(aov.res)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## Attrition      1   1243  1242.7   18.87 1.48e-05 ***
## Residuals   1674 110237    65.9                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# leveneTest() warns that the grouping variable was coerced to a factor.
leveneTest(aov.res)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value   Pr(>F)   
## group    1  10.204 0.001427 **
##       1674                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): Levene's test is significant (p = 0.0014), so the equal-variance
# assumption of the ANOVA above is doubtful; the Welch version below is
# commented out -- confirm whether it should be the reported test.
#oneway.test(DistanceFromHome~Attrition, data = healthcare, var.equal = FALSE)



# ANOVA: Monthly Income and Attrition 
aov.res = aov(MonthlyIncome~Attrition, data = healthcare)
summary(aov.res)
##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## Attrition      1 1.403e+09 1.403e+09   65.14 1.32e-15 ***
## Residuals   1674 3.605e+10 2.153e+07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(aov.res)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value   Pr(>F)    
## group    1  33.209 9.83e-09 ***
##       1674                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): Levene's test is significant here as well (p = 9.83e-09); the
# same caveat about unequal variances applies.
#oneway.test(MonthlyIncome~Attrition, data = healthcare, var.equal = FALSE)
# Summary of the two one-way ANOVAs: F statistic, p-value, Levene's test
# p-value and the significance verdict, one row per predictor.
anova_results <- data.frame(
  Variable = c("Distance From Home", "Monthly Income"),
  F_Stat = c(18.87, 65.14),
  P_value = c(0.00, 0.00),
  Levene_Test = c(0.001, 0.00),
  Stat_Sig = c("Yes", "Yes")
)

# Render the ANOVA summary with reader-friendly column headers.
kable(
  anova_results,
  col.names = c(
    "Variable",
    "F Statistic",
    "p-value",
    "Levene's Test p-value",
    "Statistically Significant"
  )
)
Variable F Statistic p-value Levene’s Test p-value Statistically Significant
Distance From Home 18.87 0 0.001 Yes
Monthly Income 65.14 0 0.000 Yes
# Keep a copy of the full data set; `healthcare` itself is replaced by the
# balanced subsample in the GLM section below.
healthcare2 <- healthcare

REGRESSION

GLM

# Build a class-balanced data set by undersampling the "No" group, recode the
# response to 0/1, and split 80/20 into train and test sets.

# Separate the two outcome groups.
healthcare_yes <- healthcare %>% filter(Attrition == "Yes") # 199 obs in this data
healthcare_no <- healthcare %>% filter(Attrition == "No")

# Draw as many "No" rows as there are "Yes" rows. The size is taken from the
# data instead of being hard-coded, so it stays correct if the data change.
set.seed(200)
healthcare_no_sample <- sample_n(healthcare_no, nrow(healthcare_yes))

# Combine the sampled "No" rows with all "Yes" rows for a balanced data set.
healthcare <- rbind(healthcare_no_sample, healthcare_yes)
dim(healthcare)
## [1] 398  28
# Recode the response: 1 = attrition, 0 = no attrition.
healthcare$Attrition <- ifelse(healthcare$Attrition == "Yes", 1, 0)
# 80/20 train/test split. floor() makes the whole-number training size explicit
# (0.8 * 398 is not an integer). The index vector keeps its original name
# `sample`; calls to sample() still resolve to the function.
set.seed(200)
sample <- sample(nrow(healthcare), floor(0.8 * nrow(healthcare)),
                 replace = FALSE)
train <- healthcare[sample, ]
test <- healthcare[-sample, ]

ATTEMPT #1 - Fit entire model - Complex

# Attempt 1: logistic regression of Attrition on every other column of `train`.
glm.fit1 <- glm(
  formula = Attrition ~ .,
  family = binomial(),
  data = train
)

summary(glm.fit1)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial(), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4603  -0.2201   0.0170   0.2755   3.2570  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      8.1065718  4.3156112   1.878 0.060322 .  
## Age                             -0.0719136  0.0345662  -2.080 0.037483 *  
## BusinessTravelTravel_Frequently  2.6912722  1.0182904   2.643 0.008219 ** 
## BusinessTravelTravel_Rarely      0.6745823  0.8636300   0.781 0.434743    
## DepartmentMaternity             -1.4461135  0.7109350  -2.034 0.041941 *  
## DepartmentNeurology             -1.2343627  0.8426828  -1.465 0.142975    
## DistanceFromHome                 0.0840412  0.0298161   2.819 0.004823 ** 
## Education                       -0.1455381  0.2349748  -0.619 0.535668    
## EducationFieldLife Sciences     -1.6967636  1.4747900  -1.151 0.249933    
## EducationFieldMarketing         -1.4798113  1.5956419  -0.927 0.353715    
## EducationFieldMedical           -1.4480374  1.4639739  -0.989 0.322607    
## EducationFieldOther              0.4777449  2.0867393   0.229 0.818913    
## EducationFieldTechnical Degree  -1.2067540  1.5891890  -0.759 0.447642    
## EnvironmentSatisfaction         -0.5065119  0.2112972  -2.397 0.016523 *  
## GenderMale                       0.3494501  0.4663860   0.749 0.453693    
## JobInvolvement                  -1.6612075  0.3934164  -4.223 2.42e-05 ***
## JobLevel                        -1.1624995  0.9293545  -1.251 0.210983    
## JobRoleNurse                     3.7228240  1.9508992   1.908 0.056358 .  
## JobRoleOther                     3.2870288  1.9976678   1.645 0.099881 .  
## JobRoleTherapist                 0.3164105  2.0271682   0.156 0.875966    
## JobSatisfaction                 -0.7404001  0.2195938  -3.372 0.000747 ***
## MaritalStatusMarried             1.2649033  0.6843319   1.848 0.064548 .  
## MaritalStatusSingle              3.9119036  0.8971047   4.361 1.30e-05 ***
## MonthlyIncome                    0.0002637  0.0002237   1.179 0.238544    
## NumCompaniesWorked               0.3155358  0.1063027   2.968 0.002995 ** 
## OverTimeYes                      4.4069806  0.6299006   6.996 2.63e-12 ***
## PercentSalaryHike               -0.1935204  0.1089278  -1.777 0.075635 .  
## PerformanceRating                0.9724953  1.1572666   0.840 0.400719    
## RelationshipSatisfaction        -0.1748412  0.2010555  -0.870 0.384510    
## Shift                            0.5316162  0.4065238   1.308 0.190971    
## TotalWorkingYears               -0.0058947  0.0856518  -0.069 0.945132    
## TrainingTimesLastYear           -0.3694837  0.1957755  -1.887 0.059122 .  
## WorkLifeBalance                 -0.9518480  0.3106259  -3.064 0.002182 ** 
## YearsAtCompany                   0.1032067  0.1292205   0.799 0.424472    
## YearsInCurrentRole              -0.5514176  0.1689982  -3.263 0.001103 ** 
## YearsSinceLastPromotion          0.2539608  0.1147162   2.214 0.026841 *  
## YearsWithCurrManager            -0.2870282  0.1436914  -1.998 0.045767 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 440.23  on 317  degrees of freedom
## Residual deviance: 158.51  on 281  degrees of freedom
## AIC: 232.51
## 
## Number of Fisher Scoring iterations: 7
# Stepwise model selection by AIC, adding and dropping terms ("both"), starting
# from the full model. step() is provided by the stats package (MASS offers the
# related stepAIC()). The result is not assigned here, so the selected model is
# only printed in the trace below.
step(glm.fit1, direction = "both")
## Start:  AIC=232.51
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     Education + EducationField + EnvironmentSatisfaction + Gender + 
##     JobInvolvement + JobLevel + JobRole + JobSatisfaction + MaritalStatus + 
##     MonthlyIncome + NumCompaniesWorked + OverTime + PercentSalaryHike + 
##     PerformanceRating + RelationshipSatisfaction + Shift + TotalWorkingYears + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - EducationField            5   161.93 225.93
## - TotalWorkingYears         1   158.51 230.51
## - Education                 1   158.90 230.90
## - Gender                    1   159.07 231.07
## - YearsAtCompany            1   159.11 231.11
## - PerformanceRating         1   159.23 231.23
## - RelationshipSatisfaction  1   159.27 231.27
## - MonthlyIncome             1   159.92 231.92
## - JobLevel                  1   160.10 232.10
## - Shift                     1   160.34 232.34
## <none>                          158.51 232.51
## - Department                2   163.04 233.04
## - PercentSalaryHike         1   161.84 233.84
## - TrainingTimesLastYear     1   162.24 234.24
## - YearsWithCurrManager      1   162.66 234.66
## - Age                       1   163.16 235.16
## - YearsSinceLastPromotion   1   163.69 235.69
## - EnvironmentSatisfaction   1   164.61 236.61
## - JobRole                   3   169.05 237.05
## - DistanceFromHome          1   167.47 239.47
## - NumCompaniesWorked        1   168.60 240.60
## - WorkLifeBalance           1   169.20 241.20
## - BusinessTravel            2   173.11 243.11
## - JobSatisfaction           1   171.65 243.65
## - YearsInCurrentRole        1   172.84 244.83
## - MaritalStatus             2   186.48 256.48
## - JobInvolvement            1   185.37 257.37
## - OverTime                  1   257.40 329.40
## 
## Step:  AIC=225.93
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     Education + EnvironmentSatisfaction + Gender + JobInvolvement + 
##     JobLevel + JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome + 
##     NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + Shift + TotalWorkingYears + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - TotalWorkingYears         1   161.95 223.95
## - Education                 1   162.22 224.22
## - YearsAtCompany            1   162.41 224.41
## - Gender                    1   162.64 224.64
## - PerformanceRating         1   162.77 224.77
## - MonthlyIncome             1   162.80 224.80
## - JobLevel                  1   162.93 224.93
## - Shift                     1   163.08 225.08
## - RelationshipSatisfaction  1   163.22 225.22
## <none>                          161.93 225.93
## - YearsWithCurrManager      1   165.32 227.32
## - Department                2   167.38 227.38
## - PercentSalaryHike         1   165.41 227.41
## - TrainingTimesLastYear     1   165.59 227.59
## - YearsSinceLastPromotion   1   166.56 228.56
## - Age                       1   166.64 228.64
## - EnvironmentSatisfaction   1   167.18 229.18
## - JobRole                   3   172.42 230.42
## - DistanceFromHome          1   170.23 232.23
## + EducationField            5   158.51 232.51
## - WorkLifeBalance           1   171.28 233.28
## - NumCompaniesWorked        1   173.07 235.07
## - JobSatisfaction           1   173.57 235.57
## - YearsInCurrentRole        1   175.30 237.30
## - BusinessTravel            2   177.50 237.50
## - MaritalStatus             2   187.97 247.97
## - JobInvolvement            1   192.13 254.13
## - OverTime                  1   259.81 321.81
## 
## Step:  AIC=223.95
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     Education + EnvironmentSatisfaction + Gender + JobInvolvement + 
##     JobLevel + JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome + 
##     NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + Shift + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - Education                 1   162.28 222.28
## - YearsAtCompany            1   162.41 222.41
## - Gender                    1   162.66 222.66
## - PerformanceRating         1   162.81 222.81
## - MonthlyIncome             1   162.81 222.81
## - Shift                     1   163.12 223.12
## - JobLevel                  1   163.13 223.13
## - RelationshipSatisfaction  1   163.23 223.23
## <none>                          161.95 223.95
## - YearsWithCurrManager      1   165.37 225.37
## - PercentSalaryHike         1   165.46 225.46
## - TrainingTimesLastYear     1   165.63 225.63
## + TotalWorkingYears         1   161.93 225.93
## - Department                2   168.09 226.09
## - YearsSinceLastPromotion   1   166.61 226.61
## - EnvironmentSatisfaction   1   167.18 227.18
## - Age                       1   167.69 227.69
## - JobRole                   3   173.29 229.29
## - DistanceFromHome          1   170.31 230.31
## + EducationField            5   158.51 230.51
## - WorkLifeBalance           1   171.35 231.35
## - NumCompaniesWorked        1   173.33 233.33
## - JobSatisfaction           1   173.63 233.63
## - BusinessTravel            2   177.50 235.50
## - YearsInCurrentRole        1   175.88 235.88
## - MaritalStatus             2   188.10 246.10
## - JobInvolvement            1   192.19 252.19
## - OverTime                  1   260.10 320.10
## 
## Step:  AIC=222.28
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + Gender + JobInvolvement + JobLevel + 
##     JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome + 
##     NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + Shift + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - YearsAtCompany            1   162.75 220.75
## - Gender                    1   162.92 220.92
## - PerformanceRating         1   163.09 221.09
## - MonthlyIncome             1   163.20 221.20
## - Shift                     1   163.34 221.35
## - JobLevel                  1   163.46 221.46
## - RelationshipSatisfaction  1   163.55 221.55
## <none>                          162.28 222.28
## - YearsWithCurrManager      1   165.69 223.69
## - PercentSalaryHike         1   165.76 223.76
## + Education                 1   161.95 223.95
## - TrainingTimesLastYear     1   166.08 224.08
## - Department                2   168.19 224.19
## + TotalWorkingYears         1   162.22 224.22
## - YearsSinceLastPromotion   1   167.03 225.02
## - EnvironmentSatisfaction   1   167.35 225.35
## - Age                       1   169.09 227.09
## - JobRole                   3   173.98 227.98
## - DistanceFromHome          1   170.48 228.48
## + EducationField            5   158.93 228.93
## - WorkLifeBalance           1   171.63 229.63
## - NumCompaniesWorked        1   173.40 231.40
## - JobSatisfaction           1   173.68 231.68
## - BusinessTravel            2   177.51 233.51
## - YearsInCurrentRole        1   176.21 234.21
## - MaritalStatus             2   188.33 244.33
## - JobInvolvement            1   192.91 250.91
## - OverTime                  1   260.29 318.29
## 
## Step:  AIC=220.75
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + Gender + JobInvolvement + JobLevel + 
##     JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome + 
##     NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating + 
##     RelationshipSatisfaction + Shift + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - Gender                    1   163.29 219.29
## - PerformanceRating         1   163.46 219.46
## - MonthlyIncome             1   163.55 219.55
## - JobLevel                  1   163.72 219.72
## - Shift                     1   163.75 219.75
## - RelationshipSatisfaction  1   164.29 220.29
## <none>                          162.75 220.75
## - YearsWithCurrManager      1   165.76 221.76
## - PercentSalaryHike         1   166.21 222.21
## + YearsAtCompany            1   162.28 222.28
## - TrainingTimesLastYear     1   166.35 222.35
## + Education                 1   162.41 222.41
## - Department                2   168.42 222.42
## + TotalWorkingYears         1   162.75 222.74
## - YearsSinceLastPromotion   1   167.80 223.80
## - EnvironmentSatisfaction   1   168.60 224.60
## - Age                       1   169.10 225.10
## - DistanceFromHome          1   170.83 226.83
## - JobRole                   3   175.19 227.19
## + EducationField            5   159.58 227.58
## - WorkLifeBalance           1   172.72 228.72
## - NumCompaniesWorked        1   173.40 229.40
## - JobSatisfaction           1   173.68 229.68
## - BusinessTravel            2   177.69 231.69
## - YearsInCurrentRole        1   179.35 235.35
## - MaritalStatus             2   188.62 242.62
## - JobInvolvement            1   193.09 249.09
## - OverTime                  1   260.42 316.42
## 
## Step:  AIC=219.29
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole + 
##     JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction + 
##     Shift + TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole + 
##     YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - PerformanceRating         1   164.10 218.10
## - Shift                     1   164.20 218.20
## - MonthlyIncome             1   164.32 218.32
## - JobLevel                  1   164.45 218.45
## - RelationshipSatisfaction  1   164.70 218.70
## <none>                          163.29 219.29
## - YearsWithCurrManager      1   166.20 220.20
## + Gender                    1   162.75 220.75
## - Department                2   168.78 220.78
## + YearsAtCompany            1   162.92 220.92
## - PercentSalaryHike         1   166.96 220.96
## + Education                 1   163.02 221.02
## - TrainingTimesLastYear     1   167.12 221.12
## + TotalWorkingYears         1   163.28 221.28
## - YearsSinceLastPromotion   1   168.00 222.00
## - Age                       1   169.29 223.29
## - EnvironmentSatisfaction   1   169.86 223.86
## - DistanceFromHome          1   171.35 225.35
## + EducationField            5   159.97 225.97
## - JobRole                   3   176.63 226.63
## - WorkLifeBalance           1   173.52 227.52
## - NumCompaniesWorked        1   173.57 227.57
## - JobSatisfaction           1   174.18 228.18
## - BusinessTravel            2   177.79 229.79
## - YearsInCurrentRole        1   180.53 234.53
## - MaritalStatus             2   188.95 240.95
## - JobInvolvement            1   193.10 247.10
## - OverTime                  1   260.86 314.86
## 
## Step:  AIC=218.1
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole + 
##     JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + RelationshipSatisfaction + 
##     Shift + TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole + 
##     YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - Shift                     1   164.71 216.71
## - MonthlyIncome             1   164.82 216.82
## - JobLevel                  1   165.02 217.02
## - RelationshipSatisfaction  1   165.55 217.55
## <none>                          164.10 218.10
## - YearsWithCurrManager      1   166.83 218.83
## - Department                2   168.92 218.92
## + PerformanceRating         1   163.29 219.29
## + Gender                    1   163.46 219.46
## + YearsAtCompany            1   163.79 219.79
## - PercentSalaryHike         1   167.84 219.84
## + Education                 1   163.86 219.86
## + TotalWorkingYears         1   164.08 220.08
## - YearsSinceLastPromotion   1   168.64 220.64
## - TrainingTimesLastYear     1   168.79 220.79
## - Age                       1   170.22 222.22
## - DistanceFromHome          1   171.39 223.39
## - EnvironmentSatisfaction   1   172.07 224.07
## + EducationField            5   160.62 224.62
## - JobRole                   3   177.40 225.40
## - WorkLifeBalance           1   173.63 225.63
## - NumCompaniesWorked        1   173.95 225.95
## - JobSatisfaction           1   175.43 227.43
## - BusinessTravel            2   179.24 229.24
## - YearsInCurrentRole        1   180.92 232.92
## - MaritalStatus             2   189.30 239.30
## - JobInvolvement            1   193.28 245.28
## - OverTime                  1   261.34 313.34
## 
## Step:  AIC=216.71
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole + 
##     JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + RelationshipSatisfaction + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole + 
##     YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - MonthlyIncome             1   165.31 215.31
## - JobLevel                  1   165.50 215.50
## - RelationshipSatisfaction  1   166.52 216.52
## <none>                          164.71 216.71
## - YearsWithCurrManager      1   167.51 217.51
## - Department                2   169.78 217.78
## + Shift                     1   164.10 218.10
## + Gender                    1   164.19 218.19
## - PercentSalaryHike         1   168.19 218.19
## + PerformanceRating         1   164.20 218.20
## + YearsAtCompany            1   164.44 218.44
## + Education                 1   164.53 218.53
## + TotalWorkingYears         1   164.70 218.70
## - YearsSinceLastPromotion   1   169.18 219.18
## - TrainingTimesLastYear     1   169.30 219.30
## - Age                       1   170.81 220.81
## - DistanceFromHome          1   171.72 221.72
## - EnvironmentSatisfaction   1   172.64 222.64
## + EducationField            5   161.83 223.83
## - JobRole                   3   177.99 223.99
## - WorkLifeBalance           1   174.06 224.06
## - NumCompaniesWorked        1   174.75 224.75
## - JobSatisfaction           1   176.01 226.01
## - BusinessTravel            2   179.54 227.54
## - YearsInCurrentRole        1   182.06 232.06
## - JobInvolvement            1   193.46 243.46
## - MaritalStatus             2   196.51 244.51
## - OverTime                  1   261.59 311.59
## 
## Step:  AIC=215.31
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole + 
##     JobSatisfaction + MaritalStatus + NumCompaniesWorked + OverTime + 
##     PercentSalaryHike + RelationshipSatisfaction + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - JobLevel                  1   165.50 213.50
## - RelationshipSatisfaction  1   167.06 215.06
## <none>                          165.31 215.31
## - YearsWithCurrManager      1   167.93 215.93
## - Department                2   170.03 216.03
## + Gender                    1   164.62 216.62
## + MonthlyIncome             1   164.71 216.71
## + Shift                     1   164.82 216.82
## - PercentSalaryHike         1   168.84 216.84
## + PerformanceRating         1   165.00 217.00
## + Education                 1   165.08 217.08
## + YearsAtCompany            1   165.12 217.12
## + TotalWorkingYears         1   165.30 217.30
## - YearsSinceLastPromotion   1   169.84 217.84
## - TrainingTimesLastYear     1   170.11 218.11
## - Age                       1   171.34 219.34
## - DistanceFromHome          1   172.63 220.63
## - EnvironmentSatisfaction   1   173.67 221.67
## - JobRole                   3   178.08 222.08
## - WorkLifeBalance           1   174.13 222.13
## + EducationField            5   162.81 222.81
## - NumCompaniesWorked        1   176.71 224.71
## - JobSatisfaction           1   176.84 224.84
## - BusinessTravel            2   179.93 225.93
## - YearsInCurrentRole        1   183.41 231.41
## - JobInvolvement            1   193.67 241.67
## - MaritalStatus             2   197.38 243.38
## - OverTime                  1   261.71 309.71
## 
## Step:  AIC=213.5
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobRole + JobSatisfaction + 
##     MaritalStatus + NumCompaniesWorked + OverTime + PercentSalaryHike + 
##     RelationshipSatisfaction + TrainingTimesLastYear + WorkLifeBalance + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## - RelationshipSatisfaction  1   167.18 213.18
## <none>                          165.50 213.50
## - Department                2   170.15 214.15
## - YearsWithCurrManager      1   168.74 214.74
## + Gender                    1   164.86 214.86
## - PercentSalaryHike         1   168.94 214.94
## + Shift                     1   165.02 215.02
## + PerformanceRating         1   165.15 215.15
## + Education                 1   165.31 215.31
## + JobLevel                  1   165.31 215.31
## + TotalWorkingYears         1   165.38 215.38
## + YearsAtCompany            1   165.38 215.38
## + MonthlyIncome             1   165.50 215.50
## - YearsSinceLastPromotion   1   170.06 216.06
## - TrainingTimesLastYear     1   170.24 216.24
## - DistanceFromHome          1   173.04 219.04
## - Age                       1   173.47 219.47
## - EnvironmentSatisfaction   1   173.81 219.81
## - WorkLifeBalance           1   174.42 220.42
## + EducationField            5   163.04 221.04
## - NumCompaniesWorked        1   176.77 222.77
## - JobSatisfaction           1   176.90 222.90
## - BusinessTravel            2   180.25 224.25
## - JobRole                   3   183.14 225.14
## - YearsInCurrentRole        1   183.58 229.58
## - JobInvolvement            1   194.87 240.87
## - MaritalStatus             2   197.49 241.49
## - OverTime                  1   261.80 307.80
## 
## Step:  AIC=213.18
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome + 
##     EnvironmentSatisfaction + JobInvolvement + JobRole + JobSatisfaction + 
##     MaritalStatus + NumCompaniesWorked + OverTime + PercentSalaryHike + 
##     TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole + 
##     YearsSinceLastPromotion + YearsWithCurrManager
## 
##                            Df Deviance    AIC
## <none>                          167.18 213.18
## - Department                2   171.47 213.47
## + RelationshipSatisfaction  1   165.50 213.50
## - YearsWithCurrManager      1   169.88 213.88
## - PercentSalaryHike         1   170.29 214.29
## + Shift                     1   166.36 214.36
## + Gender                    1   166.69 214.69
## + PerformanceRating         1   166.84 214.84
## + YearsAtCompany            1   166.88 214.88
## + Education                 1   167.01 215.01
## + JobLevel                  1   167.06 215.06
## + TotalWorkingYears         1   167.12 215.12
## + MonthlyIncome             1   167.18 215.18
## - YearsSinceLastPromotion   1   171.30 215.30
## - TrainingTimesLastYear     1   171.92 215.92
## - DistanceFromHome          1   174.96 218.96
## - EnvironmentSatisfaction   1   175.19 219.19
## - Age                       1   175.39 219.39
## + EducationField            5   164.42 220.42
## - WorkLifeBalance           1   176.70 220.70
## - NumCompaniesWorked        1   178.18 222.18
## - JobSatisfaction           1   178.54 222.54
## - BusinessTravel            2   181.98 223.98
## - JobRole                   3   185.86 225.86
## - YearsInCurrentRole        1   185.24 229.24
## - MaritalStatus             2   198.21 240.21
## - JobInvolvement            1   197.74 241.74
## - OverTime                  1   262.27 306.27
## 
## Call:  glm(formula = Attrition ~ Age + BusinessTravel + Department + 
##     DistanceFromHome + EnvironmentSatisfaction + JobInvolvement + 
##     JobRole + JobSatisfaction + MaritalStatus + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + TrainingTimesLastYear + WorkLifeBalance + 
##     YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, 
##     family = binomial(), data = train)
## 
## Coefficients:
##                     (Intercept)                              Age  
##                         7.42311                         -0.07024  
## BusinessTravelTravel_Frequently      BusinessTravelTravel_Rarely  
##                         2.68001                          0.87744  
##             DepartmentMaternity              DepartmentNeurology  
##                        -0.99321                         -0.50434  
##                DistanceFromHome          EnvironmentSatisfaction  
##                         0.07082                         -0.52866  
##                  JobInvolvement                     JobRoleNurse  
##                        -1.53591                          3.01169  
##                    JobRoleOther                 JobRoleTherapist  
##                         2.74327                         -1.01539  
##                 JobSatisfaction             MaritalStatusMarried  
##                        -0.62789                          1.11123  
##             MaritalStatusSingle               NumCompaniesWorked  
##                         3.18857                          0.28388  
##                     OverTimeYes                PercentSalaryHike  
##                         3.95524                         -0.10693  
##           TrainingTimesLastYear                  WorkLifeBalance  
##                        -0.39351                         -0.81258  
##              YearsInCurrentRole          YearsSinceLastPromotion  
##                        -0.47481                          0.20272  
##            YearsWithCurrManager  
##                        -0.18419  
## 
## Degrees of Freedom: 317 Total (i.e. Null);  295 Residual
## Null Deviance:       440.2 
## Residual Deviance: 167.2     AIC: 213.2
# Refit the logistic regression on the training data using the predictor set
# kept at the final step of the AIC-based selection printed above
# (AIC = 213.18).
glm.fit2 <- glm(
  formula = Attrition ~
    Age + BusinessTravel + Department + DistanceFromHome +
    EnvironmentSatisfaction + JobInvolvement + JobRole +
    JobSatisfaction + MaritalStatus + NumCompaniesWorked +
    OverTime + PercentSalaryHike + TrainingTimesLastYear +
    WorkLifeBalance + YearsInCurrentRole +
    YearsSinceLastPromotion + YearsWithCurrManager,
  family = binomial(),
  data = train
)

# summary(glm.fit2)
# Score the test set with glm.fit2 and store the predicted probabilities on
# the test data frame.
test$GLM2Predsx <- predict(glm.fit2, newdata = test, type = "response")

# Convert the probability predictions to 0/1 labels (cut-off 0.5) so they
# match the coding of the response variable.
test$GLM2Preds <- ifelse(test$GLM2Predsx >= 0.5, 1, 0)

# Confusion matrix to compare predicted and actual classes.
# The factor levels are set explicitly: as.factor() would derive them from
# the values that happen to occur, so a prediction vector containing only one
# class would no longer share the reference's level set.
# NOTE(review): caret reports class 0 as the 'Positive' class here, so
# Sensitivity refers to class 0; pass positive = "1" if class 1 is the
# outcome of interest -- confirm the intended coding of Attrition.
caret::confusionMatrix(
  factor(test$GLM2Preds, levels = c(0, 1)),
  factor(test$Attrition, levels = c(0, 1))
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 37  4
##          1 10 29
##                                           
##                Accuracy : 0.825           
##                  95% CI : (0.7238, 0.9009)
##     No Information Rate : 0.5875          
##     P-Value [Acc > NIR] : 4.97e-06        
##                                           
##                   Kappa : 0.6485          
##                                           
##  Mcnemar's Test P-Value : 0.1814          
##                                           
##             Sensitivity : 0.7872          
##             Specificity : 0.8788          
##          Pos Pred Value : 0.9024          
##          Neg Pred Value : 0.7436          
##              Prevalence : 0.5875          
##          Detection Rate : 0.4625          
##    Detection Prevalence : 0.5125          
##       Balanced Accuracy : 0.8330          
##                                           
##        'Positive' Class : 0               
## 

#Decision Tree Model

library(rpart)

# Seed the RNG before fitting so the cross-validated error estimates in the
# tree's CP table are reproducible.
set.seed(200)

# Classification tree for Attrition using every other column of `train` as a
# candidate predictor. (A previous, immediately overwritten fit without
# `method` was removed: it was never used.)
dt.model <- rpart(
  formula = Attrition ~ .,
  data = train,
  method = "class"
)

# Draw the fitted tree (no subtitle).
rattle::fancyRpartPlot(dt.model, sub = "")

# Variable importance scores stored on the fitted tree.
dt.model$variable.importance
##                 OverTime           YearsAtCompany        TotalWorkingYears 
##               36.5186990               29.8530139               27.1688954 
##       YearsInCurrentRole     YearsWithCurrManager                      Age 
##               23.2306710               19.1897333               18.2343038 
##            MonthlyIncome                  JobRole         DistanceFromHome 
##               16.7615484                9.4283819                7.1548112 
##          WorkLifeBalance                 JobLevel  YearsSinceLastPromotion 
##                6.9738409                4.8514619                4.7911344 
##           JobInvolvement RelationshipSatisfaction       NumCompaniesWorked 
##                3.9248484                3.7696722                2.6501035 
##           EducationField          JobSatisfaction               Department 
##                2.1856423                1.5627385                0.7786298 
##           BusinessTravel 
##                0.3114519
# Print the complexity-parameter (CP) table of the fitted tree, including the
# variables actually used and the cross-validated error (xerror) per split count.
printcp(dt.model)
## 
## Classification tree:
## rpart(formula = Attrition ~ ., data = train, method = "class")
## 
## Variables actually used in tree construction:
## [1] Age                JobInvolvement     JobRole            OverTime          
## [5] WorkLifeBalance    YearsAtCompany     YearsInCurrentRole
## 
## Root node error: 152/318 = 0.47799
## 
## n= 318 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.453947      0   1.00000 1.00000 0.058603
## 2 0.098684      1   0.54605 0.54605 0.051525
## 3 0.052632      2   0.44737 0.51316 0.050477
## 4 0.026316      3   0.39474 0.44737 0.048102
## 5 0.019737      5   0.34211 0.46711 0.048856
## 6 0.010965      7   0.30263 0.47368 0.049099
## 7 0.010000     10   0.26974 0.46711 0.048856
library(rpart.plot)

# Plot the cross-validation results across the tree's complexity parameters.
plotcp(dt.model)

# Prune the tree back to the CP value whose cross-validated error (xerror)
# is lowest in the CP table.
cp_table <- dt.model$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
dt.model_2 <- prune(dt.model, cp = best_cp)

rpart.plot(dt.model_2, yesno = TRUE)

# Predicted class labels for the test set from the pruned tree.
dt.pred <- predict(dt.model_2, newdata = test, type = "class")

# Predicted vs. actual classes on the test set.
plot(
  x = as.factor(test$Attrition),
  y = dt.pred,
  main = "Simple Classification: Predicted vs. Actual",
  xlab = "Actual",
  ylab = "Predicted"
)

caret::confusionMatrix(
  data = dt.pred,
  reference = as.factor(test$Attrition)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 37  6
##          1 10 27
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.6956, 0.8811)
##     No Information Rate : 0.5875          
##     P-Value [Acc > NIR] : 4.709e-05       
##                                           
##                   Kappa : 0.5947          
##                                           
##  Mcnemar's Test P-Value : 0.4533          
##                                           
##             Sensitivity : 0.7872          
##             Specificity : 0.8182          
##          Pos Pred Value : 0.8605          
##          Neg Pred Value : 0.7297          
##              Prevalence : 0.5875          
##          Detection Rate : 0.4625          
##    Detection Prevalence : 0.5375          
##       Balanced Accuracy : 0.8027          
##                                           
##        'Positive' Class : 0               
## 

# Random Forest Model

library(randomForestSRC)
## 
##  randomForestSRC 3.1.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
## 
## Attaching package: 'randomForestSRC'
## The following object is masked from 'package:purrr':
## 
##     partial
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Seed the RNG so the forest is reproducible.
set.seed(123)

# Random forest classifier for Attrition (converted to a factor in the
# formula) on all other columns of `train`: 4000 trees, 4 variables tried at
# each split, minimum terminal node size 6, with importance measures computed.
forest_attrition <- randomForest(
  formula = as.factor(Attrition) ~ .,
  data = train,
  mtry = 4,
  importance = TRUE,
  nodesize = 6,
  ntree = 4000
)

# Show the fit summary (OOB error estimate and confusion matrix).
print(forest_attrition)
## 
## Call:
##  randomForest(formula = as.factor(Attrition) ~ ., data = train,      mtry = 4, importance = TRUE, nodesize = 6, ntree = 4000) 
##                Type of random forest: classification
##                      Number of trees: 4000
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 18.87%
## Confusion matrix:
##     0   1 class.error
## 0 117  35   0.2302632
## 1  25 141   0.1506024
# Importance matrix stored on the fitted forest: per-class (0 / 1) and overall
# mean decrease in accuracy, plus mean decrease in Gini, for each predictor.
forest_attrition$importance
##                                      0            1 MeanDecreaseAccuracy
## Age                       1.430434e-02 1.842709e-02         1.629812e-02
## BusinessTravel            5.849689e-04 4.773240e-04         5.289570e-04
## Department               -3.635458e-04 5.939141e-03         2.911026e-03
## DistanceFromHome          3.033042e-05 2.198226e-03         1.164164e-03
## Education                 7.355907e-06 1.774369e-03         9.388388e-04
## EducationField            3.861125e-05 8.368406e-05         6.142557e-05
## EnvironmentSatisfaction  -5.824322e-04 1.138020e-03         3.011791e-04
## Gender                    1.869168e-04 4.015145e-04         3.054460e-04
## JobInvolvement            6.499667e-03 6.109738e-03         6.286215e-03
## JobLevel                  8.155532e-03 1.460965e-02         1.149369e-02
## JobRole                   5.237617e-03 1.395944e-02         9.742046e-03
## JobSatisfaction          -4.462138e-04 1.530078e-03         5.907809e-04
## MaritalStatus             7.805321e-03 6.471305e-03         7.085827e-03
## MonthlyIncome             1.060592e-02 1.886684e-02         1.483210e-02
## NumCompaniesWorked        6.593085e-04 3.469472e-03         2.096233e-03
## OverTime                  5.328765e-02 5.843386e-02         5.574207e-02
## PercentSalaryHike        -2.069767e-03 6.789197e-05        -9.561063e-04
## PerformanceRating        -1.253912e-04 8.126190e-05        -2.578386e-05
## RelationshipSatisfaction  3.691584e-04 1.234608e-03         7.829124e-04
## Shift                     1.158897e-02 9.679083e-03         1.059665e-02
## TotalWorkingYears         1.709703e-02 1.647894e-02         1.668216e-02
## TrainingTimesLastYear     1.515487e-03 3.163348e-04         8.751312e-04
## WorkLifeBalance           1.496394e-03 2.732453e-03         2.105492e-03
## YearsAtCompany            1.963235e-02 2.116314e-02         2.038626e-02
## YearsInCurrentRole        1.361941e-02 1.603535e-02         1.488709e-02
## YearsSinceLastPromotion  -8.045431e-04 2.769669e-03         1.081270e-03
## YearsWithCurrManager      1.526900e-02 1.848750e-02         1.679891e-02
##                          MeanDecreaseGini
## Age                            10.3722897
## BusinessTravel                  1.3561160
## Department                      2.2617323
## DistanceFromHome                4.6806382
## Education                       2.3374612
## EducationField                  1.8734543
## EnvironmentSatisfaction         2.1928515
## Gender                          0.8757624
## JobInvolvement                  4.4733704
## JobLevel                        4.3247900
## JobRole                         4.3743525
## JobSatisfaction                 2.2650832
## MaritalStatus                   3.9258642
## MonthlyIncome                   9.3171168
## NumCompaniesWorked              3.7408885
## OverTime                       17.9290236
## PercentSalaryHike               2.8176257
## PerformanceRating               0.4275825
## RelationshipSatisfaction        1.9329476
## Shift                           4.7592462
## TotalWorkingYears               9.1174986
## TrainingTimesLastYear           2.5198297
## WorkLifeBalance                 2.7272578
## YearsAtCompany                  9.7457075
## YearsInCurrentRole              6.8144949
## YearsSinceLastPromotion         2.7564217
## YearsWithCurrManager            7.4030528
# Predicted class labels for the test set from the random forest.
pred1_att <- predict(forest_attrition, newdata = test, type = "response")

# Random Forest Accuracy

# Confusion matrix of random forest predictions against the actual test labels.
caret::confusionMatrix(
  data = pred1_att,
  reference = as.factor(test$Attrition)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 40  3
##          1  7 30
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.7821, 0.9384)
##     No Information Rate : 0.5875          
##     P-Value [Acc > NIR] : 1.976e-08       
##                                           
##                   Kappa : 0.7467          
##                                           
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.8511          
##             Specificity : 0.9091          
##          Pos Pred Value : 0.9302          
##          Neg Pred Value : 0.8108          
##              Prevalence : 0.5875          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.5375          
##       Balanced Accuracy : 0.8801          
##                                           
##        'Positive' Class : 0               
##