library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(stringr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gtools)
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(performance)
library(ggpubr)
library(logistf)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
library(corrplot)
## corrplot 0.92 loaded
library(lattice)
library(caret)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:plotly':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
library(caTools)
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loaded gam 1.20.2
library(tidyverse)
library(car)
library(broom)
library(DescTools)
##
## Attaching package: 'DescTools'
##
## The following object is masked from 'package:foreach':
##
## %:%
##
## The following objects are masked from 'package:MLmetrics':
##
## AUC, Gini, MAE, MAPE, MSE, RMSE
##
## The following object is masked from 'package:car':
##
## Recode
##
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
library(ROCR)
##
## Attaching package: 'ROCR'
##
## The following object is masked from 'package:performance':
##
## performance
library(lmtest)
library(readr)
# Load the raw employee data; column types are guessed by readr.
healthcare <- readr::read_csv(file = "watson_healthcare_modified.csv")
## Rows: 1676 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): EmployeeID, Age, DailyRate, DistanceFromHome, Education, EmployeeC...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure (column names, types, first values) of the raw data.
utils::str(healthcare)
## spec_tbl_df [1,676 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ EmployeeID : num [1:1676] 1313919 1200302 1060315 1272912 1414939 ...
## $ Age : num [1:1676] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1676] "No" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1676] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num [1:1676] 1102 279 1373 1392 591 ...
## $ Department : chr [1:1676] "Cardiology" "Maternity" "Maternity" "Maternity" ...
## $ DistanceFromHome : num [1:1676] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1676] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1676] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num [1:1676] 1 1 1 1 1 1 1 1 1 1 ...
## $ EnvironmentSatisfaction : num [1:1676] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1676] "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num [1:1676] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1676] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1676] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1676] "Nurse" "Other" "Nurse" "Other" ...
## $ JobSatisfaction : num [1:1676] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1676] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1676] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1676] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1676] 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr [1:1676] "Y" "Y" "Y" "Y" ...
## $ OverTime : chr [1:1676] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1676] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1676] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1676] 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num [1:1676] 80 80 80 80 80 80 80 80 80 80 ...
## $ Shift : num [1:1676] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1676] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1676] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1676] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1676] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1676] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1676] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1676] 5 7 0 0 2 6 0 0 8 7 ...
## - attr(*, "spec")=
## .. cols(
## .. EmployeeID = col_double(),
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. Department = col_character(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. EducationField = col_character(),
## .. EmployeeCount = col_double(),
## .. EnvironmentSatisfaction = col_double(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. JobLevel = col_double(),
## .. JobRole = col_character(),
## .. JobSatisfaction = col_double(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. Over18 = col_character(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. StandardHours = col_double(),
## .. Shift = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Is there any missing value anywhere in the data?
base::anyNA(healthcare)
## [1] FALSE
# Show every row that is an exact duplicate of an earlier row.
healthcare[base::duplicated(healthcare), ]
## # A tibble: 0 × 35
## # … with 35 variables: EmployeeID <dbl>, Age <dbl>, Attrition <chr>,
## # BusinessTravel <chr>, DailyRate <dbl>, Department <chr>,
## # DistanceFromHome <dbl>, Education <dbl>, EducationField <chr>,
## # EmployeeCount <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, …
# no duplicated data
# Compact column-wise preview of the data.
dplyr::glimpse(healthcare)
## Rows: 1,676
## Columns: 35
## $ EmployeeID <dbl> 1313919, 1200302, 1060315, 1272912, 1414939, …
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2…
## $ Attrition <chr> "No", "No", "Yes", "No", "No", "No", "No", "N…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Travel…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358,…
## $ Department <chr> "Cardiology", "Maternity", "Maternity", "Mate…
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, …
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, …
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "L…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, …
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", "…
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 4…
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, …
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, …
## $ JobRole <chr> "Nurse", "Other", "Nurse", "Other", "Nurse", …
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, …
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "Ma…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1…
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, …
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ Shift <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, …
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3…
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, …
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, …
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4,…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, …
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, …
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, …
# This dataset consists of 35 features (variables) and 1,676 observations (rows). There are 9 categorical columns and 26 numerical columns.
# Note: Age is kept numeric here; it is only binned into categories later, for plotting.
# Five-number summary (plus mean) of every numeric column.
healthcare %>%
  select_if(is.numeric) %>%
  summary()
## EmployeeID Age DailyRate DistanceFromHome
## Min. :1025177 Min. :18.00 Min. : 102.0 Min. : 1.000
## 1st Qu.:1235832 1st Qu.:30.00 1st Qu.: 465.0 1st Qu.: 2.000
## Median :1464606 Median :36.00 Median : 796.5 Median : 7.000
## Mean :1456796 Mean :36.87 Mean : 800.6 Mean : 9.222
## 3rd Qu.:1667992 3rd Qu.:43.00 3rd Qu.:1157.0 3rd Qu.:14.000
## Max. :1886378 Max. :60.00 Max. :1499.0 Max. :29.000
## Education EmployeeCount EnvironmentSatisfaction HourlyRate
## Min. :1.000 Min. :1 Min. :1.000 Min. : 30.00
## 1st Qu.:2.000 1st Qu.:1 1st Qu.:2.000 1st Qu.: 48.00
## Median :3.000 Median :1 Median :3.000 Median : 65.50
## Mean :2.908 Mean :1 Mean :2.715 Mean : 65.47
## 3rd Qu.:4.000 3rd Qu.:1 3rd Qu.:4.000 3rd Qu.: 83.00
## Max. :5.000 Max. :1 Max. :4.000 Max. :100.00
## JobInvolvement JobLevel JobSatisfaction MonthlyIncome
## Min. :1.000 Min. :1.000 Min. :1.000 Min. : 1009
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 2928
## Median :3.000 Median :2.000 Median :3.000 Median : 4899
## Mean :2.725 Mean :2.067 Mean :2.739 Mean : 6517
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.: 8380
## Max. :4.000 Max. :5.000 Max. :4.000 Max. :19999
## MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## Min. : 2094 Min. :0.000 Min. :11.0 Min. :3.00
## 1st Qu.: 7993 1st Qu.:1.000 1st Qu.:12.0 1st Qu.:3.00
## Median :14270 Median :2.000 Median :14.0 Median :3.00
## Mean :14287 Mean :2.662 Mean :15.2 Mean :3.15
## 3rd Qu.:20462 3rd Qu.:4.000 3rd Qu.:18.0 3rd Qu.:3.00
## Max. :26999 Max. :9.000 Max. :25.0 Max. :4.00
## RelationshipSatisfaction StandardHours Shift TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.718 Mean :80 Mean :0.8061 Mean :11.34
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.805 Mean :2.766 Mean : 7.033 Mean : 4.265
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 2.000
## Median : 1.0 Median : 3.000
## Mean : 2.2 Mean : 4.135
## 3rd Qu.: 3.0 3rd Qu.: 7.000
## Max. :15.0 Max. :17.000
# Drop the columns that are not used in the analysis. They are removed by
# name instead of by position so the step keeps working if the column order
# of the CSV changes. Base subsetting is used because, at this point in the
# script, MASS::select() masks dplyr::select().
healthcare <- healthcare[
  setdiff(
    names(healthcare),
    c(
      "Over18", "EmployeeCount", "EmployeeID", "StandardHours",
      "HourlyRate", "MonthlyRate", "DailyRate"
    )
  )
]
# Re-inspect the structure after the column drop.
utils::str(healthcare)
## tibble [1,676 × 28] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:1676] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1676] "No" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1676] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ Department : chr [1:1676] "Cardiology" "Maternity" "Maternity" "Maternity" ...
## $ DistanceFromHome : num [1:1676] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1676] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1676] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EnvironmentSatisfaction : num [1:1676] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1676] "Female" "Male" "Male" "Female" ...
## $ JobInvolvement : num [1:1676] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1676] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1676] "Nurse" "Other" "Nurse" "Other" ...
## $ JobSatisfaction : num [1:1676] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1676] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1676] 5993 5130 2090 2909 3468 ...
## $ NumCompaniesWorked : num [1:1676] 8 1 6 1 9 0 4 1 0 6 ...
## $ OverTime : chr [1:1676] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1676] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1676] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1676] 1 4 2 3 4 3 1 2 2 2 ...
## $ Shift : num [1:1676] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1676] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1676] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1676] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1676] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1676] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1676] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1676] 5 7 0 0 2 6 0 0 8 7 ...
# The number of columns is now reduced from 35 to 28. Preview the first rows of the trimmed data:
# Print the first six rows.
utils::head(healthcare)
## # A tibble: 6 × 28
## Age Attrition Busin…¹ Depar…² Dista…³ Educa…⁴ Educa…⁵ Envir…⁶ Gender JobIn…⁷
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 41 No Travel… Cardio… 1 2 Life S… 2 Female 3
## 2 49 No Travel… Matern… 8 1 Life S… 3 Male 2
## 3 37 Yes Travel… Matern… 2 2 Other 4 Male 2
## 4 33 No Travel… Matern… 3 4 Life S… 4 Female 3
## 5 27 No Travel… Matern… 2 1 Medical 1 Male 3
## 6 32 No Travel… Matern… 2 2 Life S… 4 Male 3
## # … with 18 more variables: JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # NumCompaniesWorked <dbl>, OverTime <chr>, PercentSalaryHike <dbl>,
## # PerformanceRating <dbl>, RelationshipSatisfaction <dbl>, Shift <dbl>,
## # TotalWorkingYears <dbl>, TrainingTimesLastYear <dbl>,
## # WorkLifeBalance <dbl>, YearsAtCompany <dbl>, YearsInCurrentRole <dbl>,
## # YearsSinceLastPromotion <dbl>, YearsWithCurrManager <dbl>, and …
#table(healthcare$JobRole)
# Merge the "Admin" label into "Administrative" and store JobRole as a
# factor. The car:: prefix is required because DescTools masks Recode().
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
healthcare$JobRole <- car::Recode(
  healthcare$JobRole,
  recodes = "'Admin'='Administrative'",
  as.factor = TRUE
)
# Remove MASS from the search path so its select() no longer masks
# dplyr::select(). Only the attachment is dropped: the MASS namespace is
# imported by other loaded packages, so asking to unload it (unload = TRUE)
# could never succeed and only produced a warning.
detach("package:MASS", character.only = TRUE)
#library(conflicted)
#conflict_prefer("select", "dplyr")
# Stacked bar chart: employee count per gender, split by attrition status.
# The count labels are stacked through `group`; geom_text() has no `fill`
# aesthetic, so mapping `fill` there only raised an "unknown aesthetics"
# warning. count() already keeps just the counted columns, so no select().
d1_plot <- healthcare %>%
  count(Gender, Attrition) %>%
  ggplot(aes(x = Gender, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.2, reverse = FALSE),
    size = 4
  ) +
  labs(fill = "Attrition", y = "Count") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
# Stacked bar chart: employee count per education field, split by attrition.
# Labels are stacked via `group` because geom_text() does not support `fill`
# (mapping it only produced an "unknown aesthetics" warning).
d2_plot <- healthcare %>%
  count(EducationField, Attrition) %>%
  ggplot(aes(x = EducationField, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE),
    size = 4
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(fill = "Attrition", y = "")
# Stacked bar chart: employee count per department, split by attrition.
# Labels are stacked via `group` because geom_text() does not support `fill`
# (mapping it only produced an "unknown aesthetics" warning).
d3_plot <- healthcare %>%
  count(Department, Attrition) %>%
  ggplot(aes(x = Department, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  # NOTE(review): these labels are applied by position, so they assume the
  # Department values sort in exactly this order -- confirm against the data.
  scale_x_discrete(labels = c("Cardiology", "Maternity", "Neurology")) +
  labs(fill = "Attrition", y = "Count")
# Horizontal stacked bar chart: employee count per job role, split by
# attrition. Labels are stacked via `group` because geom_text() does not
# support `fill` (mapping it only produced an "unknown aesthetics" warning).
d4_plot <- healthcare %>%
  count(JobRole, Attrition) %>%
  ggplot(aes(x = JobRole, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.3, reverse = FALSE),
    size = 3.5
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(fill = "Attrition", x = "Job Role", y = "Count")
# Stacked bar chart: employee count per job level, split by attrition.
# Labels are stacked via `group` because geom_text() does not support `fill`
# (mapping it only produced an "unknown aesthetics" warning).
d5_plot <- healthcare %>%
  count(JobLevel, Attrition) %>%
  ggplot(aes(x = JobLevel, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.8, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(fill = "Attrition", x = "Job Level", y = "")
# Stacked bar chart: employee count per age band, split by attrition.
# Age is binned with cut() on left-closed intervals ([20, 26) -> "20-25",
# ...). For whole-number ages this gives the same bands as the former
# nested ifelse() chain, and the factor levels come out already ordered, so
# no re-levelling is needed inside aes(). Labels are stacked via `group`
# because geom_text() does not support `fill`.
d6_plot <- healthcare %>%
  mutate(
    Age = cut(
      Age,
      breaks = c(-Inf, 20, 26, 31, 36, 41, 46, 51, 56, Inf),
      labels = c(
        "18-19", "20-25", "26-30", "31-35", "36-40",
        "41-45", "46-50", "51-55", ">55"
      ),
      right = FALSE
    )
  ) %>%
  count(Age, Attrition) %>%
  ggplot(aes(x = Age, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.3, reverse = FALSE),
    size = 3
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(fill = "Attrition", x = "Age", y = "")
# Combine the gender, education-field, department and job-level charts in a
# 2 x 2 grid with one shared legend at the bottom, then display the grid.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
demography_plot <- ggarrange(
  d1_plot, d2_plot, d3_plot, d5_plot,
  ncol = 2, nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)
demography_plot
# Place the job-role and age-band charts side by side with one shared legend
# at the bottom, then display the result.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
demography_plot2 <- ggarrange(
  d4_plot, d6_plot,
  ncol = 2, nrow = 1,
  common.legend = TRUE,
  legend = "bottom"
)
demography_plot2
# 1. TotalWorkingYears
# Stacked bar chart: employee count per 5-year band of total working
# experience, split by attrition. The bands are built with cut() on
# left-closed intervals ([5, 10) -> "5-9", ...). This drops the former
# "26-29" branch, which could never be reached because "25-29" was tested
# first, and it yields ordered factor levels directly. Labels are stacked
# via `group` because geom_text() does not support `fill`.
plot_exp1 <- healthcare %>%
  mutate(
    binning_workingyears = cut(
      TotalWorkingYears,
      breaks = c(-Inf, 5, 10, 15, 20, 25, 30, 35, 40, Inf),
      labels = c(
        "0-4", "5-9", "10-14", "15-19", "20-24",
        "25-29", "30-34", "35-39", ">=40"
      ),
      right = FALSE
    )
  ) %>%
  count(binning_workingyears, Attrition) %>%
  ggplot(aes(x = binning_workingyears, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.3, reverse = FALSE),
    size = 3
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Total Working Experience",
    fill = "Attrition", x = "Total Working (Years)", y = ""
  )
# 2. NumCompaniesWorked
# Stacked bar chart: employee count per band of number of companies worked
# for, split by attrition. Bands come from cut() on left-closed intervals
# ([2, 4) -> "2-3", ...), which matches the former nested ifelse() chain for
# whole-number values and yields ordered factor levels directly. Labels are
# stacked via `group` because geom_text() does not support `fill`.
plot_exp2 <- healthcare %>%
  mutate(
    binning = cut(
      NumCompaniesWorked,
      breaks = c(-Inf, 2, 4, 6, 8, Inf),
      labels = c("0-1", "2-3", "4-5", "6-7", ">7"),
      right = FALSE
    )
  ) %>%
  count(binning, Attrition) %>%
  ggplot(aes(x = binning, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Total Company Worked",
    fill = "Attrition", x = "Total Company Worked", y = ""
  )
# 3. YearsAtCompany
# Stacked bar chart: employee count per 5-year band of tenure at the
# company, split by attrition. The bands are built with cut() on left-closed
# intervals ([5, 10) -> "5-9", ...). This drops the former "26-29" branch,
# which could never be reached because "25-29" was tested first, and it
# yields ordered factor levels directly. Labels are stacked via `group`
# because geom_text() does not support `fill`.
plot_exp3 <- healthcare %>%
  mutate(
    binning_years = cut(
      YearsAtCompany,
      breaks = c(-Inf, 5, 10, 15, 20, 25, 30, 35, 40, Inf),
      labels = c(
        "0-4", "5-9", "10-14", "15-19", "20-24",
        "25-29", "30-34", "35-39", ">=40"
      ),
      right = FALSE
    )
  ) %>%
  count(binning_years, Attrition) %>%
  ggplot(aes(x = binning_years, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.2, reverse = FALSE),
    size = 3
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Years at Company",
    fill = "Attrition", x = "Years at Company", y = ""
  )
# 4. TrainingTimesLastYear
# Stacked bar chart: employee count per band of trainings completed last
# year, split by attrition. Bands come from cut() on left-closed intervals
# ([2, 4) -> "2-3", ...), which matches the former nested ifelse() chain for
# whole-number values and yields ordered factor levels directly. Labels are
# stacked via `group` because geom_text() does not support `fill`.
plot_exp4 <- healthcare %>%
  mutate(
    binning = cut(
      TrainingTimesLastYear,
      breaks = c(-Inf, 2, 4, 6, Inf),
      labels = c("0-1", "2-3", "4-5", ">5"),
      right = FALSE
    )
  ) %>%
  count(binning, Attrition) %>%
  ggplot(aes(x = binning, y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.8, reverse = FALSE),
    size = 4
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 40),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Total Completed Training",
    fill = "Attrition", x = "Total Training Completed", y = ""
  )
# Combine the four experience charts in a 2 x 2 grid with one shared legend
# at the bottom, then display the grid.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
experience_plot <- ggarrange(
  plot_exp1, plot_exp2, plot_exp3, plot_exp4,
  ncol = 2, nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)
experience_plot
# 1. EnvironmentSatisfaction
# Stacked bar chart: employee count per environment-satisfaction score,
# split by attrition. The title now names the plotted variable (it used to
# read "Employee Satisfaction"). Labels are stacked via `group` because
# geom_text() does not support `fill`.
plot_sv1 <- healthcare %>%
  count(EnvironmentSatisfaction, Attrition) %>%
  ggplot(aes(x = factor(EnvironmentSatisfaction), y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Environment Satisfaction",
    fill = "Attrition", x = "", y = ""
  )
# 2. JobSatisfaction
# Stacked bar chart: employee count per job-satisfaction score, split by
# attrition. Labels are stacked via `group` because geom_text() does not
# support `fill` (mapping it only produced an "unknown aesthetics" warning).
plot_sv2 <- healthcare %>%
  count(JobSatisfaction, Attrition) %>%
  ggplot(aes(x = factor(JobSatisfaction), y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Job Satisfaction",
    fill = "Attrition", x = "", y = ""
  )
# 3. JobInvolvement
# Stacked bar chart: employee count per job-involvement score, split by
# attrition. Labels are stacked via `group` because geom_text() does not
# support `fill` (mapping it only produced an "unknown aesthetics" warning).
plot_sv3 <- healthcare %>%
  count(JobInvolvement, Attrition) %>%
  ggplot(aes(x = factor(JobInvolvement), y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Job Involvement",
    fill = "Attrition", x = "", y = ""
  )
# 4. WorkLifeBalance
# Stacked bar chart: employee count per work-life-balance score, split by
# attrition. Labels are stacked via `group` because geom_text() does not
# support `fill` (mapping it only produced an "unknown aesthetics" warning).
plot_sv4 <- healthcare %>%
  count(WorkLifeBalance, Attrition) %>%
  ggplot(aes(x = factor(WorkLifeBalance), y = n)) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(label = n, group = factor(Attrition, levels = c("Yes", "No"))),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Work Life Balance",
    fill = "Attrition", x = "", y = ""
  )
# Combine the four survey-score charts in a 2 x 2 grid with one shared
# legend at the bottom, then display the grid.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
csurvey_plot <- ggarrange(
  plot_sv1, plot_sv2, plot_sv3, plot_sv4,
  ncol = 2, nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)
csurvey_plot
# Donut-style chart of the overall attrition rate (percent per status).
# `start` is an argument of coord_polar(), not of the bar layer; passing it
# to the bar layer as well only raised an "unknown parameters" warning, so
# it is now given to coord_polar() alone.
healthcare %>%
  count(Attrition) %>%
  mutate(
    percent = round((n / sum(n)) * 100, 2),
    # Label position: running total of percent minus 70% of the slice's own
    # percent. This depends on the row order produced by count().
    lab_ypos = cumsum(percent) - 0.7 * percent
  ) %>%
  ggplot(
    aes(x = 2, y = percent, fill = factor(Attrition, levels = c("Yes", "No")))
  ) +
  geom_col() +
  coord_polar(theta = "y", start = 0) +
  geom_text(
    aes(y = lab_ypos, label = paste0(percent, ' ', '%')),
    color = "white"
  ) +
  theme_void() +
  theme(legend.position = "bottom") +
  xlim(0.5, 2.5) +
  labs(title = "Attrition Rate in Health Care ", fill = "Attrition")
# Interactive stacked bar chart: attrition share within each department.
# `percent` is computed per department; the `text` aesthetic carries the
# tooltip shown by ggplotly(). Labels are stacked via `group` because
# geom_text() does not support `fill` (mapping it only produced an
# "unknown aesthetics" warning).
temp_plot1 <- healthcare %>%
  count(Department, Attrition) %>%
  group_by(Department) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ggplot(aes(
    x = Department, y = n,
    text = paste0('</br>Department: ', Department,
                  '</br>Attrition Status: ', Attrition,
                  '</br>Count: ', n,
                  '</br>Percentage: ', percent, ' ', '%')
  )) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(
      label = paste0(percent, ' ', '%'),
      group = factor(Attrition, levels = c("Yes", "No"))
    ),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(title = "Attrition Rate per Department", fill = "Attrition", y = "")
ggplotly(temp_plot1, tooltip = "text")
# Interactive stacked bar chart: attrition share within each gender.
# Gender is coerced to character first (it is already read as character by
# read_csv, so this acts as a safeguard only). `percent` is computed per
# gender; the `text` aesthetic carries the ggplotly() tooltip. Labels are
# stacked via `group` because geom_text() does not support `fill`.
healthcare$Gender <- as.character(healthcare$Gender)
temp_plot22 <- healthcare %>%
  count(Gender, Attrition) %>%
  group_by(Gender) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ggplot(aes(
    x = factor(Gender), y = n,
    text = paste0('</br>Gender: ', Gender,
                  '</br>Attrition Status: ', Attrition,
                  '</br>Count: ', n,
                  '</br>Percentage: ', percent, ' ', '%')
  )) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(
      label = paste0(percent, ' ', '%'),
      group = factor(Attrition, levels = c("Yes", "No"))
    ),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Attrition Rate per Gender",
    fill = "Attrition", x = "Gender", y = ""
  )
ggplotly(temp_plot22, tooltip = "text")
# Job Level
# Interactive stacked bar chart: attrition share within each job level.
# `percent` is computed per job level; the `text` aesthetic carries the
# ggplotly() tooltip. Labels are stacked via `group` because geom_text()
# does not support `fill` (mapping it only produced an "unknown aesthetics"
# warning).
temp_plot2 <- healthcare %>%
  count(JobLevel, Attrition) %>%
  group_by(JobLevel) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ggplot(aes(
    x = factor(JobLevel), y = n,
    text = paste0('</br>Job Level: ', JobLevel,
                  '</br>Attrition Status: ', Attrition,
                  '</br>Count: ', n,
                  '</br>Percentage: ', percent, ' ', '%')
  )) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(
      label = paste0(percent, ' ', '%'),
      group = factor(Attrition, levels = c("Yes", "No"))
    ),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Attrition Rate per Job Level",
    fill = "Attrition", x = "Job Level", y = ""
  )
ggplotly(temp_plot2, tooltip = "text")
# Salary & Satisfaction Score ----
# Horizontal bar chart of the mean monthly income (rounded to 2 decimals)
# for every job role, drawn in one facet per attrition status. The income
# value is printed in white inside each bar.
avgincome_plot1 <- healthcare %>%
  group_by(JobRole, Attrition) %>%
  summarise(avg_monthly_income = round(mean(MonthlyIncome), 2)) %>%
  ggplot(
    aes(
      x = JobRole,
      y = avg_monthly_income,
      fill = factor(Attrition, levels = c("Yes", "No"))
    )
  ) +
  geom_col(position = "dodge") +
  geom_text(
    aes(
      x = JobRole,
      y = 1000,
      label = paste0("$", " ", avg_monthly_income)
    ),
    fontface = "bold",
    color = "white",
    size = 2.5,
    nudge_y = 1500
  ) +
  facet_wrap(~Attrition) +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Job Role",
    y = "Average Income (dollar)",
    fill = "Attrition",
    title = "Average Income\nby Job Role and Attrition Status"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
## `summarise()` has grouped output by 'JobRole'. You can override using the
## `.groups` argument.
avgincome_plot1
# Lollipop chart: median monthly income for each job-satisfaction score,
# one facet per attrition status. Each segment runs from 0 to the median
# and the value is printed slightly above it.
median_incomevsjobsscore <- healthcare %>%
  group_by(JobSatisfaction, Attrition) %>%
  summarise(median_income = median(MonthlyIncome)) %>%
  ggplot(
    aes(
      x = median_income,
      y = JobSatisfaction,
      color = factor(Attrition, levels = c("Yes", "No"))
    )
  ) +
  geom_point(size = 4) +
  geom_segment(
    aes(
      x = 0,
      xend = median_income,
      y = JobSatisfaction,
      yend = JobSatisfaction
    ),
    size = 2
  ) +
  geom_text(
    aes(
      x = 1000,
      y = JobSatisfaction,
      label = paste0("$ ", " ", median_income)
    ),
    nudge_y = 0.2
  ) +
  facet_wrap(~Attrition) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.major.x = element_line(linetype = "dashed", colour = "grey"),
    panel.grid.minor.x = element_line(linetype = "dashed", colour = "grey"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  ) +
  labs(
    x = "Median Income (dollar)",
    y = "Job Satisfaction Score",
    color = "Attrition",
    title = "Does Income affect on Job Satisfaction?\nby Attrition Status"
  )
## `summarise()` has grouped output by 'JobSatisfaction'. You can override using
## the `.groups` argument.
median_incomevsjobsscore
#summary(healthcare$PercentSalaryHike)
# Stacked bar chart of head-count per salary-hike percentage, split by
# attrition status. The small rotated labels show each status' percentage
# share within its salary-hike value; `text` is the ggplotly() tooltip.
plot_subsalary1 <- healthcare %>%
  count(PercentSalaryHike, Attrition) %>%
  group_by(PercentSalaryHike) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ggplot(
    aes(
      x = factor(PercentSalaryHike),
      y = n,
      text = paste0(
        "</br>Salary Hike Last Year (in percent): ", PercentSalaryHike,
        "</br>Attrition Status: ", Attrition,
        "</br>Count: ", n,
        "</br>Percentage: ", percent, " ", "%"
      )
    )
  ) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  geom_text(
    aes(
      label = paste0(percent, " ", "%"),
      fill = factor(Attrition, levels = c("Yes", "No"))
    ),
    angle = 90,
    fontface = "bold",
    color = "white",
    size = 2,
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Attrition Rate per Salary Hike Last Year\n in percent",
    fill = "Attrition",
    x = "Salary Hike (%)",
    y = ""
  )
## Warning: Ignoring unknown aesthetics: fill
ggplotly(plot_subsalary1, tooltip = "text")
# Mean environment-satisfaction score (rounded to 2 decimals) per job role,
# with one dashed line per attrition status and points coloured by status.
# Fixes: the title misspelled "Environment", and labs() named the `fill`
# aesthetic although the legend is produced by `color`.
healthcare %>%
  select(EnvironmentSatisfaction, JobRole, Attrition) %>%
  group_by(JobRole, Attrition) %>%
  summarize(avg_env_score = round(mean(EnvironmentSatisfaction), 2)) %>%
  ggplot(aes(x = JobRole, y = avg_env_score)) +
  geom_line(aes(group = Attrition), linetype = "twodash", size = 1) +
  geom_point(aes(color = Attrition), size = 3) +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90),
    axis.line = element_line(colour = "grey", size = 0.7, linetype = "solid"),
    panel.grid.major.x = element_line(size = 0.5, linetype = "dashed", colour = "lightgray"),
    panel.grid.minor.x = element_line(size = 0.5, linetype = "dashed", colour = "lightgray"),
    panel.grid.major.y = element_line(size = 0.5, linetype = "dashed", colour = "lightgray")
  ) +
  labs(
    x = "", y = "Average Score",
    color = "Attrition",
    title = "Average Environment Satisfaction Score\nper Job Role"
  )
## `summarise()` has grouped output by 'JobRole'. You can override using the
## `.groups` argument.
# Stacked bar chart of head-count per marital status, split by attrition
# status and faceted by gender. Bar labels show each status' percentage
# share within its gender/marital-status cell; `text` is the ggplotly()
# tooltip.
# Fix: the title was copy-pasted from the job-level plot and did not
# describe this chart.
temp_plot4 <- healthcare %>%
  select(Gender, MaritalStatus, Attrition) %>%
  count(Gender, MaritalStatus, Attrition) %>%
  group_by(Gender, MaritalStatus) %>%
  mutate(percent = round((n / sum(n)) * 100, 2)) %>%
  ggplot(aes(
    x = factor(MaritalStatus), y = n,
    text = paste0(
      '</br>Gender: ', Gender,
      '</br>Marital Status: ', MaritalStatus,
      '</br>Attrition Status: ', Attrition,
      '</br>Count: ', n,
      '</br>Percentage: ', percent, ' ', '%'
    )
  )) +
  geom_col(aes(fill = factor(Attrition, levels = c("Yes", "No")))) +
  facet_wrap(~Gender) +
  # NOTE(review): `fill` is not a geom_text aesthetic (hence the warning
  # below); presumably it is mapped so the labels stack like the bars.
  geom_text(
    aes(
      label = paste0(percent, ' ', '%'),
      fill = factor(Attrition, levels = c("Yes", "No"))
    ),
    position = position_stack(vjust = 0.5, reverse = FALSE)
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Attrition Rate per Gender and Marital Status",
    fill = "Attrition", x = "Marital Status", y = ""
  )
## Warning: Ignoring unknown aesthetics: fill
ggplotly(temp_plot4, tooltip = "text")
# Pearson chi-squared tests of independence: each categorical / ordinal
# predictor is cross-tabulated against Attrition, one test per variable.
# For the 2x2 tables (Gender, PerformanceRating, OverTime) the output shows
# that Yates' continuity correction is applied.
chisq.test(healthcare$BusinessTravel, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$BusinessTravel and healthcare$Attrition
## X-squared = 13.59, df = 2, p-value = 0.001119
chisq.test(healthcare$Department, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$Department and healthcare$Attrition
## X-squared = 8.0133, df = 2, p-value = 0.01819
chisq.test(healthcare$Education, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$Education and healthcare$Attrition
## X-squared = 9.0625, df = 4, p-value = 0.05956
# NOTE(review): this test warns that the chi-squared approximation may be
# incorrect -- presumably some expected cell counts are small; consider
# simulate.p.value = TRUE or fisher.test() to confirm the p-value.
chisq.test(healthcare$EducationField, healthcare$Attrition)
## Warning in chisq.test(healthcare$EducationField, healthcare$Attrition): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: healthcare$EducationField and healthcare$Attrition
## X-squared = 7.8745, df = 5, p-value = 0.1633
chisq.test(healthcare$EnvironmentSatisfaction, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$EnvironmentSatisfaction and healthcare$Attrition
## X-squared = 23.315, df = 3, p-value = 3.471e-05
chisq.test(healthcare$Gender, healthcare$Attrition)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: healthcare$Gender and healthcare$Attrition
## X-squared = 0.59123, df = 1, p-value = 0.4419
chisq.test(healthcare$JobInvolvement, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$JobInvolvement and healthcare$Attrition
## X-squared = 52.007, df = 3, p-value = 2.984e-11
chisq.test(healthcare$JobRole, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$JobRole and healthcare$Attrition
## X-squared = 43.64, df = 3, p-value = 1.799e-09
chisq.test(healthcare$JobSatisfaction, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$JobSatisfaction and healthcare$Attrition
## X-squared = 11.49, df = 3, p-value = 0.009353
chisq.test(healthcare$MaritalStatus, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$MaritalStatus and healthcare$Attrition
## X-squared = 72.489, df = 2, p-value < 2.2e-16
# Note the order here: PerformanceRating is tested before OverTime.
chisq.test(healthcare$PerformanceRating, healthcare$Attrition)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: healthcare$PerformanceRating and healthcare$Attrition
## X-squared = 0.11125, df = 1, p-value = 0.7387
chisq.test(healthcare$OverTime, healthcare$Attrition)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: healthcare$OverTime and healthcare$Attrition
## X-squared = 188.47, df = 1, p-value < 2.2e-16
chisq.test(healthcare$RelationshipSatisfaction, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$RelationshipSatisfaction and healthcare$Attrition
## X-squared = 1.5787, df = 3, p-value = 0.6642
chisq.test(healthcare$WorkLifeBalance, healthcare$Attrition)
##
## Pearson's Chi-squared test
##
## data: healthcare$WorkLifeBalance and healthcare$Attrition
## X-squared = 25.063, df = 3, p-value = 1.498e-05
library(knitr)
# Summary table of the chi-squared tests of independence (each categorical
# predictor vs. Attrition) run earlier in this file. Values are transcribed
# by hand from the chisq.test() output and truncated to two decimals;
# "Statistically Significant" uses the 0.05 level.
# Fix: the statistic, p-value and significance flag of "Over Time" and
# "Performance Rating" were swapped. The test output gives
# X-squared = 188.47 (p < 2.2e-16) for OverTime and X-squared = 0.11125
# (p = 0.7387) for PerformanceRating.
chisq_results <-
  data.frame(
    Variable = c(
      "Business Travel",
      "Department",
      "Education",
      "Education Field",
      "Environment Satisfaction",
      "Gender",
      "Job Involvement",
      "Job Role",
      "Job Satisfaction",
      "Marital Status",
      "Over Time",
      "Performance Rating",
      "Relationship Satisfaction",
      #"Stock Option Level",
      "Work Life Balance"
    ),
    Chi_Sq_Stat = c(
      13.59,  # Business Travel
      8.01,   # Department
      9.06,   # Education
      7.87,   # Education Field
      23.31,  # Environment Satisfaction
      0.59,   # Gender
      52.00,  # Job Involvement
      43.64,  # Job Role
      11.49,  # Job Satisfaction
      72.48,  # Marital Status
      188.47, # Over Time
      0.11,   # Performance Rating
      1.57,   # Relationship Satisfaction
      25.06   # Work Life Balance
    ),
    P_value = c(
      0.00, # Business Travel
      0.01, # Department
      0.05, # Education
      0.16, # Education Field
      0.00, # Environment Satisfaction
      0.44, # Gender
      0.00, # Job Involvement
      0.00, # Job Role
      0.00, # Job Satisfaction
      0.00, # Marital Status
      0.00, # Over Time
      0.73, # Performance Rating
      0.66, # Relationship Satisfaction
      0.00  # Work Life Balance
    ),
    Stat_Sig = c(
      "Yes", # Business Travel
      "Yes", # Department
      "No",  # Education
      "No",  # Education Field
      "Yes", # Environment Satisfaction
      "No",  # Gender
      "Yes", # Job Involvement
      "Yes", # Job Role
      "Yes", # Job Satisfaction
      "Yes", # Marital Status
      "Yes", # Over Time
      "No",  # Performance Rating
      "No",  # Relationship Satisfaction
      "Yes"  # Work Life Balance
    )
  )
kable(
  chisq_results,
  col.names = c("Variable", "Chi-Square Statistic", "p-value", "Statistically Significant")
)
Variable | Chi-Square Statistic | p-value | Statistically Significant |
---|---|---|---|
Business Travel | 13.59 | 0.00 | Yes |
Department | 8.01 | 0.01 | Yes |
Education | 9.06 | 0.05 | No |
Education Field | 7.87 | 0.16 | No |
Environment Satisfaction | 23.31 | 0.00 | Yes |
Gender | 0.59 | 0.44 | No |
Job Involvement | 52.00 | 0.00 | Yes |
Job Role | 43.64 | 0.00 | Yes |
Job Satisfaction | 11.49 | 0.00 | Yes |
Marital Status | 72.48 | 0.00 | Yes |
Over Time | 188.47 | 0.00 | Yes |
Performance Rating | 0.11 | 0.73 | No |
Relationship Satisfaction | 1.57 | 0.66 | No |
Work Life Balance | 25.06 | 0.00 | Yes |
library(stats)
# One-way ANOVA of two numeric predictors against Attrition, each followed
# by Levene's test for homogeneity of variance across the attrition groups.
# `aov.res` is reused, so after this section it holds the MonthlyIncome fit.
# ANOVA: Distance From Home and Attrition
aov.res = aov(DistanceFromHome~Attrition, data = healthcare)
summary(aov.res)
## Df Sum Sq Mean Sq F value Pr(>F)
## Attrition 1 1243 1242.7 18.87 1.48e-05 ***
## Residuals 1674 110237 65.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The warning below shows Attrition is not yet a factor here; leveneTest()
# coerces it. Levene's p-value (0.001427) indicates unequal group variances.
leveneTest(aov.res)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 10.204 0.001427 **
## 1674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): with unequal variances the Welch test below (disabled) would
# be the more defensible check -- consider enabling it.
#oneway.test(DistanceFromHome~Attrition, data = healthcare, var.equal = FALSE)
# ANOVA: Monthly Income and Attrition
aov.res = aov(MonthlyIncome~Attrition, data = healthcare)
summary(aov.res)
## Df Sum Sq Mean Sq F value Pr(>F)
## Attrition 1 1.403e+09 1.403e+09 65.14 1.32e-15 ***
## Residuals 1674 3.605e+10 2.153e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Levene's p-value (9.83e-09) again indicates unequal group variances.
leveneTest(aov.res)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 33.209 9.83e-09 ***
## 1674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): same remark as above regarding the disabled Welch test.
#oneway.test(MonthlyIncome~Attrition, data = healthcare, var.equal = FALSE)
# Summary table of the two one-way ANOVAs against Attrition: F statistic,
# ANOVA p-value, Levene's-test p-value and a significance flag. All values
# are typed in by hand from the console output, not computed here.
anova_results <- data.frame(
  Variable = c("Distance From Home", "Monthly Income"),
  F_Stat = c(18.87, 65.14),
  P_value = c(0.00, 0.00),
  Levene_Test = c(0.001, 0.00),
  Stat_Sig = c("Yes", "Yes")
)
kable(
  anova_results,
  col.names = c(
    "Variable",
    "F Statistic",
    "p-value",
    "Levene's Test p-value",
    "Statistically Significant"
  )
)
Variable | F Statistic | p-value | Levene’s Test p-value | Statistically Significant |
---|---|---|---|---|
Distance From Home | 18.87 | 0 | 0.001 | Yes |
Monthly Income | 65.14 | 0 | 0.000 | Yes |
# Build a class-balanced data set by down-sampling the majority class, then
# split it 80/20 into `train` and `test`.
# Improvements over the previous version: the down-sample size is taken from
# the minority class instead of a hard-coded 199, the training-set size is an
# explicit integer (sample() would silently truncate 0.8 * n), and TRUE/FALSE
# are spelled out. With the current data the resulting rows are unchanged.

# keep an untouched copy of the full data
healthcare2 <- healthcare
#extract all yes and nos from y
healthcare_yes <- healthcare %>% filter(Attrition == "Yes") #total 199 obs
healthcare_no <- healthcare %>% filter(Attrition == "No")
#sample as many obs from Attrition=='No' as there are Attrition=='Yes' rows
set.seed(200)
healthcare_no_sample <- sample_n(healthcare_no, nrow(healthcare_yes))
#combine sample nos and all yes' for a balanced data set
healthcare <- rbind(healthcare_no_sample, healthcare_yes)
dim(healthcare)
## [1] 398 28
# recode the response as 1 = attrition, 0 = no attrition
healthcare$Attrition <- ifelse(healthcare$Attrition == "Yes", 1, 0)
#80/20 train/test split
set.seed(200)
# `sample` holds the row indices of the training set (name kept as-is; it
# does not prevent calling the sample() function)
sample <- sample(nrow(healthcare), floor(0.8 * nrow(healthcare)), replace = FALSE)
train <- healthcare[sample, ]
test <- healthcare[-sample, ]
# Full logistic regression: Attrition (0/1) on every other column of `train`.
glm.fit1 <- glm(
  formula = Attrition ~ .,
  family = binomial(),
  data = train
)
summary(glm.fit1)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4603 -0.2201 0.0170 0.2755 3.2570
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.1065718 4.3156112 1.878 0.060322 .
## Age -0.0719136 0.0345662 -2.080 0.037483 *
## BusinessTravelTravel_Frequently 2.6912722 1.0182904 2.643 0.008219 **
## BusinessTravelTravel_Rarely 0.6745823 0.8636300 0.781 0.434743
## DepartmentMaternity -1.4461135 0.7109350 -2.034 0.041941 *
## DepartmentNeurology -1.2343627 0.8426828 -1.465 0.142975
## DistanceFromHome 0.0840412 0.0298161 2.819 0.004823 **
## Education -0.1455381 0.2349748 -0.619 0.535668
## EducationFieldLife Sciences -1.6967636 1.4747900 -1.151 0.249933
## EducationFieldMarketing -1.4798113 1.5956419 -0.927 0.353715
## EducationFieldMedical -1.4480374 1.4639739 -0.989 0.322607
## EducationFieldOther 0.4777449 2.0867393 0.229 0.818913
## EducationFieldTechnical Degree -1.2067540 1.5891890 -0.759 0.447642
## EnvironmentSatisfaction -0.5065119 0.2112972 -2.397 0.016523 *
## GenderMale 0.3494501 0.4663860 0.749 0.453693
## JobInvolvement -1.6612075 0.3934164 -4.223 2.42e-05 ***
## JobLevel -1.1624995 0.9293545 -1.251 0.210983
## JobRoleNurse 3.7228240 1.9508992 1.908 0.056358 .
## JobRoleOther 3.2870288 1.9976678 1.645 0.099881 .
## JobRoleTherapist 0.3164105 2.0271682 0.156 0.875966
## JobSatisfaction -0.7404001 0.2195938 -3.372 0.000747 ***
## MaritalStatusMarried 1.2649033 0.6843319 1.848 0.064548 .
## MaritalStatusSingle 3.9119036 0.8971047 4.361 1.30e-05 ***
## MonthlyIncome 0.0002637 0.0002237 1.179 0.238544
## NumCompaniesWorked 0.3155358 0.1063027 2.968 0.002995 **
## OverTimeYes 4.4069806 0.6299006 6.996 2.63e-12 ***
## PercentSalaryHike -0.1935204 0.1089278 -1.777 0.075635 .
## PerformanceRating 0.9724953 1.1572666 0.840 0.400719
## RelationshipSatisfaction -0.1748412 0.2010555 -0.870 0.384510
## Shift 0.5316162 0.4065238 1.308 0.190971
## TotalWorkingYears -0.0058947 0.0856518 -0.069 0.945132
## TrainingTimesLastYear -0.3694837 0.1957755 -1.887 0.059122 .
## WorkLifeBalance -0.9518480 0.3106259 -3.064 0.002182 **
## YearsAtCompany 0.1032067 0.1292205 0.799 0.424472
## YearsInCurrentRole -0.5514176 0.1689982 -3.263 0.001103 **
## YearsSinceLastPromotion 0.2539608 0.1147162 2.214 0.026841 *
## YearsWithCurrManager -0.2870282 0.1436914 -1.998 0.045767 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 440.23 on 317 degrees of freedom
## Residual deviance: 158.51 on 281 degrees of freedom
## AIC: 232.51
##
## Number of Fisher Scoring iterations: 7
# Stepwise model selection in both directions, driven by AIC, starting from
# the full model glm.fit1. The trace of every step is printed below.
# Note: step() is provided by the stats package; MASS::stepAIC() is the MASS
# counterpart. The selected model is only printed here, not stored.
step(glm.fit1, direction = "both")
## Start: AIC=232.51
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## Education + EducationField + EnvironmentSatisfaction + Gender +
## JobInvolvement + JobLevel + JobRole + JobSatisfaction + MaritalStatus +
## MonthlyIncome + NumCompaniesWorked + OverTime + PercentSalaryHike +
## PerformanceRating + RelationshipSatisfaction + Shift + TotalWorkingYears +
## TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## - EducationField 5 161.93 225.93
## - TotalWorkingYears 1 158.51 230.51
## - Education 1 158.90 230.90
## - Gender 1 159.07 231.07
## - YearsAtCompany 1 159.11 231.11
## - PerformanceRating 1 159.23 231.23
## - RelationshipSatisfaction 1 159.27 231.27
## - MonthlyIncome 1 159.92 231.92
## - JobLevel 1 160.10 232.10
## - Shift 1 160.34 232.34
## <none> 158.51 232.51
## - Department 2 163.04 233.04
## - PercentSalaryHike 1 161.84 233.84
## - TrainingTimesLastYear 1 162.24 234.24
## - YearsWithCurrManager 1 162.66 234.66
## - Age 1 163.16 235.16
## - YearsSinceLastPromotion 1 163.69 235.69
## - EnvironmentSatisfaction 1 164.61 236.61
## - JobRole 3 169.05 237.05
## - DistanceFromHome 1 167.47 239.47
## - NumCompaniesWorked 1 168.60 240.60
## - WorkLifeBalance 1 169.20 241.20
## - BusinessTravel 2 173.11 243.11
## - JobSatisfaction 1 171.65 243.65
## - YearsInCurrentRole 1 172.84 244.83
## - MaritalStatus 2 186.48 256.48
## - JobInvolvement 1 185.37 257.37
## - OverTime 1 257.40 329.40
##
## Step: AIC=225.93
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## Education + EnvironmentSatisfaction + Gender + JobInvolvement +
## JobLevel + JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome +
## NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + Shift + TotalWorkingYears + TrainingTimesLastYear +
## WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
##
## Df Deviance AIC
## - TotalWorkingYears 1 161.95 223.95
## - Education 1 162.22 224.22
## - YearsAtCompany 1 162.41 224.41
## - Gender 1 162.64 224.64
## - PerformanceRating 1 162.77 224.77
## - MonthlyIncome 1 162.80 224.80
## - JobLevel 1 162.93 224.93
## - Shift 1 163.08 225.08
## - RelationshipSatisfaction 1 163.22 225.22
## <none> 161.93 225.93
## - YearsWithCurrManager 1 165.32 227.32
## - Department 2 167.38 227.38
## - PercentSalaryHike 1 165.41 227.41
## - TrainingTimesLastYear 1 165.59 227.59
## - YearsSinceLastPromotion 1 166.56 228.56
## - Age 1 166.64 228.64
## - EnvironmentSatisfaction 1 167.18 229.18
## - JobRole 3 172.42 230.42
## - DistanceFromHome 1 170.23 232.23
## + EducationField 5 158.51 232.51
## - WorkLifeBalance 1 171.28 233.28
## - NumCompaniesWorked 1 173.07 235.07
## - JobSatisfaction 1 173.57 235.57
## - YearsInCurrentRole 1 175.30 237.30
## - BusinessTravel 2 177.50 237.50
## - MaritalStatus 2 187.97 247.97
## - JobInvolvement 1 192.13 254.13
## - OverTime 1 259.81 321.81
##
## Step: AIC=223.95
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## Education + EnvironmentSatisfaction + Gender + JobInvolvement +
## JobLevel + JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome +
## NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + Shift + TrainingTimesLastYear +
## WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
##
## Df Deviance AIC
## - Education 1 162.28 222.28
## - YearsAtCompany 1 162.41 222.41
## - Gender 1 162.66 222.66
## - PerformanceRating 1 162.81 222.81
## - MonthlyIncome 1 162.81 222.81
## - Shift 1 163.12 223.12
## - JobLevel 1 163.13 223.13
## - RelationshipSatisfaction 1 163.23 223.23
## <none> 161.95 223.95
## - YearsWithCurrManager 1 165.37 225.37
## - PercentSalaryHike 1 165.46 225.46
## - TrainingTimesLastYear 1 165.63 225.63
## + TotalWorkingYears 1 161.93 225.93
## - Department 2 168.09 226.09
## - YearsSinceLastPromotion 1 166.61 226.61
## - EnvironmentSatisfaction 1 167.18 227.18
## - Age 1 167.69 227.69
## - JobRole 3 173.29 229.29
## - DistanceFromHome 1 170.31 230.31
## + EducationField 5 158.51 230.51
## - WorkLifeBalance 1 171.35 231.35
## - NumCompaniesWorked 1 173.33 233.33
## - JobSatisfaction 1 173.63 233.63
## - BusinessTravel 2 177.50 235.50
## - YearsInCurrentRole 1 175.88 235.88
## - MaritalStatus 2 188.10 246.10
## - JobInvolvement 1 192.19 252.19
## - OverTime 1 260.10 320.10
##
## Step: AIC=222.28
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + Gender + JobInvolvement + JobLevel +
## JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome +
## NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + Shift + TrainingTimesLastYear +
## WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
##
## Df Deviance AIC
## - YearsAtCompany 1 162.75 220.75
## - Gender 1 162.92 220.92
## - PerformanceRating 1 163.09 221.09
## - MonthlyIncome 1 163.20 221.20
## - Shift 1 163.34 221.35
## - JobLevel 1 163.46 221.46
## - RelationshipSatisfaction 1 163.55 221.55
## <none> 162.28 222.28
## - YearsWithCurrManager 1 165.69 223.69
## - PercentSalaryHike 1 165.76 223.76
## + Education 1 161.95 223.95
## - TrainingTimesLastYear 1 166.08 224.08
## - Department 2 168.19 224.19
## + TotalWorkingYears 1 162.22 224.22
## - YearsSinceLastPromotion 1 167.03 225.02
## - EnvironmentSatisfaction 1 167.35 225.35
## - Age 1 169.09 227.09
## - JobRole 3 173.98 227.98
## - DistanceFromHome 1 170.48 228.48
## + EducationField 5 158.93 228.93
## - WorkLifeBalance 1 171.63 229.63
## - NumCompaniesWorked 1 173.40 231.40
## - JobSatisfaction 1 173.68 231.68
## - BusinessTravel 2 177.51 233.51
## - YearsInCurrentRole 1 176.21 234.21
## - MaritalStatus 2 188.33 244.33
## - JobInvolvement 1 192.91 250.91
## - OverTime 1 260.29 318.29
##
## Step: AIC=220.75
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + Gender + JobInvolvement + JobLevel +
## JobRole + JobSatisfaction + MaritalStatus + MonthlyIncome +
## NumCompaniesWorked + OverTime + PercentSalaryHike + PerformanceRating +
## RelationshipSatisfaction + Shift + TrainingTimesLastYear +
## WorkLifeBalance + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
##
## Df Deviance AIC
## - Gender 1 163.29 219.29
## - PerformanceRating 1 163.46 219.46
## - MonthlyIncome 1 163.55 219.55
## - JobLevel 1 163.72 219.72
## - Shift 1 163.75 219.75
## - RelationshipSatisfaction 1 164.29 220.29
## <none> 162.75 220.75
## - YearsWithCurrManager 1 165.76 221.76
## - PercentSalaryHike 1 166.21 222.21
## + YearsAtCompany 1 162.28 222.28
## - TrainingTimesLastYear 1 166.35 222.35
## + Education 1 162.41 222.41
## - Department 2 168.42 222.42
## + TotalWorkingYears 1 162.75 222.74
## - YearsSinceLastPromotion 1 167.80 223.80
## - EnvironmentSatisfaction 1 168.60 224.60
## - Age 1 169.10 225.10
## - DistanceFromHome 1 170.83 226.83
## - JobRole 3 175.19 227.19
## + EducationField 5 159.58 227.58
## - WorkLifeBalance 1 172.72 228.72
## - NumCompaniesWorked 1 173.40 229.40
## - JobSatisfaction 1 173.68 229.68
## - BusinessTravel 2 177.69 231.69
## - YearsInCurrentRole 1 179.35 235.35
## - MaritalStatus 2 188.62 242.62
## - JobInvolvement 1 193.09 249.09
## - OverTime 1 260.42 316.42
##
## Step: AIC=219.29
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole +
## JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked +
## OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
## Shift + TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole +
## YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## - PerformanceRating 1 164.10 218.10
## - Shift 1 164.20 218.20
## - MonthlyIncome 1 164.32 218.32
## - JobLevel 1 164.45 218.45
## - RelationshipSatisfaction 1 164.70 218.70
## <none> 163.29 219.29
## - YearsWithCurrManager 1 166.20 220.20
## + Gender 1 162.75 220.75
## - Department 2 168.78 220.78
## + YearsAtCompany 1 162.92 220.92
## - PercentSalaryHike 1 166.96 220.96
## + Education 1 163.02 221.02
## - TrainingTimesLastYear 1 167.12 221.12
## + TotalWorkingYears 1 163.28 221.28
## - YearsSinceLastPromotion 1 168.00 222.00
## - Age 1 169.29 223.29
## - EnvironmentSatisfaction 1 169.86 223.86
## - DistanceFromHome 1 171.35 225.35
## + EducationField 5 159.97 225.97
## - JobRole 3 176.63 226.63
## - WorkLifeBalance 1 173.52 227.52
## - NumCompaniesWorked 1 173.57 227.57
## - JobSatisfaction 1 174.18 228.18
## - BusinessTravel 2 177.79 229.79
## - YearsInCurrentRole 1 180.53 234.53
## - MaritalStatus 2 188.95 240.95
## - JobInvolvement 1 193.10 247.10
## - OverTime 1 260.86 314.86
##
## Step: AIC=218.1
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole +
## JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked +
## OverTime + PercentSalaryHike + RelationshipSatisfaction +
## Shift + TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole +
## YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## - Shift 1 164.71 216.71
## - MonthlyIncome 1 164.82 216.82
## - JobLevel 1 165.02 217.02
## - RelationshipSatisfaction 1 165.55 217.55
## <none> 164.10 218.10
## - YearsWithCurrManager 1 166.83 218.83
## - Department 2 168.92 218.92
## + PerformanceRating 1 163.29 219.29
## + Gender 1 163.46 219.46
## + YearsAtCompany 1 163.79 219.79
## - PercentSalaryHike 1 167.84 219.84
## + Education 1 163.86 219.86
## + TotalWorkingYears 1 164.08 220.08
## - YearsSinceLastPromotion 1 168.64 220.64
## - TrainingTimesLastYear 1 168.79 220.79
## - Age 1 170.22 222.22
## - DistanceFromHome 1 171.39 223.39
## - EnvironmentSatisfaction 1 172.07 224.07
## + EducationField 5 160.62 224.62
## - JobRole 3 177.40 225.40
## - WorkLifeBalance 1 173.63 225.63
## - NumCompaniesWorked 1 173.95 225.95
## - JobSatisfaction 1 175.43 227.43
## - BusinessTravel 2 179.24 229.24
## - YearsInCurrentRole 1 180.92 232.92
## - MaritalStatus 2 189.30 239.30
## - JobInvolvement 1 193.28 245.28
## - OverTime 1 261.34 313.34
##
## Step: AIC=216.71
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole +
## JobSatisfaction + MaritalStatus + MonthlyIncome + NumCompaniesWorked +
## OverTime + PercentSalaryHike + RelationshipSatisfaction +
## TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole +
## YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## - MonthlyIncome 1 165.31 215.31
## - JobLevel 1 165.50 215.50
## - RelationshipSatisfaction 1 166.52 216.52
## <none> 164.71 216.71
## - YearsWithCurrManager 1 167.51 217.51
## - Department 2 169.78 217.78
## + Shift 1 164.10 218.10
## + Gender 1 164.19 218.19
## - PercentSalaryHike 1 168.19 218.19
## + PerformanceRating 1 164.20 218.20
## + YearsAtCompany 1 164.44 218.44
## + Education 1 164.53 218.53
## + TotalWorkingYears 1 164.70 218.70
## - YearsSinceLastPromotion 1 169.18 219.18
## - TrainingTimesLastYear 1 169.30 219.30
## - Age 1 170.81 220.81
## - DistanceFromHome 1 171.72 221.72
## - EnvironmentSatisfaction 1 172.64 222.64
## + EducationField 5 161.83 223.83
## - JobRole 3 177.99 223.99
## - WorkLifeBalance 1 174.06 224.06
## - NumCompaniesWorked 1 174.75 224.75
## - JobSatisfaction 1 176.01 226.01
## - BusinessTravel 2 179.54 227.54
## - YearsInCurrentRole 1 182.06 232.06
## - JobInvolvement 1 193.46 243.46
## - MaritalStatus 2 196.51 244.51
## - OverTime 1 261.59 311.59
##
## Step: AIC=215.31
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobLevel + JobRole +
## JobSatisfaction + MaritalStatus + NumCompaniesWorked + OverTime +
## PercentSalaryHike + RelationshipSatisfaction + TrainingTimesLastYear +
## WorkLifeBalance + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
##
## Df Deviance AIC
## - JobLevel 1 165.50 213.50
## - RelationshipSatisfaction 1 167.06 215.06
## <none> 165.31 215.31
## - YearsWithCurrManager 1 167.93 215.93
## - Department 2 170.03 216.03
## + Gender 1 164.62 216.62
## + MonthlyIncome 1 164.71 216.71
## + Shift 1 164.82 216.82
## - PercentSalaryHike 1 168.84 216.84
## + PerformanceRating 1 165.00 217.00
## + Education 1 165.08 217.08
## + YearsAtCompany 1 165.12 217.12
## + TotalWorkingYears 1 165.30 217.30
## - YearsSinceLastPromotion 1 169.84 217.84
## - TrainingTimesLastYear 1 170.11 218.11
## - Age 1 171.34 219.34
## - DistanceFromHome 1 172.63 220.63
## - EnvironmentSatisfaction 1 173.67 221.67
## - JobRole 3 178.08 222.08
## - WorkLifeBalance 1 174.13 222.13
## + EducationField 5 162.81 222.81
## - NumCompaniesWorked 1 176.71 224.71
## - JobSatisfaction 1 176.84 224.84
## - BusinessTravel 2 179.93 225.93
## - YearsInCurrentRole 1 183.41 231.41
## - JobInvolvement 1 193.67 241.67
## - MaritalStatus 2 197.38 243.38
## - OverTime 1 261.71 309.71
##
## Step: AIC=213.5
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobRole + JobSatisfaction +
## MaritalStatus + NumCompaniesWorked + OverTime + PercentSalaryHike +
## RelationshipSatisfaction + TrainingTimesLastYear + WorkLifeBalance +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## - RelationshipSatisfaction 1 167.18 213.18
## <none> 165.50 213.50
## - Department 2 170.15 214.15
## - YearsWithCurrManager 1 168.74 214.74
## + Gender 1 164.86 214.86
## - PercentSalaryHike 1 168.94 214.94
## + Shift 1 165.02 215.02
## + PerformanceRating 1 165.15 215.15
## + Education 1 165.31 215.31
## + JobLevel 1 165.31 215.31
## + TotalWorkingYears 1 165.38 215.38
## + YearsAtCompany 1 165.38 215.38
## + MonthlyIncome 1 165.50 215.50
## - YearsSinceLastPromotion 1 170.06 216.06
## - TrainingTimesLastYear 1 170.24 216.24
## - DistanceFromHome 1 173.04 219.04
## - Age 1 173.47 219.47
## - EnvironmentSatisfaction 1 173.81 219.81
## - WorkLifeBalance 1 174.42 220.42
## + EducationField 5 163.04 221.04
## - NumCompaniesWorked 1 176.77 222.77
## - JobSatisfaction 1 176.90 222.90
## - BusinessTravel 2 180.25 224.25
## - JobRole 3 183.14 225.14
## - YearsInCurrentRole 1 183.58 229.58
## - JobInvolvement 1 194.87 240.87
## - MaritalStatus 2 197.49 241.49
## - OverTime 1 261.80 307.80
##
## Step: AIC=213.18
## Attrition ~ Age + BusinessTravel + Department + DistanceFromHome +
## EnvironmentSatisfaction + JobInvolvement + JobRole + JobSatisfaction +
## MaritalStatus + NumCompaniesWorked + OverTime + PercentSalaryHike +
## TrainingTimesLastYear + WorkLifeBalance + YearsInCurrentRole +
## YearsSinceLastPromotion + YearsWithCurrManager
##
## Df Deviance AIC
## <none> 167.18 213.18
## - Department 2 171.47 213.47
## + RelationshipSatisfaction 1 165.50 213.50
## - YearsWithCurrManager 1 169.88 213.88
## - PercentSalaryHike 1 170.29 214.29
## + Shift 1 166.36 214.36
## + Gender 1 166.69 214.69
## + PerformanceRating 1 166.84 214.84
## + YearsAtCompany 1 166.88 214.88
## + Education 1 167.01 215.01
## + JobLevel 1 167.06 215.06
## + TotalWorkingYears 1 167.12 215.12
## + MonthlyIncome 1 167.18 215.18
## - YearsSinceLastPromotion 1 171.30 215.30
## - TrainingTimesLastYear 1 171.92 215.92
## - DistanceFromHome 1 174.96 218.96
## - EnvironmentSatisfaction 1 175.19 219.19
## - Age 1 175.39 219.39
## + EducationField 5 164.42 220.42
## - WorkLifeBalance 1 176.70 220.70
## - NumCompaniesWorked 1 178.18 222.18
## - JobSatisfaction 1 178.54 222.54
## - BusinessTravel 2 181.98 223.98
## - JobRole 3 185.86 225.86
## - YearsInCurrentRole 1 185.24 229.24
## - MaritalStatus 2 198.21 240.21
## - JobInvolvement 1 197.74 241.74
## - OverTime 1 262.27 306.27
##
## Call: glm(formula = Attrition ~ Age + BusinessTravel + Department +
## DistanceFromHome + EnvironmentSatisfaction + JobInvolvement +
## JobRole + JobSatisfaction + MaritalStatus + NumCompaniesWorked +
## OverTime + PercentSalaryHike + TrainingTimesLastYear + WorkLifeBalance +
## YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
## family = binomial(), data = train)
##
## Coefficients:
## (Intercept) Age
## 7.42311 -0.07024
## BusinessTravelTravel_Frequently BusinessTravelTravel_Rarely
## 2.68001 0.87744
## DepartmentMaternity DepartmentNeurology
## -0.99321 -0.50434
## DistanceFromHome EnvironmentSatisfaction
## 0.07082 -0.52866
## JobInvolvement JobRoleNurse
## -1.53591 3.01169
## JobRoleOther JobRoleTherapist
## 2.74327 -1.01539
## JobSatisfaction MaritalStatusMarried
## -0.62789 1.11123
## MaritalStatusSingle NumCompaniesWorked
## 3.18857 0.28388
## OverTimeYes PercentSalaryHike
## 3.95524 -0.10693
## TrainingTimesLastYear WorkLifeBalance
## -0.39351 -0.81258
## YearsInCurrentRole YearsSinceLastPromotion
## -0.47481 0.20272
## YearsWithCurrManager
## -0.18419
##
## Degrees of Freedom: 317 Total (i.e. Null); 295 Residual
## Null Deviance: 440.2
## Residual Deviance: 167.2 AIC: 213.2
# Reduced logistic regression using the predictors retained by the stepwise
# AIC search on glm.fit1 (formula transcribed from the printed step() result).
glm.fit2 <- glm(
  Attrition ~ Age + BusinessTravel + Department +
    DistanceFromHome + EnvironmentSatisfaction + JobInvolvement +
    JobRole + JobSatisfaction + MaritalStatus + NumCompaniesWorked +
    OverTime + PercentSalaryHike + TrainingTimesLastYear + WorkLifeBalance +
    YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
  family = binomial(), data = train
)
#summary(glm.fit2)
#predicted probabilities saved to test df
test$GLM2Predsx <- predict(glm.fit2, test, type = "response")
#convert probability predictions to 1/0 format to match y (response variable),
#using a 0.5 cut-off
test$GLM2Preds <- ifelse(test$GLM2Predsx >= 0.5, 1, 0)
# Confusion matrix to compare accuracy.
# Fixes: (1) attrition (class 1) is declared the positive class, so that
# sensitivity / precision describe how well leavers are detected; previously
# caret defaulted to the first level, 0. (2) both factors get explicit levels
# so the call still works if the model predicts a single class only.
# The output below reflects the same four cell counts with positive = "1".
caret::confusionMatrix(
  factor(test$GLM2Preds, levels = c(0, 1)),
  factor(test$Attrition, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 4
## 1 10 29
##
## Accuracy : 0.825
## 95% CI : (0.7238, 0.9009)
## No Information Rate : 0.5875
## P-Value [Acc > NIR] : 4.97e-06
##
## Kappa : 0.6485
##
## Mcnemar's Test P-Value : 0.1814
##
## Sensitivity : 0.8788
## Specificity : 0.7872
## Pos Pred Value : 0.7436
## Neg Pred Value : 0.9024
## Prevalence : 0.4125
## Detection Rate : 0.3625
## Detection Prevalence : 0.4875
## Balanced Accuracy : 0.8330
##
## 'Positive' Class : 1
##
library(rpart)
# Fit a classification tree for Attrition using every other column of `train`.
# The seed is set directly before the fit because rpart's built-in
# cross-validation (the xerror column of the CP table) assigns folds at random.
set.seed(200)
dt.model <- rpart(
  formula = Attrition ~ .,  # response ~ all remaining columns of train
  data = train,
  method = "class"          # classification tree
)

# Draw the fitted tree (with an empty subtitle), then show the per-variable
# importance scores stored on the fit.
rattle::fancyRpartPlot(dt.model, sub = "")
dt.model$variable.importance
## OverTime YearsAtCompany TotalWorkingYears
## 36.5186990 29.8530139 27.1688954
## YearsInCurrentRole YearsWithCurrManager Age
## 23.2306710 19.1897333 18.2343038
## MonthlyIncome JobRole DistanceFromHome
## 16.7615484 9.4283819 7.1548112
## WorkLifeBalance JobLevel YearsSinceLastPromotion
## 6.9738409 4.8514619 4.7911344
## JobInvolvement RelationshipSatisfaction NumCompaniesWorked
## 3.9248484 3.7696722 2.6501035
## EducationField JobSatisfaction Department
## 2.1856423 1.5627385 0.7786298
## BusinessTravel
## 0.3114519
# Print the complexity-parameter (CP) table of the fitted tree: one row per
# candidate tree size with its CP, number of splits, relative error,
# cross-validated error (xerror) and the standard error of that estimate (xstd).
printcp(dt.model)
##
## Classification tree:
## rpart(formula = Attrition ~ ., data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Age JobInvolvement JobRole OverTime
## [5] WorkLifeBalance YearsAtCompany YearsInCurrentRole
##
## Root node error: 152/318 = 0.47799
##
## n= 318
##
## CP nsplit rel error xerror xstd
## 1 0.453947 0 1.00000 1.00000 0.058603
## 2 0.098684 1 0.54605 0.54605 0.051525
## 3 0.052632 2 0.44737 0.51316 0.050477
## 4 0.026316 3 0.39474 0.44737 0.048102
## 5 0.019737 5 0.34211 0.46711 0.048856
## 6 0.010965 7 0.30263 0.47368 0.049099
## 7 0.010000 10 0.26974 0.46711 0.048856
# Plot the cross-validated error against the complexity parameter.
plotcp(dt.model)
library(rpart.plot)

# Prune the tree back to the CP value whose cross-validated error (xerror)
# is the smallest entry in the CP table.
dt.model_2 <- prune(
  dt.model,
  cp = with(dt.model, cptable[which.min(cptable[, "xerror"]), "CP"])
)
rpart.plot(dt.model_2, yesno = TRUE)

# Class predictions of the pruned tree on the hold-out set.
dt.pred <- predict(dt.model_2, newdata = test, type = "class")

# Actual vs. predicted classes, first as a plot, then as a confusion matrix.
plot(
  x = as.factor(test$Attrition),
  y = dt.pred,
  main = "Simple Classification: Predicted vs. Actual",
  xlab = "Actual",
  ylab = "Predicted"
)
caret::confusionMatrix(data = dt.pred, reference = as.factor(test$Attrition))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 10 27
##
## Accuracy : 0.8
## 95% CI : (0.6956, 0.8811)
## No Information Rate : 0.5875
## P-Value [Acc > NIR] : 4.709e-05
##
## Kappa : 0.5947
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.7872
## Specificity : 0.8182
## Pos Pred Value : 0.8605
## Neg Pred Value : 0.7297
## Prevalence : 0.5875
## Detection Rate : 0.4625
## Detection Prevalence : 0.5375
## Balanced Accuracy : 0.8027
##
## 'Positive' Class : 0
##
library(randomForestSRC)
##
## randomForestSRC 3.1.1
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
##
## Attaching package: 'randomForestSRC'
## The following object is masked from 'package:purrr':
##
## partial
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Random forest classifier for Attrition on all other columns of `train`.
# Seeded so the fit is reproducible between runs.
set.seed(123)
forest_attrition <- randomForest(
  formula = as.factor(Attrition) ~ .,  # factor response => classification
  data = train,
  mtry = 4,            # variables tried at each split
  importance = TRUE,   # also compute permutation-based importance
  nodesize = 6,
  ntree = 4000
)

# Show the fit summary (OOB error estimate and OOB confusion matrix).
print(forest_attrition)
##
## Call:
## randomForest(formula = as.factor(Attrition) ~ ., data = train, mtry = 4, importance = TRUE, nodesize = 6, ntree = 4000)
## Type of random forest: classification
## Number of trees: 4000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 18.87%
## Confusion matrix:
## 0 1 class.error
## 0 117 35 0.2302632
## 1 25 141 0.1506024
# Importance matrix stored on the fit: one row per predictor, with a column per
# class ("0", "1"), the overall MeanDecreaseAccuracy and MeanDecreaseGini.
# NOTE(review): this is the stored component, not the importance() extractor,
# which per the randomForest docs rescales the accuracy measures by default.
forest_attrition$importance
## 0 1 MeanDecreaseAccuracy
## Age 1.430434e-02 1.842709e-02 1.629812e-02
## BusinessTravel 5.849689e-04 4.773240e-04 5.289570e-04
## Department -3.635458e-04 5.939141e-03 2.911026e-03
## DistanceFromHome 3.033042e-05 2.198226e-03 1.164164e-03
## Education 7.355907e-06 1.774369e-03 9.388388e-04
## EducationField 3.861125e-05 8.368406e-05 6.142557e-05
## EnvironmentSatisfaction -5.824322e-04 1.138020e-03 3.011791e-04
## Gender 1.869168e-04 4.015145e-04 3.054460e-04
## JobInvolvement 6.499667e-03 6.109738e-03 6.286215e-03
## JobLevel 8.155532e-03 1.460965e-02 1.149369e-02
## JobRole 5.237617e-03 1.395944e-02 9.742046e-03
## JobSatisfaction -4.462138e-04 1.530078e-03 5.907809e-04
## MaritalStatus 7.805321e-03 6.471305e-03 7.085827e-03
## MonthlyIncome 1.060592e-02 1.886684e-02 1.483210e-02
## NumCompaniesWorked 6.593085e-04 3.469472e-03 2.096233e-03
## OverTime 5.328765e-02 5.843386e-02 5.574207e-02
## PercentSalaryHike -2.069767e-03 6.789197e-05 -9.561063e-04
## PerformanceRating -1.253912e-04 8.126190e-05 -2.578386e-05
## RelationshipSatisfaction 3.691584e-04 1.234608e-03 7.829124e-04
## Shift 1.158897e-02 9.679083e-03 1.059665e-02
## TotalWorkingYears 1.709703e-02 1.647894e-02 1.668216e-02
## TrainingTimesLastYear 1.515487e-03 3.163348e-04 8.751312e-04
## WorkLifeBalance 1.496394e-03 2.732453e-03 2.105492e-03
## YearsAtCompany 1.963235e-02 2.116314e-02 2.038626e-02
## YearsInCurrentRole 1.361941e-02 1.603535e-02 1.488709e-02
## YearsSinceLastPromotion -8.045431e-04 2.769669e-03 1.081270e-03
## YearsWithCurrManager 1.526900e-02 1.848750e-02 1.679891e-02
## MeanDecreaseGini
## Age 10.3722897
## BusinessTravel 1.3561160
## Department 2.2617323
## DistanceFromHome 4.6806382
## Education 2.3374612
## EducationField 1.8734543
## EnvironmentSatisfaction 2.1928515
## Gender 0.8757624
## JobInvolvement 4.4733704
## JobLevel 4.3247900
## JobRole 4.3743525
## JobSatisfaction 2.2650832
## MaritalStatus 3.9258642
## MonthlyIncome 9.3171168
## NumCompaniesWorked 3.7408885
## OverTime 17.9290236
## PercentSalaryHike 2.8176257
## PerformanceRating 0.4275825
## RelationshipSatisfaction 1.9329476
## Shift 4.7592462
## TotalWorkingYears 9.1174986
## TrainingTimesLastYear 2.5198297
## WorkLifeBalance 2.7272578
## YearsAtCompany 9.7457075
## YearsInCurrentRole 6.8144949
## YearsSinceLastPromotion 2.7564217
## YearsWithCurrManager 7.4030528
# Score the hold-out set with the random forest and compare the predicted
# classes against the observed Attrition values.
pred1_att <- predict(object = forest_attrition, newdata = test)
caret::confusionMatrix(
  data = pred1_att,
  reference = as.factor(test$Attrition)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 40 3
## 1 7 30
##
## Accuracy : 0.875
## 95% CI : (0.7821, 0.9384)
## No Information Rate : 0.5875
## P-Value [Acc > NIR] : 1.976e-08
##
## Kappa : 0.7467
##
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.8511
## Specificity : 0.9091
## Pos Pred Value : 0.9302
## Neg Pred Value : 0.8108
## Prevalence : 0.5875
## Detection Rate : 0.5000
## Detection Prevalence : 0.5375
## Balanced Accuracy : 0.8801
##
## 'Positive' Class : 0
##