Loading the required libraries for the analysis

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine()  masks randomForest::combine()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
## x ggplot2::margin() masks randomForest::margin()
library(dbplyr)
## Warning: package 'dbplyr' was built under R version 4.0.3
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(readr)
library(devtools)
## Warning: package 'devtools' was built under R version 4.0.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.3
#install_github("ramnathv/htmlwidgets") 
#install_github("smartinsightsfromdata/rpivotTable")
library(rpivotTable)
library(easyalluvial)
library(parcats)
## Warning: package 'parcats' was built under R version 4.0.3
#install.packages("ROSE")
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.3
## Loaded ROSE 0.0-3
#devtools::install_github("MI2DataLab/randomForestExplainer")
library(randomForestExplainer)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Reading the HR data and looking at the summary statistics

# Absolute path to the attrition CSV on the local machine.
hr_csv_path <- "C:/Users/Nikhil/Desktop/WA_Fn-UseC_-HR-Employee-Attrition.csv"
# Parse the CSV into a tibble; readr guesses the column types.
WA_Fn_UseC_HR_Employee_Attrition <- read_csv(hr_csv_path)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Attrition = col_character(),
##   BusinessTravel = col_character(),
##   Department = col_character(),
##   EducationField = col_character(),
##   Gender = col_character(),
##   JobRole = col_character(),
##   MaritalStatus = col_character(),
##   Over18 = col_character(),
##   OverTime = col_character()
## )
## i Use `spec()` for the full column specifications.
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the script runs non-interactively.
if (interactive()) {
  View(WA_Fn_UseC_HR_Employee_Attrition)
}
# Work on a shorter alias for the rest of the analysis.
dataHR <- WA_Fn_UseC_HR_Employee_Attrition
# Print each column's type and its first few values.
str(dataHR)
## tibble [1,470 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr [1:1470] "Yes" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
##  $ Department              : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr [1:1470] "Female" "Male" "Male" "Female" ...
##  $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
##  $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr [1:1470] "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr [1:1470] "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   Department = col_character(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   EducationField = col_character(),
##   ..   EmployeeCount = col_double(),
##   ..   EmployeeNumber = col_double(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   JobLevel = col_double(),
##   ..   JobRole = col_character(),
##   ..   JobSatisfaction = col_double(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   Over18 = col_character(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   StandardHours = col_double(),
##   ..   StockOptionLevel = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double()
##   .. )
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(dataHR)
##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:1470        Length:1470        Min.   : 102.0  
##  1st Qu.:30.00   Class :character   Class :character   1st Qu.: 465.0  
##  Median :36.00   Mode  :character   Mode  :character   Median : 802.0  
##  Mean   :36.92                                         Mean   : 802.5  
##  3rd Qu.:43.00                                         3rd Qu.:1157.0  
##  Max.   :60.00                                         Max.   :1499.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:1470        Min.   : 1.000   Min.   :1.000   Length:1470       
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 9.193   Mean   :2.913                     
##                     3rd Qu.:14.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber   EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   1.0   Min.   :1.000           Length:1470       
##  1st Qu.:1     1st Qu.: 491.2   1st Qu.:2.000           Class :character  
##  Median :1     Median :1020.5   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1024.9   Mean   :2.722                             
##  3rd Qu.:1     3rd Qu.:1555.8   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2068.0   Max.   :4.000                             
##    HourlyRate     JobInvolvement    JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Length:1470       
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.00   Median :2.000   Mode  :character  
##  Mean   : 65.89   Mean   :2.73   Mean   :2.064                     
##  3rd Qu.: 83.75   3rd Qu.:3.00   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.00   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:1470        Min.   : 1009   Min.   : 2094  
##  1st Qu.:2.000   Class :character   1st Qu.: 2911   1st Qu.: 8047  
##  Median :3.000   Mode  :character   Median : 4919   Median :14236  
##  Mean   :2.729                      Mean   : 6503   Mean   :14313  
##  3rd Qu.:4.000                      3rd Qu.: 8379   3rd Qu.:20462  
##  Max.   :4.000                      Max.   :19999   Max.   :26999  
##  NumCompaniesWorked    Over18            OverTime         PercentSalaryHike
##  Min.   :0.000      Length:1470        Length:1470        Min.   :11.00    
##  1st Qu.:1.000      Class :character   Class :character   1st Qu.:12.00    
##  Median :2.000      Mode  :character   Mode  :character   Median :14.00    
##  Mean   :2.693                                            Mean   :15.21    
##  3rd Qu.:4.000                                            3rd Qu.:18.00    
##  Max.   :9.000                                            Max.   :25.00    
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.154     Mean   :2.712            Mean   :80    Mean   :0.7939  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.28     Mean   :2.799         Mean   :2.761   Mean   : 7.008  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 3.000     Median : 1.000          Median : 3.000      
##  Mean   : 4.229     Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :18.000     Max.   :15.000          Max.   :17.000
# Number of rows and columns.
dim(dataHR)
## [1] 1470   35
# Preview the first six rows.
head(dataHR)
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
32 No Travel_Frequently 1005 Research & Development 2 2 Life Sciences 1 8 4 Male 79 3 1 Laboratory Technician 4 Single 3068 11864 0 Y No 13 3 3 80 0 8 2 2 7 7 3 6

Converting the columns to appropriate data types

# Rename the first column if its header was read with a mangled prefix
# (presumably a byte-order-mark artefact; a no-op when the name is already "Age").
names(dataHR)[names(dataHR) == "ï..Age"] <- "Age"

# These columns hold small integer codes for categories, so store them as
# factors rather than as numbers.
coded_columns <- c(
  "Education", "EnvironmentSatisfaction", "JobInvolvement", "JobLevel",
  "JobSatisfaction", "StockOptionLevel", "PerformanceRating",
  "RelationshipSatisfaction", "WorkLifeBalance"
)
for (coded_column in coded_columns) {
  dataHR[[coded_column]] <- as.factor(dataHR[[coded_column]])
}

Alluvial Plots

# Attach parcats with library() so that a missing package raises an error
# instead of the silent FALSE that require() returns.
suppressPackageStartupMessages(library(parcats))

# Wide-format alluvial plot of dataHR, limited to five variables.
p <- alluvial_wide(dataHR, max_variables = 5)

# Interactive parallel-categories widget with marginal histograms.
parcats(p, marginal_histograms = TRUE, data_input = dataHR)

Class-imbalance treatment for Attrition and splitting the dataset into train and test sets

library(ROSE)

# Reproducible split: 80% of the rows go to training, the rest to test.
set.seed(18)
n_obs <- nrow(dataHR)
index <- sample(n_obs, n_obs * 0.8)
data_train <- dataHR[index, ]
data_test <- dataHR[-index, ]

# Oversample the training rows only, up to N = 996 * 2 rows in total.
balanced_fit <- ovun.sample(
  Attrition ~ .,
  data = data_train,
  method = "over",
  N = 996 * 2,
  seed = 1
)
train_balanced <- balanced_fit$data

# Class counts after oversampling.
table(train_balanced$Attrition)
## 
##   No  Yes 
##  977 1015

Building the random Forest Model

set.seed(2017)
# Fit a classification forest for Attrition on all other columns;
# localImp = TRUE requests the casewise importance measures.
# NOTE(review): the model is fit on data_train, not on the oversampled
# train_balanced built in the previous step — confirm which training set
# is intended.
forest <- randomForest(as.factor(Attrition) ~ ., data = data_train, localImp = TRUE)
# Print the fitted model (OOB error estimate and confusion matrix).
forest
## 
## Call:
##  randomForest(formula = as.factor(Attrition) ~ ., data = data_train,      localImp = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 15.14%
## Confusion matrix:
##      No Yes class.error
## No  962  15  0.01535312
## Yes 163  36  0.81909548

## Identifying the distribution of minimal depth

# Minimal depth of each variable in each tree of the forest.
min_depth_frame <- min_depth_distribution(forest)
# Show the first ten rows (columns: tree, variable, minimal_depth).
head(min_depth_frame, n = 10)
tree variable minimal_depth
1 Age 4
1 BusinessTravel 10
1 DailyRate 2
1 Department 3
1 DistanceFromHome 6
1 Education 4
1 EducationField 4
1 EmployeeNumber 3
1 EnvironmentSatisfaction 4
1 Gender 7

plotting the minimum depth distribution of random forest

# Minimal depth distribution with the function's default settings.
plot_min_depth_distribution(min_depth_frame)

# Same plot with mean_sample = "relevant_trees", repeated for a decreasing
# value of k. print() is needed because plots are not auto-printed in a loop.
for (n_shown in c(15, 10, 7, 5)) {
  print(
    plot_min_depth_distribution(
      min_depth_frame,
      mean_sample = "relevant_trees",
      k = n_shown
    )
  )
}

Variable importance from Random Forest

# One row of importance measures per predictor.
importance_frame <- measure_importance(forest)
# Print the full table.
importance_frame
variable mean_min_depth no_of_nodes accuracy_decrease gini_decrease no_of_trees times_a_root p_value
Age 3.096000 3654 0.0048660 18.015176 500 32 0.0000000
BusinessTravel 6.276096 1100 0.0007821 4.128006 449 1 1.0000000
DailyRate 3.822000 3774 -0.0004191 16.459973 500 6 0.0000000
Department 6.103872 1025 0.0008948 4.278470 443 2 1.0000000
DistanceFromHome 3.960000 3277 0.0005757 14.064970 500 3 0.0000000
Education 5.107184 1998 0.0002596 7.541793 496 2 0.9030326
EducationField 5.119776 1853 0.0000779 7.213499 494 4 0.9999980
EmployeeCount NA 0 0.0000000 0.000000 0 0 1.0000000
EmployeeNumber 3.934000 3603 -0.0007056 15.008320 500 3 0.0000000
EnvironmentSatisfaction 3.697888 2403 0.0027532 11.880843 497 9 0.0000000
Gender 7.270864 880 0.0000350 2.780954 416 0 1.0000000
HourlyRate 4.197296 3428 -0.0004743 13.784202 499 1 0.0000000
JobInvolvement 4.331664 1706 0.0012502 8.168434 491 9 1.0000000
JobLevel 3.967552 1420 0.0048050 9.320446 488 49 1.0000000
JobRole 4.148592 2248 0.0019138 9.381890 498 19 0.0000111
JobSatisfaction 3.912592 2333 0.0025473 11.001585 498 6 0.0000000
MaritalStatus 4.849584 1335 0.0027792 6.826378 471 17 1.0000000
MonthlyIncome 2.844000 4067 0.0079190 22.961365 500 47 0.0000000
MonthlyRate 4.237296 3450 -0.0003348 13.697943 499 5 0.0000000
NumCompaniesWorked 4.579888 2397 0.0010880 9.984568 497 3 0.0000000
Over18 NA 0 0.0000000 0.000000 0 0 1.0000000
OverTime 2.748592 1613 0.0152744 19.109253 498 63 1.0000000
PercentSalaryHike 4.390000 2802 0.0003409 10.902798 500 0 0.0000000
PerformanceRating 8.820736 313 0.0001822 1.100411 234 0 1.0000000
RelationshipSatisfaction 4.655888 2160 -0.0002907 8.282852 497 0 0.0103003
StandardHours NA 0 0.0000000 0.000000 0 0 1.0000000
StockOptionLevel 3.473776 1654 0.0054037 10.334163 494 35 1.0000000
TotalWorkingYears 3.154592 2791 0.0061197 16.602159 498 54 0.0000000
TrainingTimesLastYear 4.709184 2172 -0.0002434 8.109115 496 2 0.0049488
WorkLifeBalance 4.410480 1836 0.0016001 8.471253 495 9 0.9999997
YearsAtCompany 3.516000 2601 0.0046960 13.845431 500 56 0.0000000
YearsInCurrentRole 4.469776 2045 0.0033190 9.107631 494 26 0.5952213
YearsSinceLastPromotion 4.987184 1998 0.0007326 8.107687 496 1 0.9030326
YearsWithCurrManager 4.375664 1948 0.0028666 9.137336 491 36 0.9925221

Variable ranking plot and importance plot

# Multi-way importance plot, using no_of_nodes as the size measure.
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")

# Pairwise comparison of the importance measures.
plot_importance_ggpairs(importance_frame)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_point).

# Pairwise comparison of the variable rankings given by the importance measures.
plot_importance_rankings(importance_frame)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Pick the top five variables by mean_min_depth and no_of_trees;
# the outer parentheses print the result as well as assigning it.
(vars <- important_variables(importance_frame, k = 5, measures = c("mean_min_depth", "no_of_trees")))
## [1] "MonthlyIncome"     "Age"               "OverTime"         
## [4] "TotalWorkingYears" "YearsAtCompany"

Interaction Analysis for Random Forest

Interaction plot

Interaction plots for features in the attrition analysis

# Forest predictions over data_test for selected pairs of variables.
# Each pair is plotted once; repeating a call only duplicates the figure.
plot_predict_interaction(forest, data_test, "DailyRate", "MonthlyIncome")

plot_predict_interaction(forest, data_test, "Age", "MonthlyIncome")

plot_predict_interaction(forest, data_test, "Age", "DistanceFromHome")

plot_predict_interaction(forest, data_test, "Age", "NumCompaniesWorked")

plot_predict_interaction(forest, data_test, "HourlyRate", "YearsInCurrentRole")

Interactive Pivot Table visualization

# Interactive pivot table: Education in rows, Attrition in columns, with the
# "Count Unique Values" aggregator on JobLevel and the Treemap renderer.
# The aggregator name is written without a trailing space so that it matches
# the pivottable.js aggregator, and the column argument is spelled out as
# `cols` instead of relying on partial argument matching.
rpivotTable(
  dataHR,
  rows = "Education",
  cols = "Attrition",
  aggregatorName = "Count Unique Values",
  vals = "JobLevel",
  rendererName = "Treemap"
)