shiny: https://peipeicai.shinyapps.io/PEPE/
library(knitr)
knitr::opts_chunk$set(fig.width = 12,fig.height = 8)
library(dplyr)
library(caret)
library(arules)
library(arulesViz)
library(ggplot2)
setwd("C:/Users/13158/Desktop/2019 Fall/707/PEPE")
EMP <- read.csv('employee_attrition.csv',header = T,sep = ',')
sum(!complete.cases(EMP)) # 9 missing values
## [1] 9
EMP<- na.omit(EMP)
summary(EMP) # double check the missing values and other data quality issues
‘NA’ values;
Missing values in ‘Gender’ and ‘Overtime’ that are not encoded as ‘NA’;
Redudent Columns, such as ‘StandardHours’,‘Over18’,‘PerformanceRating’,‘Employee Count’,‘EmployeeNumber’. The former three only include the same value(variance=1),the last one is equivalent to index.
abnormal data in working years (one is 114 years)
# 1. delete 'na' value
EMP<- na.omit(EMP)
# 2. delete missing value in gender and overtime
EMP<- EMP[-which(EMP$Gender==levels(EMP$Gender)[1]),]
EMP<- EMP[-which(EMP$OverTime==levels(EMP$OverTime)[1]),]
nrow(EMP) # 1165 observations left
## [1] 1165
# 3.1 identify features with no or low variance
nzv <- nearZeroVar(EMP,freqCut = 95/5,saveMetrics = T)
colnames(EMP)[nzv$nzv]
## [1] "EmployeeCount" "Over18" "StandardHours"
nzv[nzv$nzv,]
## freqRatio percentUnique zeroVar nzv
## EmployeeCount 0 0.08583691 TRUE TRUE
## Over18 0 0.08583691 TRUE TRUE
## StandardHours 0 0.08583691 TRUE TRUE
# 3.2 delete redudent columns
EMP <- EMP[,-c(8,9,22,25,27)]
sum(!complete.cases(EMP))# check, zero now.
## [1] 0
## 4. remove outliers
summary(EMP$TotalWorkingYears,na.rm = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.00 11.35 15.00 114.00
which(EMP$TotalWorkingYears==114.00) # There is some abnormal data in working years (114 in the 114th record)
## [1] 137
EMP <- EMP[-137,]
15% of the Employees has Attritions.
prop.table(table(EMP$Attrition))
##
## No Yes
## 0.8427835 0.1572165
EMP %>% group_by(Attrition) %>%
summarise(Age=round(mean(Age,na.rm = T)),
DailyRate=round(mean(DailyRate,na.rm = T),2),
MInc=round(mean(MonthlyIncome),2),
EnvSa=mean(EnvironmentSatisfaction,na.rm = T),
YWork=mean(TotalWorkingYears,na.rm = T),
YNoProm = round(mean(YearsSinceLastPromotion),2),
Edu=round(mean(Education),2),
Dis = round(mean(DistanceFromHome),2),
Comp = round(mean(NumCompaniesWorked),2)
)
## # A tibble: 2 x 10
## Attrition Age DailyRate MInc EnvSa YWork YNoProm Edu Dis Comp
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 No 38 813. 6826. 2.76 11.8 2.19 2.89 9.18 2.63
## 2 Yes 34 738. 4782. 2.43 8.28 1.72 2.9 11.3 2.99
The ‘Yes’ Attrition Group has following features:
# change data type,
EMP$Education<- as.factor(EMP$Education)
EMP$EnvironmentSatisfaction <- as.factor(EMP$EnvironmentSatisfaction)
EMP$JobInvolvement <- as.factor(EMP$JobInvolvement)
EMP$JobLevel <- as.factor(EMP$JobLevel)
EMP$JobSatisfaction <- as.factor(EMP$JobSatisfaction)
EMP$RelationshipSatisfaction <- as.factor(EMP$RelationshipSatisfaction)
EMP$StockOptionLevel <- as.factor(EMP$StockOptionLevel)
EMP$WorkLifeBalance <- as.factor(EMP$WorkLifeBalance)
Examine Outliers
boxplot(EMP$TotalWorkingYears)
# remove outliers
EMP$TotalWorkingYears[EMP$TotalWorkingYears %in% boxplot.stats(EMP$TotalWorkingYears)$out] <- median(EMP$TotalWorkingYears,na.rm = T)
boxplot(EMP$TotalWorkingYears)
EMP$Age_grp <- discretize(EMP$Age,method = 'frequency',breaks =3,labels = c('low','medium','high'))
#boxplot(Age ~ Age_grp,data=EMP)
#prop.table(table(EMP$Age_grp,EMP$Attrition))
EMP$DailyRate_grp <- discretize(EMP$DailyRate,method = 'frequency',breaks =3,labels = c('low','medium','high'))
EMP$DistanceFromHome_grp <- discretize(EMP$DistanceFromHome,method = 'frequency',breaks = 3,labels = c('low','medium','high'))
EMP$HourlyR_grp <- discretize(EMP$HourlyRate,method = 'frequency',breaks = 3,labels = c('low','medium','high'))
EMP$Mincome_grp <- discretize(EMP$MonthlyIncome,method = 'frequency',breaks = 3,labels = c('low','medium','high'))
EMP$NumCompaniesWorked_grp <- discretize(EMP$NumCompaniesWorked,method = 'frequency',breaks = 3,labels = c('low','medium','high'))
EMP$Wyear_grp <- discretize(EMP$TotalWorkingYears,method = 'frequency',breaks =3,labels = c('low','medium','high'))
EMP$traTimLas_grp <- discretize(EMP$TrainingTimesLastYear,method = 'frequency',breaks =3,labels = c('low','medium','high'))
EMP$YCompany <- discretize(EMP$YearsAtCompany,method = 'frequency',breaks =3,labels = c('low','medium','high'))
EMP$YCurrentrole <- discretize(EMP$YearsInCurrentRole,method = 'frequency',breaks =2,labels = c('low','high'))
EMP$YWithManager <- discretize(EMP$YearsWithCurrManager,method = 'frequency',breaks =3,labels = c('low','medium','high'))
Bin Column ‘Years Since Last Promotion’ according to its percentile
#EMP$YearNoPro <- discretize(EMP$YearsSinceLastPromotion,method = 'frequency',breaks =5,labels = c('low','medium','high'))
#Unable to use discretize to bin this column
summary(EMP$YearsSinceLastPromotion)
EMP$YearNoPro <- 'Medium'
EMP$YearNoPro[which(EMP$YearsSinceLastPromotion<=1)] <- 'Short'
EMP$YearNoPro[which(EMP$YearsSinceLastPromotion>=2)] <- 'long'
rules_record <- apriori(EMP[,sapply(EMP,is.factor)],parameter = NULL, appearance = NULL,control = NULL)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 116
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[90 item(s), 1164 transaction(s)] done [0.02s].
## sorting and recoding items ... [75 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.03s].
## writing ... [5270 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(rules_record,5))
## lhs rhs support confidence lift count
## [1] {} => {Attrition=No} 0.8427835 0.8427835 1.000000 981
## [2] {JobRole=Manufacturing Director} => {Department=Research & Development} 0.1048110 1.0000000 1.539683 122
## [3] {StockOptionLevel=2} => {Attrition=No} 0.1005155 0.9212598 1.093116 117
## [4] {NumCompaniesWorked_grp=low} => {Attrition=No} 0.1176976 0.9013158 1.069451 137
## [5] {JobLevel=3} => {Mincome_grp=high} 0.1451890 0.9883041 2.964912 169
EMP_FACTOR <- as(EMP[,sapply(EMP,is.factor)],'transactions') # Change dataset into transactions data
rules_transaction <- apriori(EMP_FACTOR,parameter = list(support=0.1,confidence=0.8,minlen=3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 116
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[90 item(s), 1164 transaction(s)] done [0.00s].
## sorting and recoding items ... [75 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.01s].
## writing ... [5196 rule(s)] done [0.00s].
## creating S4 object ... done [0.02s].
inspect(head(rules_transaction))
## lhs rhs support confidence lift count
## [1] {JobLevel=3,
## Mincome_grp=high} => {Wyear_grp=high} 0.1262887 0.8698225 1.710259 147
## [2] {JobLevel=3,
## Wyear_grp=high} => {Mincome_grp=high} 0.1262887 0.9865772 2.959732 147
## [3] {JobLevel=3,
## YCurrentrole=high} => {Mincome_grp=high} 0.1125430 0.9924242 2.977273 131
## [4] {BusinessTravel=Travel_Rarely,
## JobLevel=3} => {Mincome_grp=high} 0.1091065 0.9921875 2.976562 127
## [5] {JobLevel=3,
## Mincome_grp=high} => {Attrition=No} 0.1262887 0.8698225 1.032083 147
## [6] {Attrition=No,
## JobLevel=3} => {Mincome_grp=high} 0.1262887 0.9932432 2.979730 147
frequent_items <- eclat(EMP_FACTOR,parameter = list(support=0.5,minlen=2))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.5 2 10 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 582
##
## create itemset ...
## set transactions ...[90 item(s), 1164 transaction(s)] done [0.00s].
## sorting and recoding items ... [10 item(s)] done [0.00s].
## creating bit matrix ... [10 row(s), 1164 column(s)] done [0.00s].
## writing ... [6 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
inspect(head(frequent_items,2))
## items support count
## [1] {Attrition=No,JobInvolvement=3} 0.5068729 590
## [2] {Attrition=No,WorkLifeBalance=3} 0.5378007 626
itemFrequencyPlot(EMP_FACTOR,topN=15,type='absolute',main='Item frequency')
rules <- apriori(EMP_FACTOR,parameter = list(support=0.1,confidence=0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 116
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[90 item(s), 1164 transaction(s)] done [0.00s].
## sorting and recoding items ... [75 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.03s].
## writing ... [19968 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
quality(head(rules,3))
## support confidence lift count
## 1 0.5085911 0.5085911 1 592
## 2 0.5378007 0.5378007 1 626
## 3 0.5446735 0.5446735 1 634
inspect(head(sort(rules,by='lift',decreasing = T),8))
## lhs rhs support confidence lift count
## [1] {Department=Sales,
## JobLevel=2,
## Mincome_grp=medium} => {JobRole=Sales Executive} 0.1202749 0.9859155 4.413868 140
## [2] {Department=Sales,
## JobLevel=2,
## WorkLifeBalance=3} => {JobRole=Sales Executive} 0.1030928 0.9836066 4.403531 120
## [3] {Attrition=No,
## Department=Sales,
## JobLevel=2,
## Mincome_grp=medium} => {JobRole=Sales Executive} 0.1005155 0.9831933 4.401681 117
## [4] {Department=Sales,
## JobLevel=2,
## OverTime=No} => {JobRole=Sales Executive} 0.1151203 0.9781022 4.378888 134
## [5] {Attrition=No,
## Department=Sales,
## JobLevel=2,
## OverTime=No} => {JobRole=Sales Executive} 0.1039519 0.9758065 4.368610 121
## [6] {Department=Sales,
## JobInvolvement=3,
## JobLevel=2} => {JobRole=Sales Executive} 0.1013746 0.9752066 4.365925 118
## [7] {Department=Sales,
## JobLevel=2} => {JobRole=Sales Executive} 0.1615120 0.9740933 4.360941 188
## [8] {BusinessTravel=Travel_Rarely,
## Department=Sales,
## JobLevel=2} => {JobRole=Sales Executive} 0.1125430 0.9703704 4.344274 131
subset_rules <- which(colSums(is.subset(rules,rules))>1)
rules <- sort(rules[-subset_rules],by='lift',decreasing = T)
inspect(head(rules,10))
## lhs rhs support
## [1] {JobLevel=3} => {Mincome_grp=high} 0.1451890
## [2] {JobRole=Research Scientist} => {JobLevel=1} 0.1546392
## [3] {JobRole=Research Scientist} => {Mincome_grp=low} 0.1400344
## [4] {JobRole=Laboratory Technician} => {JobLevel=1} 0.1348797
## [5] {JobRole=Laboratory Technician} => {Mincome_grp=low} 0.1176976
## [6] {JobRole=Sales Executive} => {JobLevel=2} 0.1615120
## [7] {YWithManager=low} => {YCurrentrole=low} 0.1786942
## [8] {Wyear_grp=low} => {YCurrentrole=low} 0.2233677
## [9] {JobRole=Sales Executive} => {Mincome_grp=medium} 0.1202749
## [10] {Age_grp=low} => {JobLevel=1} 0.1804124
## confidence lift count
## [1] 0.9883041 2.964912 169
## [2] 0.8294931 2.255911 180
## [3] 0.7511521 2.253456 163
## [4] 0.7511962 2.042973 157
## [5] 0.6555024 1.966507 137
## [6] 0.7230769 1.966499 188
## [7] 0.7732342 1.672945 208
## [8] 0.7536232 1.630516 260
## [9] 0.5384615 1.615385 140
## [10] 0.5915493 1.608793 210
Observations:
rules <- apriori(data=EMP_FACTOR,parameter = list(supp=0.02,conf=0.7),
appearance = list(default='lhs',rhs='Attrition=Yes'),
control = list(verbose=F))
subset_rules <- which(colSums(is.subset(rules,rules))>1)
rules <- sort(rules[-subset_rules],by='lift',decreasing = T)
inspect(head(rules,10))
## lhs rhs support confidence lift count
## [1] {OverTime=Yes,
## StockOptionLevel=0,
## Age_grp=low,
## Mincome_grp=low} => {Attrition=Yes} 0.02233677 0.7878788 5.011426 26
## [2] {JobLevel=1,
## OverTime=Yes,
## StockOptionLevel=0,
## Age_grp=low} => {Attrition=Yes} 0.02233677 0.7647059 4.864031 26
## [3] {OverTime=Yes,
## Age_grp=low,
## Mincome_grp=low,
## YCompany=low} => {Attrition=Yes} 0.02491409 0.7631579 4.854185 29
## [4] {BusinessTravel=Travel_Frequently,
## JobLevel=1,
## Age_grp=low,
## YCurrentrole=low} => {Attrition=Yes} 0.02147766 0.7575758 4.818679 25
## [5] {OverTime=Yes,
## StockOptionLevel=0,
## YCompany=low,
## YWithManager=low} => {Attrition=Yes} 0.02061856 0.7500000 4.770492 24
## [6] {MaritalStatus=Single,
## Age_grp=low,
## YCompany=low,
## YWithManager=low} => {Attrition=Yes} 0.02319588 0.7500000 4.770492 27
## [7] {OverTime=Yes,
## Age_grp=low,
## Wyear_grp=low,
## YCompany=low} => {Attrition=Yes} 0.02749141 0.7441860 4.733511 32
## [8] {OverTime=Yes,
## StockOptionLevel=0,
## Mincome_grp=low,
## YCompany=low} => {Attrition=Yes} 0.02405498 0.7368421 4.686799 28
## [9] {JobLevel=1,
## OverTime=Yes,
## StockOptionLevel=0,
## YCompany=low} => {Attrition=Yes} 0.02405498 0.7368421 4.686799 28
## [10] {BusinessTravel=Travel_Frequently,
## JobLevel=1,
## Age_grp=low,
## Wyear_grp=low} => {Attrition=Yes} 0.02147766 0.7352941 4.676953 25
Observations:
For ‘Attrition=Yes’ Group, in general, they are featured in younger age, less working years, more overtime working, frequent travel, lower income, and lower job level. We might summarize that those who entered the company more recently have higher possibility to have attrition.
plot(rules,measure = c('support','lift'),shading='confidence',jitter=0)