library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df_hr=read.csv("CaseStudy2_data.csv")
dim(df_hr)
## [1] 870 36
head(df_hr)
## ID Age Attrition BusinessTravel DailyRate Department
## 1 1 32 No Travel_Rarely 117 Sales
## 2 2 40 No Travel_Rarely 1308 Research & Development
## 3 3 35 No Travel_Frequently 200 Research & Development
## 4 4 32 No Travel_Rarely 801 Sales
## 5 5 24 No Travel_Frequently 567 Research & Development
## 6 6 27 No Travel_Frequently 294 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 13 4 Life Sciences 1 859
## 2 14 3 Medical 1 1128
## 3 18 2 Life Sciences 1 1412
## 4 1 4 Marketing 1 2016
## 5 2 1 Technical Degree 1 1646
## 6 10 2 Life Sciences 1 733
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Male 73 3 2
## 2 3 Male 44 2 5
## 3 3 Male 60 3 3
## 4 3 Female 48 3 3
## 5 1 Female 32 3 1
## 6 4 Male 32 3 3
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Sales Executive 4 Divorced 4403
## 2 Research Director 3 Single 19626
## 3 Manufacturing Director 4 Single 9362
## 4 Sales Executive 4 Married 10422
## 5 Research Scientist 4 Single 3760
## 6 Manufacturing Director 1 Divorced 8793
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1 9250 2 Y No 11
## 2 17544 1 Y No 14
## 3 19944 2 Y No 11
## 4 24032 1 Y No 19
## 5 17218 1 Y Yes 13
## 6 4809 1 Y No 21
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 3 80 1
## 2 3 1 80 0
## 3 3 3 80 0
## 4 3 3 80 2
## 5 3 3 80 0
## 6 4 3 80 2
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 3 2 5
## 2 21 2 4 20
## 3 10 2 3 2
## 4 14 3 3 14
## 5 6 2 3 6
## 6 9 4 2 9
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 2 0 3
## 2 7 4 9
## 3 2 2 2
## 4 10 5 7
## 5 3 1 3
## 6 7 1 7
I guess top 3 is JobRole, MonthlyIncome, OverTime. Let’s check it.
check the plausible factors related to attrition ; monthly income, job role, overtime, travel
library("ggplot2")
ggplot(df_hr, aes(x=JobRole, fill=Attrition)) + geom_bar(position = "fill")
ggplot(df_hr, aes(x=OverTime, fill=Attrition)) + geom_bar(position = "fill")
ggplot(df_hr, aes(x=BusinessTravel, fill=Attrition)) + geom_bar(position = "fill")
ggplot(df_hr, aes(x=MonthlyIncome, fill=Attrition)) + geom_bar(position = "fill")
Jobrole, overtime and business travel gives good idea but monthly income
doesn’t cause too many categories. I want to check one thing before
re-define income level. BTW, These factors are also related to the
Attrition. interesting. (MaritalStatus, JobSatisfaction, JobInvolvement,
JobLevel,EnvironmentSatisfaction,Education,Age)
it will be better to re-scale the monthly income and check.
df_MonthlyIncome = 0
for (x in 1:nrow(df_hr)) {
if(df_hr$MonthlyIncome[x]<1000){df_MonthlyIncome[x] = 1}
else if(df_hr$MonthlyIncome[x]>=1000 & df_hr$MonthlyIncome[x]<2000){df_MonthlyIncome[x] = 2}
else if(df_hr$MonthlyIncome[x]>=2000 & df_hr$MonthlyIncome[x]<3000){df_MonthlyIncome[x] = 3}
else if(df_hr$MonthlyIncome[x]>=3000 & df_hr$MonthlyIncome[x]<4000){df_MonthlyIncome[x] = 4}
else if(df_hr$MonthlyIncome[x]>=4000 & df_hr$MonthlyIncome[x]<5000){df_MonthlyIncome[x] = 5}
else if(df_hr$MonthlyIncome[x]>=5000 & df_hr$MonthlyIncome[x]<6000){df_MonthlyIncome[x] = 6}
else if(df_hr$MonthlyIncome[x]>=6000 & df_hr$MonthlyIncome[x]<7000){df_MonthlyIncome[x] = 7}
else if(df_hr$MonthlyIncome[x]>=7000 & df_hr$MonthlyIncome[x]<8000){df_MonthlyIncome[x] = 8}
else if(df_hr$MonthlyIncome[x]>=8000 & df_hr$MonthlyIncome[x]<9000){df_MonthlyIncome[x] = 9}
else if(df_hr$MonthlyIncome[x]>=9000 & df_hr$MonthlyIncome[x]<10000){df_MonthlyIncome[x] = 10}
else if(df_hr$MonthlyIncome[x]>=10000 & df_hr$MonthlyIncome[x]<11000){df_MonthlyIncome[x] = 11}
else if(df_hr$MonthlyIncome[x]>=11000 & df_hr$MonthlyIncome[x]<12000){df_MonthlyIncome[x] = 12}
else if(df_hr$MonthlyIncome[x]>=12000 & df_hr$MonthlyIncome[x]<13000){df_MonthlyIncome[x] = 13}
else if(df_hr$MonthlyIncome[x]>=13000 & df_hr$MonthlyIncome[x]<14000){df_MonthlyIncome[x] = 14}
else if(df_hr$MonthlyIncome[x]>=14000 & df_hr$MonthlyIncome[x]<15000){df_MonthlyIncome[x] = 15}
else if(df_hr$MonthlyIncome[x]>=15000 & df_hr$MonthlyIncome[x]<16000){df_MonthlyIncome[x] = 16}
else if(df_hr$MonthlyIncome[x]>=16000 & df_hr$MonthlyIncome[x]<17000){df_MonthlyIncome[x] = 17}
else if(df_hr$MonthlyIncome[x]>=17000 & df_hr$MonthlyIncome[x]<18000){df_MonthlyIncome[x] = 18}
else if(df_hr$MonthlyIncome[x]>=18000 & df_hr$MonthlyIncome[x]<19000){df_MonthlyIncome[x] = 19}
else if(df_hr$MonthlyIncome[x]>=19000 & df_hr$MonthlyIncome[x]<20000){df_MonthlyIncome[x] = 20}
else {df_MonthlyIncome[x] = 8}
}
df_re=df_hr
df_re$MonthlyIncomeLevel <- df_MonthlyIncome
ggplot(df_re, aes(x=MonthlyIncomeLevel, fill=Attrition)) + geom_bar(position = "fill")
hello1 <- df_hr %>% group_by(JobRole) %>% summarize(meanIncome = mean(MonthlyIncome), count = n()) %>% arrange(meanIncome)
print(hello1)
## # A tibble: 9 × 3
## JobRole meanIncome count
## <chr> <dbl> <int>
## 1 Sales Representative 2653. 53
## 2 Laboratory Technician 3222. 153
## 3 Research Scientist 3259. 172
## 4 Human Resources 3285. 27
## 5 Sales Executive 6892. 200
## 6 Healthcare Representative 7435. 76
## 7 Manufacturing Director 7505. 87
## 8 Research Director 15750. 51
## 9 Manager 17197. 51
barplot(hello1$meanIncome, names.arg = hello1$JobRole)
this gives good idea that the income is highly related with job role.
And there are three levels of income here. under 50k, 50k~100k, over
100k.I will recast montlhyincome for only attrition prediction not
salary prediction.
re-define categorical variables like BusinessTravel,Department, EducationField, Gender,JobRole,MaritalStatus,OverTime, etc Gender first! cause only two types for now.
Male is 1
library("stringr")
df_Gender <- 1*str_detect(df_hr$Gender, "Male")
df_cv=df_hr
df_cv$Gender<-df_Gender
BusinessTravel Travel_rarely:0 Travel_Frequently:1 Non-Travel:2
df_BusinessTravel = ifelse(str_detect(df_hr$BusinessTravel,"Travel_Rarely")==TRUE,"0",ifelse(str_detect(df_hr$BusinessTravel,"Travel_Frequently")==TRUE,"1","2"))
df_cv$BusinessTravel<-df_BusinessTravel
attrition No is 1
df_Attrition <- 1*str_detect(df_hr$Attrition, "No")
df_cv$Attrition<-df_Attrition
Sales 0 Research & Development 1 Human Resources 2
df_Department = ifelse(str_detect(df_hr$Department,"Sales")==TRUE,"0",ifelse(str_detect(df_hr$Department,"Research & Development")==TRUE,"1","2"))
df_cv$Department<-df_Department
Healthcare Representative 1 Human Resources 2 Laboratory Technician 3 Manager 4 Manufacturing Director 5 Research Director 6 Research Scientist 7 Sales Executive 8 Sales Representative 9
df_cv$JobRole<-as.integer(factor(df_hr$JobRole, levels = c("Healthcare Representative", "Human Resources","Laboratory Technician","Manager","Manufacturing Director","Research Director","Research Scientist","Sales Executive","Sales Representative")))
df_cv$MaritalStatus<-as.integer(factor(df_hr$MaritalStatus, levels = c("Divorced", "Married","Single")))
Over18 Yes 1 No 2
df_cv$Over18<-as.integer(factor(df_hr$Over18, levels = c("Y", "N")))
OverTime Yes 1 No 2
df_cv$OverTime<-as.integer(factor(df_hr$OverTime, levels = c("Yes", "No")))
EducationField Life Sciences 1 Medical 2 Marketing 3 Technical Degree 4 Human Resources 5 Other 6
df_cv$EducationField<-as.integer(factor(df_hr$EducationField, levels = c("Life Sciences", "Medical","Marketing","Technical Degree","Human Resources","Other")))
correlation map for attrition
library(GGally)
# Convert data to numeric
corr <- data.frame(lapply(df_cv, as.integer))
corr$Attri <- corr$Attrition
# Plot the graph
plot_attrition=ggcorr(corr,
method = c("pairwise", "spearman"),
nbreaks = 6,
hjust = 0.8,
label = TRUE,
label_size = 2,
label_round = 4,
color = "grey50")
plot_attrition
head(df_cv)
## ID Age Attrition BusinessTravel DailyRate Department DistanceFromHome
## 1 1 32 1 0 117 0 13
## 2 2 40 1 0 1308 1 14
## 3 3 35 1 1 200 1 18
## 4 4 32 1 0 801 0 1
## 5 5 24 1 1 567 1 2
## 6 6 27 1 1 294 1 10
## Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 1 4 1 1 859 2
## 2 3 2 1 1128 3
## 3 2 1 1 1412 3
## 4 4 3 1 2016 3
## 5 1 4 1 1646 1
## 6 2 1 1 733 4
## Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction
## 1 1 73 3 2 8 4
## 2 1 44 2 5 6 3
## 3 1 60 3 3 5 4
## 4 0 48 3 3 8 4
## 5 0 32 3 1 7 4
## 6 1 32 3 3 5 1
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime
## 1 1 4403 9250 2 1 2
## 2 3 19626 17544 1 1 2
## 3 3 9362 19944 2 1 2
## 4 2 10422 24032 1 1 2
## 5 3 3760 17218 1 1 1
## 6 1 8793 4809 1 1 2
## PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
## 1 11 3 3 80
## 2 14 3 1 80
## 3 11 3 3 80
## 4 19 3 3 80
## 5 13 3 3 80
## 6 21 4 3 80
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 1 1 8 3 2
## 2 0 21 2 4
## 3 0 10 2 3
## 4 2 14 3 3
## 5 0 6 2 3
## 6 2 9 4 2
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 1 5 2 0
## 2 20 7 4
## 3 2 2 2
## 4 14 10 5
## 5 6 3 1
## 6 9 7 1
## YearsWithCurrManager
## 1 3
## 2 9
## 3 2
## 4 7
## 5 3
## 6 7
overtime 0.27, #1
monthly income, .1995 #4 marrital status, .198 #5 total working year, .1996 #3 stock option level, .201 #2
there will be intended highly turnover rate position.
salary always number 1 factor of retrition so it will be the last choice to reduce retrition. hire more people or promotion and productivity increasing are good way to go
overtime 0.27, #1 marrital status, .198 #5 total working year, .1996 #3 stock option level, .201 #2 monthly income .1992. interesting. #4 will be good
correlation map for monthly income
joblevel .9 total working year .7 years at company .5
correlation network
library(qgraph)
#use cor function to create a correlation matrix with milan.mort dataset
#and put into cormat variable
###################################################
corr1 <- data.frame(lapply(df_cv, as.integer))
cormat=cor(corr1) #correlation matrix generated
## Warning in cor(corr1): the standard deviation is zero
###################################################
###################################################
#now plot a graph of the correlation matrix
###################################################
#qgraph(cormat, shape="circle", posCol="darkgreen", negCol="darkred", layout="groups", vsize=10)
qgraph(cormat, shape="circle", posCol="darkgreen", negCol="darkred", layout="spring", vsize=5)
## Warning in qgraph(cormat, shape = "circle", posCol = "darkgreen", negCol =
## "darkred", : Non-finite weights are omitted
lets make rshiny with this
#nodes
cormat=cor(corr1) #correlation matrix generated
## Warning in cor(corr1): the standard deviation is zero
diag(cormat) <- 0
cormat[upper.tri(cormat)] <- 0
df_nodes <- list("name"=rownames(cormat),"group"=c(1:36),"size"=c(1:36))
df_nodes <- as.data.frame(df_nodes)
#links
rownames(cormat) <- c(0:35)
colnames(cormat) <- c(0:35)
cormat <- cormat %>% replace(is.na(.), 0)
cormat=abs(cormat*100)
cormat=format(round(cormat))
cormat <- cormat %>% replace(is.na(.), 0)
df_pairs=as.data.frame(as.table(cormat))
df_links=df_pairs
names(df_links) <- c("source","target","Freq")
df_links=df_links[order(df_links$Freq, decreasing = TRUE), ]
df_links = df_links[1:60,]
library(networkD3)
forceNetwork(Links = df_links, Nodes = df_nodes, Source = "source",
Target = "target", Value = "Freq", NodeID = "name",
Group = "group", opacity = 0.4, fontSize = 30, zoom = T)
write.csv(df_nodes, file = “df_nodes.csv”) write.csv(df_links, file = “df_links.csv”)
df_cv %>% ggplot(aes(x = MonthlyIncome, y = TotalWorkingYears, color=Attrition > 0)) +
geom_point() +
stat_ellipse()
Lots of outliers!
let’s use GLM and do benchmark with NB
#70/30 training/
# Loop for many k and the average of many training / test partition
hello_scale=df_cv
set.seed(300)
iterations = 1
numks = 1
splitPerc = .70
for(j in 1:iterations)
{
trainIndices = sample(1:dim(hello_scale)[1],round(splitPerc * dim(hello_scale)[1]))
train = hello_scale[trainIndices,]
test = hello_scale[-trainIndices,]
for(i in 1:numks)
{formula = Attrition ~ Age + BusinessTravel + Department + Education + JobLevel + JobRole + MonthlyIncome + NumCompaniesWorked + OverTime + StockOptionLevel + TotalWorkingYears + YearsAtCompany + StockOptionLevel + YearsWithCurrManager + YearsInCurrentRole
classifications = glm(formula, data = train, family = binomial())
}
}
summary(classifications)
##
## Call:
## glm(formula = formula, family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0869 0.2064 0.3777 0.5646 1.5235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.574e+00 1.152e+00 -3.101 0.00193 **
## Age 2.333e-02 1.938e-02 1.204 0.22867
## BusinessTravel1 -7.673e-01 2.977e-01 -2.578 0.00995 **
## BusinessTravel2 2.226e-01 4.359e-01 0.511 0.60962
## Department1 1.003e+00 4.199e-01 2.390 0.01687 *
## Department2 2.405e-01 7.702e-01 0.312 0.75482
## Education -8.390e-03 1.277e-01 -0.066 0.94761
## JobLevel 7.055e-01 4.257e-01 1.657 0.09743 .
## JobRole 6.698e-02 7.716e-02 0.868 0.38534
## MonthlyIncome -5.872e-05 1.025e-04 -0.573 0.56683
## NumCompaniesWorked -1.577e-01 5.471e-02 -2.883 0.00394 **
## OverTime 1.479e+00 2.574e-01 5.745 9.2e-09 ***
## StockOptionLevel 3.875e-01 1.577e-01 2.457 0.01401 *
## TotalWorkingYears 6.075e-02 4.652e-02 1.306 0.19163
## YearsAtCompany -9.437e-02 5.981e-02 -1.578 0.11463
## YearsWithCurrManager 5.678e-02 6.814e-02 0.833 0.40471
## YearsInCurrentRole 6.921e-02 6.359e-02 1.088 0.27641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 520.55 on 608 degrees of freedom
## Residual deviance: 423.57 on 592 degrees of freedom
## AIC: 457.57
##
## Number of Fisher Scoring iterations: 6
predicted <- predict(classifications, test, type = 'response')
accuracy
# confusion matrix
table_mat <- table(test$Attrition, predicted>0.5)
table_mat
##
## FALSE TRUE
## 0 10 37
## 1 1 213
TP = table_mat[1,1]
FP = table_mat[1,2]
FN = table_mat[2,1]
TN = table_mat[2,2]
accuracy = (TN+TP)/(TN+TP+FP+TP)
accuracy
## [1] 0.8259259
sensitivity = TP/(TP+FN)
sensitivity
## [1] 0.9090909
specificity = TN/(TN+FP)
specificity
## [1] 0.852
confusion matrix in caret threshold default is .5 https://www.rdocumentation.org/packages/InformationValue/versions/1.2.3/topics/confusionMatrix
library("caret")
## Loading required package: lattice
linearMod <- lm(train$MonthlyIncome ~ JobLevel + TotalWorkingYears + YearsAtCompany, data=train)
#
summary(linearMod)
##
## Call:
## lm(formula = train$MonthlyIncome ~ JobLevel + TotalWorkingYears +
## YearsAtCompany, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5653.6 -909.0 55.1 715.6 3863.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1724.76 122.15 -14.119 < 2e-16 ***
## JobLevel 3687.49 83.84 43.982 < 2e-16 ***
## TotalWorkingYears 75.44 13.39 5.634 2.71e-08 ***
## YearsAtCompany -38.18 12.61 -3.028 0.00256 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1390 on 605 degrees of freedom
## Multiple R-squared: 0.904, Adjusted R-squared: 0.9036
## F-statistic: 1900 on 3 and 605 DF, p-value: < 2.2e-16
# now use it to predict the salary
predict = predict(linearMod, test, inverval="predict")
RMSE(test$MonthlyIncome,predict)
## [1] 1368.084