library(ggplot2)
library(caTools)
library(ROCR)
library(rpart)
library(rpart.plot)
library(corrplot)
library(caret)
library(reshape2)
library(pROC)
#loading dataset
att_data=read.csv("D:/capstone/attrition_data.csv")
#HEAD
head(att_data)
colnames(att_data)[1]="Age"
#checking null values
data.frame(colSums(is.na(att_data)))
Hence it is clear that our data has no missing values, so we can move on to the descriptive analysis of the data.
#Checking structure of the given attributes in data
str(att_data)
'data.frame': 1470 obs. of 35 variables:
$ Age : int 41 49 37 33 27 32 59 30 38 36 ...
$ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
$ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
$ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
$ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
$ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
$ Education : int 2 1 2 4 1 2 3 1 3 3 ...
$ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
$ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
$ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
$ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
$ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
$ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
$ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
$ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
$ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
$ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
$ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
$ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
$ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
$ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
$ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
$ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
$ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
$ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
$ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
$ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
$ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
$ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
$ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
$ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
$ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
$ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
$ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
We can see that our data has 1470 observations and 35 variables, along with the class of each variable.
Some of the variables (like Education, JobInvolvement, EnvironmentSatisfaction…) are ordinal but are stored as integers in our data, so we need to convert them to factors.
ord_col=c("Education","EnvironmentSatisfaction","JobInvolvement","JobLevel",
"JobSatisfaction","PerformanceRating","RelationshipSatisfaction",
"StandardHours","StockOptionLevel","WorkLifeBalance")
att_data[,ord_col]=lapply(att_data[,ord_col],as.factor)
str(att_data)
'data.frame': 1470 obs. of 35 variables:
$ Age : int 41 49 37 33 27 32 59 30 38 36 ...
$ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
$ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
$ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
$ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
$ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
$ Education : Factor w/ 5 levels "1","2","3","4",..: 2 1 2 4 1 2 3 1 3 3 ...
$ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
$ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
$ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
$ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 3 4 4 1 4 3 4 4 3 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
$ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
$ JobInvolvement : Factor w/ 4 levels "1","2","3","4": 3 2 2 3 3 3 4 3 2 3 ...
$ JobLevel : Factor w/ 5 levels "1","2","3","4",..: 2 2 1 1 1 1 1 1 3 2 ...
$ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
$ JobSatisfaction : Factor w/ 4 levels "1","2","3","4": 4 2 3 3 2 4 1 3 3 3 ...
$ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
$ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
$ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
$ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
$ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
$ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
$ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
$ PerformanceRating : Factor w/ 2 levels "3","4": 1 2 1 1 1 1 2 2 2 1 ...
$ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 1 4 2 3 4 3 1 2 2 2 ...
$ StandardHours : Factor w/ 1 level "80": 1 1 1 1 1 1 1 1 1 1 ...
$ StockOptionLevel : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 2 1 4 2 1 3 ...
$ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
$ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
$ WorkLifeBalance : Factor w/ 4 levels "1","2","3","4": 1 3 3 3 3 2 2 3 3 2 ...
$ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
$ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
$ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
$ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
Now the ordinal columns are factors, while the remaining numeric variables stay in integer format.
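As an aside, as.factor produces unordered factors, so the logistic model later will code each level as its own dummy variable. If we wanted the level ordering itself to enter the model, ordered factors are the alternative; a small sketch (illustrative only, not applied in this analysis, and att_data_ord is a hypothetical copy):
#Alternative (not used below): ordered factors make glm use polynomial
#contrasts (linear, quadratic, ...) instead of one dummy per level
att_data_ord=att_data
att_data_ord[,ord_col]=lapply(att_data_ord[,ord_col],function(x) factor(x,ordered=TRUE))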
#summarising data
summary(att_data)
Age Attrition BusinessTravel DailyRate
Min. :18.00 No :1233 Non-Travel : 150 Min. : 102.0
1st Qu.:30.00 Yes: 237 Travel_Frequently: 277 1st Qu.: 465.0
Median :36.00 Travel_Rarely :1043 Median : 802.0
Mean :36.92 Mean : 802.5
3rd Qu.:43.00 3rd Qu.:1157.0
Max. :60.00 Max. :1499.0
Department DistanceFromHome Education
Human Resources : 63 Min. : 1.000 1:170
Research & Development:961 1st Qu.: 2.000 2:282
Sales :446 Median : 7.000 3:572
Mean : 9.193 4:398
3rd Qu.:14.000 5: 48
Max. :29.000
EducationField EmployeeCount EmployeeNumber
Human Resources : 27 Min. :1 Min. : 1.0
Life Sciences :606 1st Qu.:1 1st Qu.: 491.2
Marketing :159 Median :1 Median :1020.5
Medical :464 Mean :1 Mean :1024.9
Other : 82 3rd Qu.:1 3rd Qu.:1555.8
Technical Degree:132 Max. :1 Max. :2068.0
EnvironmentSatisfaction Gender HourlyRate JobInvolvement
1:284 Female:588 Min. : 30.00 1: 83
2:287 Male :882 1st Qu.: 48.00 2:375
3:453 Median : 66.00 3:868
4:446 Mean : 65.89 4:144
3rd Qu.: 83.75
Max. :100.00
JobLevel JobRole JobSatisfaction MaritalStatus
1:543 Sales Executive :326 1:289 Divorced:327
2:534 Research Scientist :292 2:280 Married :673
3:218 Laboratory Technician :259 3:442 Single :470
4:106 Manufacturing Director :145 4:459
5: 69 Healthcare Representative:131
Manager :102
(Other) :215
MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime
Min. : 1009 Min. : 2094 Min. :0.000 Y:1470 No :1054
1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000 Yes: 416
Median : 4919 Median :14236 Median :2.000
Mean : 6503 Mean :14313 Mean :2.693
3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000
Max. :19999 Max. :26999 Max. :9.000
PercentSalaryHike PerformanceRating RelationshipSatisfaction
Min. :11.00 3:1244 1:276
1st Qu.:12.00 4: 226 2:303
Median :14.00 3:459
Mean :15.21 4:432
3rd Qu.:18.00
Max. :25.00
StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
80:1470 0:631 Min. : 0.00 Min. :0.000
1:596 1st Qu.: 6.00 1st Qu.:2.000
2:158 Median :10.00 Median :3.000
3: 85 Mean :11.28 Mean :2.799
3rd Qu.:15.00 3rd Qu.:3.000
Max. :40.00 Max. :6.000
WorkLifeBalance YearsAtCompany YearsInCurrentRole
1: 80 Min. : 0.000 Min. : 0.000
2:344 1st Qu.: 3.000 1st Qu.: 2.000
3:893 Median : 5.000 Median : 3.000
4:153 Mean : 7.008 Mean : 4.229
3rd Qu.: 9.000 3rd Qu.: 7.000
Max. :40.000 Max. :18.000
YearsSinceLastPromotion YearsWithCurrManager
Min. : 0.000 Min. : 0.000
1st Qu.: 0.000 1st Qu.: 2.000
Median : 1.000 Median : 3.000
Mean : 2.188 Mean : 4.123
3rd Qu.: 3.000 3rd Qu.: 7.000
Max. :15.000 Max. :17.000
The summary above describes the distribution of the data: central tendencies for the integer variables and frequency distributions for the categorical columns.
#Distribution of the data
int_vars=colnames(att_data[which(sapply(att_data,class)=="integer")])
melt_attrition_dat = melt(att_data[,c("Attrition", int_vars)], id.var = "Attrition")
head(melt_attrition_dat)
  Attrition variable value
1       Yes      Age    41
2        No      Age    49
3       Yes      Age    37
4        No      Age    33
5        No      Age    27
6        No      Age    32
p <- ggplot(data = melt_attrition_dat , aes(x=variable, y=value,fill=Attrition)) + geom_boxplot()+scale_fill_viridis_d()
p <- p + facet_wrap( ~ variable, scales="free")
p
Here we can see the distribution of all the integer-class variables, split by attrition status.
cat_cols=colnames(att_data[which(sapply(att_data,class)=="factor")])
freq_tbl=apply(att_data[,cat_cols],2, function(x) table(att_data$Attrition,x))
freq_tbl= lapply(freq_tbl,function(x) as.data.frame.matrix(x))
freq_tbl
(Output: a contingency table of Attrition against each categorical variable — $Attrition, $BusinessTravel, $Department, $Education, $EducationField, $EnvironmentSatisfaction, $Gender, $JobInvolvement, $JobLevel, $JobRole, $JobSatisfaction, $MaritalStatus, $Over18, $OverTime, $PerformanceRating, $RelationshipSatisfaction, $StandardHours, $StockOptionLevel, $WorkLifeBalance — individual tables omitted.)
#Example: inspecting a single table (names(freq_tbl)[2] is BusinessTravel)
a=names(freq_tbl)[2]
freq_tbl[a][[1]]
for(name in names(freq_tbl)[-1]){
#attrition rate per level: the "Yes" count divided by the column total
var_data <- data.frame(apply(freq_tbl[name][[1]],2, function(x) x[2]/sum(x)))
colnames(var_data) <- name
my_plot <- ggplot(data=var_data, aes(x=row.names(var_data), y=var_data[,name])) + geom_bar(stat="identity",fill='red') +
ylim(0.0,1.0) + ylab("%attrition") + xlab(name) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(my_plot)
remove(my_plot)
}
From the above graphs we can read the attrition rate of employees for each level of every categorical attribute.
library(gridExtra)
g1=ggplot(data = att_data,aes(MonthlyIncome,EmployeeNumber,size=Age,col=Attrition))+geom_point(alpha=0.7)+ggtitle("Attrition vs Monthlyincome (1)")
g2=ggplot(data = att_data[which(att_data$Attrition=='Yes'),],aes(MonthlyIncome,EmployeeNumber,size=Age))+geom_point(col='skyblue',alpha=0.7)+ggtitle("Attrition vs Monthlyincome (2)")
grid.arrange(g1,g2)
The first plot shows attrition against the MonthlyIncome and Age of the employees.
From the second graph we can conclude that employees with MonthlyIncome below 15000 and Age between 20 and 40 have a higher chance of leaving the company.
#plotting corrplot
corrplot(cor(att_data[setdiff(int_vars,"EmployeeCount")]))
The above graph shows the pairwise correlations between the variables, and we can clearly see that some of them have a strong positive correlation (like Age and TotalWorkingYears).
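If we wanted to act on these correlations, caret (already loaded) provides findCorrelation to suggest which variable of a highly correlated pair to drop; a hedged sketch, where the 0.75 cutoff is an arbitrary illustrative choice:
#Optional: flag candidates for removal among strongly correlated numeric variables
cor_mat=cor(att_data[setdiff(int_vars,"EmployeeCount")])
colnames(cor_mat)[findCorrelation(cor_mat,cutoff=0.75)]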
#1)Feature Reduction using entropy
library(entropy)
library(dplyr)
#1.1)Calculating entropy of the categorical variable
entropy_cat = unlist(lapply(att_data[,cat_cols],function(x) entropy(table(x))))/unlist(lapply(att_data[,cat_cols],function(x) log2(length(x))))
#1.2)Picking variable with 0 entropy
zero_entropy_variable= names(entropy_cat[entropy_cat==0])
#2)Feature Reduction using variance
#2.1)Normalising the data
norm_att_data=as.data.frame.matrix(apply(att_data[,int_vars],2,function(x) (x-min(x))/(max(x)-min(x)) ))
#2.2)Calculating variance
norm_att_data=apply(norm_att_data,2,var)
#2.3)Picking variables whose normalised variance is undefined (constant columns)
low_var=names(norm_att_data[is.na(norm_att_data)==TRUE])
#Removing variable with 0 entropy and low variance
att_data1=select(att_data,-c(zero_entropy_variable,low_var,"EmployeeNumber"))
#dimension
dim(att_data1)
[1] 1470 31
Here we can see that four features were removed from the data: those with zero entropy (Over18, StandardHours), the one with undefined variance (EmployeeCount), and EmployeeNumber, which is removed as well since it is just a nominal ID.
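As a cross-check on this manual entropy/variance screen, caret's nearZeroVar flags the same kind of uninformative columns in a single call; a sketch (nzv is an illustrative name):
#Cross-check: columns with zero variance / a single level
nzv=nearZeroVar(att_data,saveMetrics=TRUE)
rownames(nzv)[nzv$zeroVar] #expected to flag EmployeeCount, Over18, StandardHours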
Splitting the data randomly in a 75:25 ratio for training and testing the model.
#Splitting the data in 75:25 proportion
library(caTools)
set.seed(11)
splt=sample.split(att_data1$Attrition,SplitRatio = 0.75)
train_dat=subset(att_data1,splt==TRUE)
test_dat=subset(att_data1,splt==FALSE)
print(paste("Train data has ",nrow(train_dat),"observations"))
[1] "Train data has 1103 observations"
print(paste("Test data has ",nrow(test_dat),"observations"))
[1] "Test data has 367 observations"
#Building logistic regression model
set.seed(111)
model_dat=glm(Attrition~.,data = train_dat,family = "binomial")
#Summary of model
summary(model_dat)
Call:
glm(formula = Attrition ~ ., family = "binomial", data = train_dat)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.91844 -0.41889 -0.18242 -0.04541 3.08249
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.159e+01 7.064e+02 -0.016 0.986915
Age -4.859e-02 1.720e-02 -2.824 0.004737 **
BusinessTravelTravel_Frequently 2.618e+00 6.033e-01 4.340 1.43e-05 ***
BusinessTravelTravel_Rarely 1.463e+00 5.633e-01 2.596 0.009418 **
DailyRate -5.154e-04 2.852e-04 -1.808 0.070666 .
DepartmentResearch & Development 1.501e+01 7.064e+02 0.021 0.983047
DepartmentSales 1.344e+01 7.064e+02 0.019 0.984827
DistanceFromHome 5.329e-02 1.396e-02 3.817 0.000135 ***
Education2 5.362e-03 4.240e-01 0.013 0.989909
Education3 1.359e-01 3.685e-01 0.369 0.712178
Education4 2.821e-01 3.988e-01 0.708 0.479230
Education5 1.245e-01 7.755e-01 0.161 0.872434
EducationFieldLife Sciences -6.579e-01 1.157e+00 -0.569 0.569447
EducationFieldMarketing -3.030e-01 1.203e+00 -0.252 0.801096
EducationFieldMedical -8.482e-01 1.153e+00 -0.736 0.461935
EducationFieldOther -3.357e-01 1.236e+00 -0.271 0.786010
EducationFieldTechnical Degree 6.766e-01 1.174e+00 0.576 0.564325
EnvironmentSatisfaction2 -7.970e-01 3.417e-01 -2.333 0.019667 *
EnvironmentSatisfaction3 -1.302e+00 3.294e-01 -3.952 7.76e-05 ***
EnvironmentSatisfaction4 -1.451e+00 3.290e-01 -4.409 1.04e-05 ***
GenderMale 4.375e-01 2.353e-01 1.859 0.063050 .
HourlyRate 1.887e-04 5.786e-03 0.033 0.973992
JobInvolvement2 -1.347e+00 4.583e-01 -2.940 0.003280 **
JobInvolvement3 -1.496e+00 4.341e-01 -3.446 0.000569 ***
JobInvolvement4 -2.174e+00 6.039e-01 -3.599 0.000319 ***
JobLevel2 -1.607e+00 5.277e-01 -3.046 0.002315 **
JobLevel3 -2.992e-01 8.404e-01 -0.356 0.721842
JobLevel4 -1.963e+00 1.521e+00 -1.290 0.196919
JobLevel5 1.029e+00 1.891e+00 0.544 0.586243
JobRoleHuman Resources 1.625e+01 7.064e+02 0.023 0.981648
JobRoleLaboratory Technician 1.584e+00 7.824e-01 2.024 0.042953 *
JobRoleManager 6.024e-01 1.281e+00 0.470 0.638218
JobRoleManufacturing Director 1.476e+00 7.665e-01 1.925 0.054182 .
JobRoleResearch Director -1.731e+00 1.326e+00 -1.305 0.191959
JobRoleResearch Scientist 6.122e-01 8.014e-01 0.764 0.444886
JobRoleSales Executive 3.765e+00 1.682e+00 2.239 0.025170 *
JobRoleSales Representative 3.571e+00 1.761e+00 2.028 0.042558 *
JobSatisfaction2 -1.078e+00 3.498e-01 -3.080 0.002067 **
JobSatisfaction3 -8.495e-01 3.100e-01 -2.741 0.006132 **
JobSatisfaction4 -1.569e+00 3.283e-01 -4.780 1.75e-06 ***
MaritalStatusMarried 9.250e-02 3.504e-01 0.264 0.791772
MaritalStatusSingle 3.223e-01 4.899e-01 0.658 0.510655
MonthlyIncome -6.515e-05 1.107e-04 -0.588 0.556204
MonthlyRate 1.350e-05 1.576e-05 0.856 0.391862
NumCompaniesWorked 2.090e-01 4.835e-02 4.323 1.54e-05 ***
OverTimeYes 2.319e+00 2.580e-01 8.990 < 2e-16 ***
PercentSalaryHike -4.636e-02 4.898e-02 -0.946 0.343904
PerformanceRating4 2.202e-01 5.075e-01 0.434 0.664319
RelationshipSatisfaction2 -4.632e-01 3.573e-01 -1.296 0.194854
RelationshipSatisfaction3 -8.227e-01 3.147e-01 -2.614 0.008939 **
RelationshipSatisfaction4 -8.541e-01 3.195e-01 -2.674 0.007504 **
StockOptionLevel1 -1.417e+00 3.881e-01 -3.650 0.000263 ***
StockOptionLevel2 -1.433e+00 5.665e-01 -2.530 0.011416 *
StockOptionLevel3 -6.412e-01 5.986e-01 -1.071 0.284056
TotalWorkingYears -2.329e-02 3.626e-02 -0.642 0.520676
TrainingTimesLastYear -1.681e-01 8.844e-02 -1.901 0.057295 .
WorkLifeBalance2 -9.458e-01 4.704e-01 -2.011 0.044373 *
WorkLifeBalance3 -1.495e+00 4.416e-01 -3.385 0.000713 ***
WorkLifeBalance4 -1.121e+00 5.287e-01 -2.121 0.033938 *
YearsAtCompany 1.595e-01 5.074e-02 3.143 0.001671 **
YearsInCurrentRole -2.031e-01 6.302e-02 -3.224 0.001265 **
YearsSinceLastPromotion 1.613e-01 5.549e-02 2.907 0.003654 **
YearsWithCurrManager -2.340e-01 6.167e-02 -3.794 0.000148 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 974.94 on 1102 degrees of freedom
Residual deviance: 564.13 on 1040 degrees of freedom
AIC: 690.13
Number of Fisher Scoring iterations: 15
From the above summary we can see the residual deviance and AIC of the model, as well as the important variables (those with p-value < 0.05).
Making predictions: initially we will make predictions using the model with all the variables included and check its performance.
#Making predictions
predict_dat=predict(model_dat,newdata=test_dat,type="response")
The output of the logistic regression model is a probability. To turn it into actual predictions we build a confusion matrix using a threshold value “t”: probabilities above the threshold are classified as “Yes” and those below as “No”.
Initially we set the threshold to 0.5, the conventional midpoint that keeps sensitivity and specificity in balance. Raising the threshold increases specificity but decreases sensitivity, meaning more errors in predicting the employees who will churn; similarly, lowering the threshold increases sensitivity at the cost of specificity, meaning more errors in predicting the employees who will not churn.
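To see this trade-off concretely before fixing a cutoff, a quick sketch (not part of the original flow) sweeps a few thresholds over the same predictions:
#Sweeping candidate thresholds: sensitivity falls as specificity rises
for(t in seq(0.2,0.8,0.1)){
cm=table(test_dat$Attrition,factor(predict_dat>t,levels=c(FALSE,TRUE)))
sens=cm["Yes","TRUE"]/sum(cm["Yes",])
spec=cm["No","FALSE"]/sum(cm["No",])
print(paste("t =",t,"sensitivity:",round(sens,3),"specificity:",round(spec,3)))
}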
#Making confusion matrix with threshold 0.5
confusion_dat=table(test_dat$Attrition,predict_dat>0.5)
print("Confusion matrix for threshold 0.5")
[1] "Confusion matrix for threshold 0.5"
confusion_dat
FALSE TRUE
No 291 17
Yes 24 35
#Calculating sensitivity / true positive rate
tp=confusion_dat[4] #true positives: the (Yes, TRUE) cell
total_p=tp+confusion_dat[2] #actual positives: TP + false negatives
sensitivity_dat=tp/total_p
print(paste("sensitivity:",sensitivity_dat))
[1] "sensitivity: 0.593220338983051"
#Calculating specificity / true negative rate
tn=confusion_dat[1] #true negatives: the (No, FALSE) cell
total_n=tn+confusion_dat[3] #actual negatives: TN + false positives
specificity_dat=tn/total_n
print(paste("specificity:",specificity_dat))
[1] "specificity: 0.944805194805195"
Here the specificity is far higher than the sensitivity, so to address this imbalance we will look for an optimal threshold.
To find it we plot a ROC curve (true positive rate vs false positive rate), from which we can read off the cutoff with the best trade-off between the two error rates.
#Receiver Operating Characteristic (ROC) curve
rocr_curve=function(predi){
#Prediction
rocr_pred=prediction(predi,test_dat$Attrition)
#Performance
rocr_perf=performance(rocr_pred,"tpr","fpr")
#plotting ROCR
plot(rocr_perf,colorize=TRUE,print.cutoffs.at=seq(0,1,0.1),text.adj=c(-0.2,1.7))
}
rocr_curve(predict_dat)
From the ROC plot we can see that 0.25 is roughly the optimal threshold.
So now we will make the actual predictions using threshold = 0.25 and check the accuracy, sensitivity, specificity and kappa value.
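Instead of reading the cutoff off the plot by eye, it can also be extracted from the ROCR objects directly; a sketch that maximizes Youden's J statistic (tpr - fpr):
#Extracting the cutoff that maximizes tpr - fpr from the performance object
rocr_pred=prediction(predict_dat,test_dat$Attrition)
rocr_perf=performance(rocr_pred,"tpr","fpr")
j=rocr_perf@y.values[[1]]-rocr_perf@x.values[[1]]
rocr_perf@alpha.values[[1]][which.max(j)] #should land near the 0.25 read off the plot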
#Confusion matrix at the optimal threshold (0.25)
confus_dat1=table(test_dat$Attrition,predict_dat>0.25)
print("Confusion matrix with optimum threshold")
[1] "Confusion matrix with optimum threshold"
print(confus_dat1)
FALSE TRUE
No 263 45
Yes 15 44
model_perform=function(confu){
print('Model Performance:')
#calculating sensitivity
tp1=confu[4]
totl_p=tp1+confu[2]
sens=tp1/totl_p
print(paste("sensitivity:",sens))
#calculating specificity
tn1=confu[1]
totl_n=tn1+confu[3]
spec=tn1/totl_n
print(paste("specificity:",spec))
#calculating accuracy
acc_dat1=sum(diag(confu))/sum(confu)
print(paste("Accuracy:",round(acc_dat1*100),"%"))
}
model_perform(confus_dat1)
[1] "Model Performance:"
[1] "senstivity: 0.593220338983051"
[1] "specificity: 0.853896103896104"
[1] "Accuracy: 84 %"
Here we get a model with good accuracy, and since we are more concerned about correctly predicting the employees who will churn, this threshold suits our purpose.
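The kappa value mentioned above was not actually computed; caret (already loaded) reports it alongside sensitivity and specificity in one call. A sketch at the same 0.25 cutoff (pred_class is an illustrative name):
#Kappa, sensitivity and specificity together via caret
pred_class=factor(ifelse(predict_dat>0.25,"Yes","No"),levels=c("No","Yes"))
confusionMatrix(pred_class,test_dat$Attrition,positive="Yes")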
Now we will try to reduce the complexity of the model by selecting only the important features, chosen by p-value: in the model summary, all variables with a p-value below 0.05 are considered important.
summ_dat=as.data.frame.matrix(summary(model_dat)$coef)
sign_var=summ_dat[summ_dat$`Pr(>|z|)`<0.05,]
sign_var
Above are the significant variables; now we will build a model using only these and check the trade-off between complexity and accuracy.
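The list below is typed by hand; a sketch of deriving it programmatically from sign_var instead. Note the prefix matching is crude (Education would also match EducationField coefficients) and it recovers JobRole and the Years* variables too, so the hand-picked list below reflects a slightly different judgment call:
#Mapping significant dummy-coefficient names back to their source columns
coef_names=rownames(sign_var)
candidates=setdiff(names(train_dat),"Attrition")
candidates[sapply(candidates,function(v) any(startsWith(coef_names,v)))]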
#Selecting significant variable names
name_signi=c("Age","BusinessTravel","DistanceFromHome","EnvironmentSatisfaction","Gender","JobInvolvement","JobLevel",
"JobSatisfaction","NumCompaniesWorked","OverTime","RelationshipSatisfaction",
"StockOptionLevel","WorkLifeBalance","YearsSinceLastPromotion", "YearsWithCurrManager")
#Building model
model_signi=glm(Attrition~.,data = train_dat[,c("Attrition",name_signi)],family = "binomial")
summary(model_signi)
Call:
glm(formula = Attrition ~ ., family = "binomial", data = train_dat[,
c("Attrition", name_signi)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1400 -0.4827 -0.2572 -0.1034 3.8460
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.60837 0.89237 2.923 0.003467 **
Age -0.04958 0.01375 -3.606 0.000311 ***
BusinessTravelTravel_Frequently 2.23263 0.54159 4.122 3.75e-05 ***
BusinessTravelTravel_Rarely 1.29233 0.51517 2.509 0.012122 *
DistanceFromHome 0.04507 0.01269 3.552 0.000382 ***
EnvironmentSatisfaction2 -0.63252 0.30604 -2.067 0.038755 *
EnvironmentSatisfaction3 -0.94279 0.28913 -3.261 0.001111 **
EnvironmentSatisfaction4 -1.11141 0.29305 -3.793 0.000149 ***
GenderMale 0.44480 0.21216 2.097 0.036037 *
JobInvolvement2 -1.23489 0.41718 -2.960 0.003076 **
JobInvolvement3 -1.43074 0.39265 -3.644 0.000269 ***
JobInvolvement4 -2.15916 0.56174 -3.844 0.000121 ***
JobLevel2 -1.23814 0.25765 -4.805 1.54e-06 ***
JobLevel3 -0.78077 0.34101 -2.290 0.022044 *
JobLevel4 -2.03736 0.70595 -2.886 0.003902 **
JobLevel5 -0.99226 0.67429 -1.472 0.141142
JobSatisfaction2 -0.76296 0.31963 -2.387 0.016987 *
JobSatisfaction3 -0.59131 0.27465 -2.153 0.031319 *
JobSatisfaction4 -1.31119 0.29711 -4.413 1.02e-05 ***
NumCompaniesWorked 0.16109 0.04138 3.892 9.92e-05 ***
OverTimeYes 1.96367 0.22484 8.734 < 2e-16 ***
RelationshipSatisfaction2 -0.46350 0.31659 -1.464 0.143183
RelationshipSatisfaction3 -0.71781 0.28640 -2.506 0.012198 *
RelationshipSatisfaction4 -0.77716 0.29414 -2.642 0.008239 **
StockOptionLevel1 -1.32750 0.23591 -5.627 1.83e-08 ***
StockOptionLevel2 -1.53873 0.44518 -3.456 0.000547 ***
StockOptionLevel3 -0.71447 0.47612 -1.501 0.133455
WorkLifeBalance2 -0.94262 0.40221 -2.344 0.019100 *
WorkLifeBalance3 -1.26483 0.37405 -3.381 0.000721 ***
WorkLifeBalance4 -1.04510 0.46404 -2.252 0.024312 *
YearsSinceLastPromotion 0.16447 0.04373 3.761 0.000169 ***
YearsWithCurrManager -0.17747 0.04329 -4.100 4.13e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 974.94 on 1102 degrees of freedom
Residual deviance: 649.40 on 1071 degrees of freedom
AIC: 713.4
Number of Fisher Scoring iterations: 6
Now we can make predictions with this model and check its performance at the optimal threshold.
#Making predictions for model with significance variables
pred_dat1=predict(model_signi,test_dat[,c("Attrition",name_signi)],type = "response")
#Plotting the ROC curve to pick the optimal threshold
rocr_curve(pred_dat1)
From the above plot we can observe that the optimal threshold lies between 0.2 and 0.3; we will use 0.28.
Evaluating the performance of the model with significant variables only.
#making confusion matrix
print("Confusion Matrix of model with significant variables")
[1] "Confusion Matrix of model with significant variables"
confus_dat2=table(test_dat$Attrition,pred_dat1>0.28)
confus_dat2
FALSE TRUE
No 265 43
Yes 19 40
model_perform(confu = confus_dat2)
[1] "Model Performance:"
[1] "senstivity: 0.593220338983051"
[1] "specificity: 0.86038961038961"
[1] "Accuracy: 83 %"
#Making decision tree model
set.seed(1111)
dt_model=rpart(Attrition~.,method = "class",data = train_dat)
Now we can predict on the test data using this model and evaluate its performance as we did for logistic regression.
#Making predictions
pred_dat2=as.data.frame.matrix(predict(dt_model,test_dat,type = "prob"))
pred_dat2=pred_dat2$Yes
#Plotting rocr curve
rocr_curve(predi = pred_dat2)
From the above we can observe that the optimal threshold lies between 0.2 and 0.3, so we will evaluate the model performance at 0.2.
#Making confusion matrix
confu_dat3=table(test_dat$Attrition,pred_dat2>0.2)
print("Confusion Matrix for DT model")
[1] "Confusion Matrix for DT model"
confu_dat3
FALSE TRUE
No 291 17
Yes 33 26
model_perform(confu_dat3)
[1] "Model Performance:"
[1] "senstivity: 0.593220338983051"
[1] "specificity: 0.944805194805195"
[1] "Accuracy: 86 %"
As we can see, the accuracy has improved a little, but the specificity has increased remarkably while the sensitivity has dropped: fewer errors in predicting the employees who will not churn, but more errors in predicting the employees who will churn.
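If sensitivity is the priority, rpart also accepts a loss matrix, so a missed churner can be penalized more heavily than a false alarm; a hedged sketch with an illustrative, untuned 3:1 penalty (dt_model_loss is a hypothetical model name):
#loss[i,j] = cost of predicting class j when the truth is class i;
#rows/columns follow the factor levels ("No","Yes"), so a missed churner costs 3x
loss_mat=matrix(c(0,3,
1,0),nrow=2)
dt_model_loss=rpart(Attrition~.,method="class",data=train_dat,
parms=list(loss=loss_mat))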
Now, just as we picked the significant variables for logistic regression, we will do the same for the decision tree and evaluate its performance: model building including only the important variables.
#Picking important variables
imp_varr=names(sort(dt_model$variable.importance,decreasing = TRUE))
#Model Building
dt_model2=rpart(Attrition~.,method = "class",data = train_dat[,c("Attrition",imp_varr)])
Decision Tree Model Performance: evaluating the performance of the decision tree model built with important variables only.
#Making prediction
pred_dat3=as.data.frame.matrix(predict(dt_model2,test_dat[,c("Attrition",imp_varr)],type = "prob"))
pred_dat3=pred_dat3$Yes
#rocr curve
rocr_curve(pred_dat3)
Here we can observe that the optimal threshold is again around 0.2, so we will evaluate the model performance at that cutoff.
#Making confusion matrix
confu_dat4=table(test_dat$Attrition,pred_dat3>0.2)
print("Confusion matrix for important variables")
[1] "Confusion matrix for important variables"
confu_dat4
FALSE TRUE
No 291 17
Yes 33 26
model_perform(confu_dat4)
[1] "Model Performance:"
[1] "senstivity: 0.593220338983051"
[1] "specificity: 0.944805194805195"
[1] "Accuracy: 86 %"
prp(dt_model2,roundint = FALSE)
Above is the decision tree diagram for the model built with the important variables.
Now we will try to introduce new features and build a model with them to evaluate its performance.
#Copying test train data
train_new_dat=train_dat
test_new_dat=test_dat
#Making a new feature: first company (employee has worked for one company only)
firstcompany=att_data$NumCompaniesWorked==1
#Making a loyality feature: share of total working years spent at this company
loyality=att_data$YearsAtCompany/att_data$TotalWorkingYears
loyality[is.na(loyality)]=0 #0/0 cases (no working years) set to 0
#Making volatility as a new feature: average tenure per company worked for
volatility = att_data$TotalWorkingYears/att_data$NumCompaniesWorked
volatility[is.infinite(volatility)]=att_data$TotalWorkingYears[is.infinite(volatility)]
#Adding new features in train test data
train_new_dat$firstcompany=firstcompany[splt==TRUE]
train_new_dat$loyality=loyality[splt==TRUE]
train_new_dat$volatility=volatility[splt==TRUE]
test_new_dat$firstcompany=firstcompany[splt==FALSE]
test_new_dat$loyality=loyality[splt==FALSE]
test_new_dat$volatility=volatility[splt==FALSE]
#Picking new features
new_feat=c("firstcompany","loyality","volatility")
Now that we have created these new features, we can build a new model and check its performance:
model building with the new features plus the significant features.
#Model Building
model_new_feat=glm(Attrition~.,data = train_new_dat[,c("Attrition",name_signi,new_feat)],family = "binomial")
summary(model_new_feat)
Call:
glm(formula = Attrition ~ ., family = "binomial", data = train_new_dat[,
c("Attrition", name_signi, new_feat)])
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1095 -0.4768 -0.2588 -0.1002 3.9119
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.211467 0.975990 2.266 0.023459 *
Age -0.044252 0.014639 -3.023 0.002504 **
BusinessTravelTravel_Frequently 2.271981 0.544798 4.170 3.04e-05 ***
BusinessTravelTravel_Rarely 1.320968 0.518432 2.548 0.010834 *
DistanceFromHome 0.045034 0.012712 3.543 0.000396 ***
EnvironmentSatisfaction2 -0.642659 0.306483 -2.097 0.036004 *
EnvironmentSatisfaction3 -0.952836 0.289995 -3.286 0.001017 **
EnvironmentSatisfaction4 -1.131019 0.294474 -3.841 0.000123 ***
GenderMale 0.467585 0.213825 2.187 0.028760 *
JobInvolvement2 -1.266370 0.420309 -3.013 0.002587 **
JobInvolvement3 -1.447307 0.396168 -3.653 0.000259 ***
JobInvolvement4 -2.154320 0.564135 -3.819 0.000134 ***
JobLevel2 -1.198499 0.267044 -4.488 7.19e-06 ***
JobLevel3 -0.752171 0.355111 -2.118 0.034164 *
JobLevel4 -1.988190 0.739288 -2.689 0.007160 **
JobLevel5 -0.974479 0.701801 -1.389 0.164972
JobSatisfaction2 -0.766564 0.320723 -2.390 0.016843 *
JobSatisfaction3 -0.592820 0.275563 -2.151 0.031452 *
JobSatisfaction4 -1.313512 0.298732 -4.397 1.10e-05 ***
NumCompaniesWorked 0.189683 0.057605 3.293 0.000992 ***
OverTimeYes 1.975797 0.225943 8.745 < 2e-16 ***
RelationshipSatisfaction2 -0.473502 0.316516 -1.496 0.134658
RelationshipSatisfaction3 -0.721628 0.286964 -2.515 0.011914 *
RelationshipSatisfaction4 -0.795014 0.295916 -2.687 0.007218 **
StockOptionLevel1 -1.312684 0.236788 -5.544 2.96e-08 ***
StockOptionLevel2 -1.548887 0.446240 -3.471 0.000519 ***
StockOptionLevel3 -0.718871 0.480022 -1.498 0.134242
WorkLifeBalance2 -0.963807 0.401893 -2.398 0.016477 *
WorkLifeBalance3 -1.291353 0.374172 -3.451 0.000558 ***
WorkLifeBalance4 -1.046844 0.463310 -2.259 0.023853 *
YearsSinceLastPromotion 0.163445 0.045236 3.613 0.000303 ***
YearsWithCurrManager -0.184868 0.052021 -3.554 0.000380 ***
firstcompanyTRUE 0.363822 0.316608 1.149 0.250505
loyality 0.033154 0.493545 0.067 0.946442
volatility -0.002868 0.038053 -0.075 0.939914
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 974.94 on 1102 degrees of freedom
Residual deviance: 647.48 on 1068 degrees of freedom
AIC: 717.48
Number of Fisher Scoring iterations: 6
Now we can check the model performance and compare it with the others:
evaluating the model built on the new features together with the significant features.
#Making prediction
pred_dat4=predict(model_new_feat,test_new_dat[,c("Attrition",name_signi,new_feat)],type = "response")
#plotting Rocr curve
rocr_curve(predi = pred_dat4)
Here we can clearly see that the optimal threshold lies between 0.3 and 0.4, so we will build the confusion matrix with 0.35 as the threshold and evaluate the performance of the model.
#Confusion matrix
confu_dat_new=table(test_new_dat$Attrition,pred_dat4>0.35)
print("Confusion matrix ")
[1] "Confusion matrix "
confu_dat_new
FALSE TRUE
No 279 29
Yes 22 37
#model performace
model_perform(confu_dat_new)
[1] "Model Performance:"
[1] "senstivity: 0.593220338983051"
[1] "specificity: 0.905844155844156"
[1] "Accuracy: 86 %"
We will now compare the performance of each model to select the best one.
#Making functions
sensti=function(x){
tp1=x[4]
totl_p=tp1+x[2]
sens=tp1/totl_p
sens
}
specif=function(y){
tn1=y[1]
totl_n=tn1+y[3]
spec=tn1/totl_n
spec
}
acc=function(z){
acc_z=sum(diag(z))/sum(z)
accur=round(acc_z*100)
accur
}
#sensitivity
sens_log=sensti(confus_dat1)
sens_log_sign=sensti(confus_dat2)
sens_DT=sensti(confu_dat3)
sens_DT_imp=sensti(confu_dat4)
sens_log_new=sensti(confu_dat_new)
#specificity
speci_log=specif(confus_dat1)
speci_log_sign=specif(confus_dat2)
speci_DT=specif(confu_dat3)
speci_DT_imp=specif(confu_dat4)
speci_log_new=specif(confu_dat_new)
#Accuracy
acc_log=acc(confus_dat1)
acc_log_sign=acc(confus_dat2)
acc_DT=acc(confu_dat3)
acc_DT_imp=acc(confu_dat4)
acc_log_new=acc(confu_dat_new)
#Performance comparison table
data.frame(list("model_name" = c("cart all variables","cart important variables","logistic all variables","logistic significant variables","Logistic with feature engineering"),
"Sensitivity" = c(sens_DT,sens_DT_imp,sens_log,sens_log_sign,sens_log_new),
"Specificity" = c(speci_DT,speci_DT_imp,speci_log,speci_log_sign,speci_log_new),
"Accuracy" = c(acc_DT,acc_DT_imp,acc_log,acc_log_sign,acc_log_new)))
                         model_name Sensitivity Specificity Accuracy
1                cart all variables   0.4406780   0.9448052       86
2          cart important variables   0.4406780   0.9448052       86
3            logistic all variables   0.7457627   0.8538961       84
4    logistic significant variables   0.6779661   0.8603896       83
5 Logistic with feature engineering   0.6271186   0.9058442       86
From the above statistics we conclude the following for this use case.
We consider two main factors for model selection:
1) By Occam's razor we should choose the less complex model (i.e. the one with only enough variables to capture the pattern), even if its accuracy is slightly lower than the others'.
2) The second factor depends on the business interest: whether sensitivity, specificity or overall accuracy matters most. In this case the company is more concerned with finding the employees who will churn, so we should favour higher sensitivity even at the cost of some specificity.
Considering these two points, the “logistic model with significant variables” satisfies both without a remarkable drop in accuracy, and hence is the best of all the models.
#plot for Decision Tree
plot(roc(test_dat$Attrition, pred_dat2), print.auc=TRUE,col="black")
Setting levels: control = No, case = Yes
Setting direction: controls < cases
#plot for Decision Tree with important variables
plot(roc(test_dat$Attrition, pred_dat3), print.auc = TRUE,col = "green", print.auc.y = .1, add = TRUE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
#plot for logistic regression with significant variables
plot(roc(test_dat$Attrition, pred_dat1), print.auc = TRUE,col = "blue", print.auc.y = .2, add = TRUE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
#plot for logistic regression with all variables
plot(roc(test_dat$Attrition, predict_dat), print.auc = TRUE,col = "red", print.auc.y = .3, add = TRUE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
#plot for logistic regression with significant variables and the new features
plot(roc(test_dat$Attrition, pred_dat4), print.auc = TRUE,col = "pink", print.auc.y = .4, add = TRUE)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
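As a closing sketch, the AUC values printed on the plot can also be collected into one table for a side-by-side comparison:
#Collecting the AUC of each model (pROC::auc on each roc object)
data.frame(model=c("logistic all","logistic significant","CART all",
"CART important","logistic + new features"),
AUC=c(as.numeric(auc(roc(test_dat$Attrition,predict_dat))),
as.numeric(auc(roc(test_dat$Attrition,pred_dat1))),
as.numeric(auc(roc(test_dat$Attrition,pred_dat2))),
as.numeric(auc(roc(test_dat$Attrition,pred_dat3))),
as.numeric(auc(roc(test_dat$Attrition,pred_dat4)))))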