This analysis identifies the factors that influence Employee Attrition and compares several models to find the best fit.
setwd("~/Downloads")
dataset <- read.csv("HR_Employee_Attrition_Data-1.csv", header=TRUE)
#Exploratory Data Analysis:
str(dataset)
## 'data.frame': 2940 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
summary(dataset)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :2466 Non-Travel : 300 Min. : 102.0
## 1st Qu.:30.00 Yes: 474 Travel_Frequently: 554 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :2086 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 126 Min. : 1.000 Min. :1.000
## Research & Development:1922 1st Qu.: 2.000 1st Qu.:2.000
## Sales : 892 Median : 7.000 Median :3.000
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 54 Min. :1 Min. : 1.0
## Life Sciences :1212 1st Qu.:1 1st Qu.: 735.8
## Marketing : 318 Median :1 Median :1470.5
## Medical : 928 Mean :1 Mean :1470.5
## Other : 164 3rd Qu.:1 3rd Qu.:2205.2
## Technical Degree: 264 Max. :1 Max. :2940.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## Min. :1.000 Female:1176 Min. : 30.00 Min. :1.00
## 1st Qu.:2.000 Male :1764 1st Qu.: 48.00 1st Qu.:2.00
## Median :3.000 Median : 66.00 Median :3.00
## Mean :2.722 Mean : 65.89 Mean :2.73
## 3rd Qu.:4.000 3rd Qu.: 84.00 3rd Qu.:3.00
## Max. :4.000 Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction
## Min. :1.000 Sales Executive :652 Min. :1.000
## 1st Qu.:1.000 Research Scientist :584 1st Qu.:2.000
## Median :2.000 Laboratory Technician :518 Median :3.000
## Mean :2.064 Manufacturing Director :290 Mean :2.729
## 3rd Qu.:3.000 Healthcare Representative:262 3rd Qu.:4.000
## Max. :5.000 Manager :204 Max. :4.000
## (Other) :430
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked
## Divorced: 654 Min. : 1009 Min. : 2094 Min. :0.000
## Married :1346 1st Qu.: 2911 1st Qu.: 8045 1st Qu.:1.000
## Single : 940 Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8380 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## Over18 OverTime PercentSalaryHike PerformanceRating
## Y:2940 No :2108 Min. :11.00 Min. :3.000
## Yes: 832 1st Qu.:12.00 1st Qu.:3.000
## Median :14.00 Median :3.000
## Mean :15.21 Mean :3.154
## 3rd Qu.:18.00 3rd Qu.:3.000
## Max. :25.00 Max. :4.000
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.712 Mean :80 Mean :0.7939 Mean :11.28
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.799 Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
#Plots:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
g <- ggplot(dataset)
#Attrition:
g+geom_bar(aes(Attrition,fill=Attrition))

#Employees staying with the company outnumber those leaving by roughly five to one
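#A quick numeric check of that ratio (2466 No vs 474 Yes, i.e. roughly 16% attrition):
table(dataset$Attrition)
prop.table(table(dataset$Attrition))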
#YearsSinceLastPromotion vs Attrition
g+geom_bar(aes(YearsSinceLastPromotion,fill=Attrition))

#Employees promoted recently quit more than those who have gone longer without a promotion
#YearsWithCurrManager vs Attrition:
g+geom_bar(aes(YearsWithCurrManager,fill=Attrition))

#As the number of years with Current Manager increases, Attrition decreases
#TrainingTimesLastYear vs Attrition:
g+geom_bar(aes(TrainingTimesLastYear,fill=Attrition))

#Attrition is seen mostly in employees trained 2-4 times last year.
#YearsAtCompany vs Attrition
g+geom_point(aes(YearsAtCompany,Attrition,size=YearsAtCompany))

#Employees with fewer years at the company tend to quit more.
#TotalWorkingYears vs Attrition
g+geom_bar(aes(TotalWorkingYears,fill=Attrition))

#Employees with less total experience leave more often.
#PercentSalaryHike vs Attrition
g+(aes(PercentSalaryHike,Attrition))+geom_point(alpha=0.01)

#Employees with a lower percent salary hike leave the company more often.
#OverTime vs Attrition
g+geom_bar(aes(OverTime,fill=Gender,colour=Attrition))

#Male employees working overtime leave the company more
#WorkLifeBalance vs Attrition
g+geom_bar(aes(WorkLifeBalance,fill=Attrition))

#People with better work life balance may tend to quit more.
#Marital Status vs Attrition
g+geom_bar(aes(MaritalStatus,fill=Attrition))

#Attrition is highest among single employees, moderate among married employees, and lowest among divorced employees.
#JobRole vs Attrition
g+(aes(JobRole))+geom_bar(aes(fill=Attrition))

#Sales Representatives show the highest attrition among the job roles present.
#JobInvolvement vs Attrition
g+(aes(JobInvolvement))+geom_bar(aes(fill=Attrition))

#Employees who leave tend to report high job involvement
#JobSatisfaction vs Attrition:
g+(aes(JobSatisfaction))+geom_bar(aes(fill=Attrition))

#Low JobSatisfaction is associated with employees leaving the company.
#StockOptionLevels vs Attrition:
g+(aes(StockOptionLevel))+geom_bar(aes(fill=Attrition))

#Attrition is high among employees with no or few stock options
#Gender vs Attrition
g+geom_bar(aes(Attrition,fill=Gender))

#Attrition is higher among male employees
#DistanceFromHome vs Attrition
g+geom_histogram(binwidth=5,aes(DistanceFromHome,fill=Attrition))

#Employees living closer to the office leave more.
#Hourly, Daily & Monthly Rates vs Attrition:
g+geom_point(aes(DailyRate,Attrition),alpha = 0.05)

g+geom_point(aes(HourlyRate,Attrition),alpha = 0.05)

g+geom_point(aes(MonthlyRate,Attrition),alpha = 0.05)

#Education Field, Education vs Attrition
g+geom_bar(aes(Education,fill=Attrition))

g+geom_bar(aes(EducationField,fill=Attrition))

#Attrition seems to be higher among Bachelors, Life Sciences, and Medical
#Department vs Attrition
g+geom_bar(aes(Department,fill=Attrition))

#Employees from Research & Development and Sales tend to quit more than those from HR
#Business Travel vs Attrition
g+geom_bar(aes(BusinessTravel,fill=Attrition))

#Attrition increases with the frequency of business travel.
#Age vs Attrition
g+geom_histogram(binwidth = 10,aes(Age,fill=Attrition),colour="Black")

#Employees around 30 years of age leave the company the most
#Others:
g+geom_bar(aes(OverTime,fill=Attrition))+facet_grid(.~JobRole,scales = "free")

#High Attrition among Sales Representatives and Lab Technicians who work Overtime.
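#The plot readings above can be cross-checked numerically with within-group attrition
#rates; a sketch (the same pattern applies to any of the variables plotted):
prop.table(table(dataset$OverTime, dataset$Attrition), 1)
prop.table(table(dataset$JobRole, dataset$Attrition), 1)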
#Reordering the dataset:
dataset<-dataset[c(2,1,4,6,7,9,10,11,13,14,15,17,19,20,21,24:35,3,5,8,12,16,18,23)]
#Checking for Correlation between variables:
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
corrplot(cor(dataset[,2:27]))
## Warning in cor(dataset[, 2:27]): the standard deviation is zero

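#The zero-standard-deviation warning comes from constant columns (EmployeeCount and
#StandardHours). A sketch that lists them, plus any strongly correlated pairs (the
#0.7 cutoff is an arbitrary choice):
num_cols <- dataset[,2:27]
const_cols <- names(which(sapply(num_cols, sd) == 0))
const_cols
cor_mat <- cor(num_cols[, setdiff(names(num_cols), const_cols)])
which(abs(cor_mat) > 0.7 & upper.tri(cor_mat), arr.ind = TRUE)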
#Removing Unnecessary and correlated columns:
# 1.Over18 field can be calculated from the Age column:
dataset$Over18 <- NULL
# 2.EmployeeCount and EmployeeNumber fields do not carry any useful information:
dataset$EmployeeCount <-NULL
dataset$EmployeeNumber <-NULL
# 3.StandardHours is the same for every row:
dataset$StandardHours<-NULL
# 4.Dropping columns that are highly correlated with others (from the correlation plot):
dataset$YearsAtCompany <-NULL
dataset$YearsInCurrentRole<-NULL
dataset$YearsSinceLastPromotion<-NULL
dataset$MonthlyIncome<-NULL
dataset$PerformanceRating<-NULL
#Target Variable:
dataset$Attrition <- ifelse(dataset$Attrition =="No",0,1)
#Splitting the data into Training and Test sets:
library(caTools)
## Warning: package 'caTools' was built under R version 3.4.4
set.seed(123)
split=sample.split(dataset$Attrition,SplitRatio =0.7)
training_set=subset(dataset,split==TRUE)
test_set=subset(dataset,split==FALSE)
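#Sanity check: sample.split stratifies on the target, so both sets should keep the
#overall ~16% attrition rate:
prop.table(table(training_set$Attrition))
prop.table(table(test_set$Attrition))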
#Hypothesis: a significant F-statistic indicates the regression is significant, i.e. the dependent variable depends on the given independent variables in the model.
regressor <- lm(formula = Attrition ~ .,
                data = training_set)
summary(regressor)
##
## Call:
## lm(formula = Attrition ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58488 -0.20009 -0.07990 0.07289 1.28870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.598e-01 1.395e-01 4.012 6.25e-05
## Age -3.467e-03 1.102e-03 -3.146 0.001679
## DailyRate -2.918e-05 1.786e-05 -1.634 0.102343
## DistanceFromHome 3.273e-03 8.852e-04 3.698 0.000223
## Education 7.733e-03 7.182e-03 1.077 0.281720
## EnvironmentSatisfaction -4.001e-02 6.562e-03 -6.097 1.29e-09
## HourlyRate -3.801e-04 3.566e-04 -1.066 0.286560
## JobInvolvement -6.034e-02 1.014e-02 -5.950 3.16e-09
## JobLevel 7.615e-03 1.664e-02 0.458 0.647251
## JobSatisfaction -3.891e-02 6.552e-03 -5.938 3.39e-09
## MonthlyRate 4.155e-07 1.007e-06 0.413 0.679884
## NumCompaniesWorked 1.425e-02 3.151e-03 4.520 6.53e-06
## PercentSalaryHike -1.996e-03 1.974e-03 -1.011 0.312161
## RelationshipSatisfaction -1.790e-02 6.613e-03 -2.706 0.006866
## StockOptionLevel -2.172e-02 1.157e-02 -1.876 0.060740
## TotalWorkingYears -2.032e-03 1.926e-03 -1.055 0.291538
## TrainingTimesLastYear -1.286e-02 5.635e-03 -2.281 0.022630
## WorkLifeBalance -3.332e-02 1.011e-02 -3.296 0.000998
## YearsWithCurrManager -7.085e-03 2.406e-03 -2.945 0.003271
## BusinessTravelTravel_Frequently 1.767e-01 2.858e-02 6.181 7.70e-10
## BusinessTravelTravel_Rarely 8.101e-02 2.463e-02 3.289 0.001023
## DepartmentResearch & Development 1.453e-01 1.099e-01 1.322 0.186483
## DepartmentSales 1.287e-01 1.132e-01 1.137 0.255641
## EducationFieldLife Sciences -1.209e-01 7.544e-02 -1.602 0.109229
## EducationFieldMarketing -9.086e-02 7.962e-02 -1.141 0.253964
## EducationFieldMedical -1.451e-01 7.544e-02 -1.923 0.054583
## EducationFieldOther -1.348e-01 8.067e-02 -1.671 0.094852
## EducationFieldTechnical Degree -5.818e-02 7.776e-02 -0.748 0.454421
## GenderMale 4.707e-02 1.470e-02 3.202 0.001387
## JobRoleHuman Resources 2.810e-01 1.135e-01 2.476 0.013373
## JobRoleLaboratory Technician 1.586e-01 3.365e-02 4.712 2.62e-06
## JobRoleManager 7.797e-02 5.106e-02 1.527 0.126899
## JobRoleManufacturing Director 1.023e-02 3.293e-02 0.311 0.755999
## JobRoleResearch Director -1.799e-02 4.386e-02 -0.410 0.681801
## JobRoleResearch Scientist 5.601e-02 3.349e-02 1.672 0.094611
## JobRoleSales Executive 1.125e-01 6.554e-02 1.717 0.086136
## JobRoleSales Representative 2.835e-01 7.292e-02 3.888 0.000104
## MaritalStatusMarried 9.633e-03 1.925e-02 0.500 0.616917
## MaritalStatusSingle 9.623e-02 2.638e-02 3.647 0.000272
## OverTimeYes 2.011e-01 1.617e-02 12.439 < 2e-16
##
## (Intercept) ***
## Age **
## DailyRate
## DistanceFromHome ***
## Education
## EnvironmentSatisfaction ***
## HourlyRate
## JobInvolvement ***
## JobLevel
## JobSatisfaction ***
## MonthlyRate
## NumCompaniesWorked ***
## PercentSalaryHike
## RelationshipSatisfaction **
## StockOptionLevel .
## TotalWorkingYears
## TrainingTimesLastYear *
## WorkLifeBalance ***
## YearsWithCurrManager **
## BusinessTravelTravel_Frequently ***
## BusinessTravelTravel_Rarely **
## DepartmentResearch & Development
## DepartmentSales
## EducationFieldLife Sciences
## EducationFieldMarketing
## EducationFieldMedical .
## EducationFieldOther .
## EducationFieldTechnical Degree
## GenderMale **
## JobRoleHuman Resources *
## JobRoleLaboratory Technician ***
## JobRoleManager
## JobRoleManufacturing Director
## JobRoleResearch Director
## JobRoleResearch Scientist .
## JobRoleSales Executive .
## JobRoleSales Representative ***
## MaritalStatusMarried
## MaritalStatusSingle ***
## OverTimeYes ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3226 on 2018 degrees of freedom
## Multiple R-squared: 0.2459, Adjusted R-squared: 0.2314
## F-statistic: 16.87 on 39 and 2018 DF, p-value: < 2.2e-16
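#Since Attrition is a 0/1 outcome, the lm above is a linear probability model; a
#logistic regression is the more conventional choice for a binary target. A sketch
#for comparison (an aside, not used by the models below):
logit_reg <- glm(Attrition ~ ., family = binomial, data = training_set)
summary(logit_reg)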
#CART Model:
## Setting the control parameter inputs for rpart:
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
library(rattle)
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
ctrl = rpart.control(minsplit=100, minbucket = 10, cp = 0, xval = 10)
destree <- rpart(formula = training_set$Attrition ~ .,
                 data = training_set[,2:26], method = "class",
                 control = ctrl)
rpart.plot(destree)
## Warning: Bad 'data' field in model 'call'.
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

fancyRpartPlot(destree)
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018g.1.0/
## zoneinfo/Asia/Kolkata'

printcp(destree)
##
## Classification tree:
## rpart(formula = training_set$Attrition ~ ., data = training_set[,
## 2:26], method = "class", control = ctrl)
##
## Variables actually used in tree construction:
## [1] Age Department JobLevel JobRole
## [5] OverTime StockOptionLevel
##
## Root node error: 332/2058 = 0.16132
##
## n= 2058
##
## CP nsplit rel error xerror xstd
## 1 0.0361446 0 1.00000 1.00000 0.050261
## 2 0.0090361 3 0.89157 0.94578 0.049133
## 3 0.0045181 4 0.88253 1.02108 0.050685
## 4 0.0000000 6 0.87349 1.03313 0.050924
plotcp(destree)

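#The xerror column above is lowest at nsplit = 3 and rises afterwards, suggesting
#overfitting; a pruning sketch that keeps the cp with the smallest cross-validated error:
best_cp <- destree$cptable[which.min(destree$cptable[, "xerror"]), "CP"]
pruned <- prune(destree, cp = best_cp)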
#Predicting the test set results:
ypred1 <- predict(destree, test_set[,2:26], type="prob")
ypred1 <- as.data.frame(ypred1)
y_cm1 <- ifelse(ypred1$`1`>ypred1$`0`,1,0)
#Note: this keeps the probability of whichever class wins; for the probability
#averaging done in the ensemble later, ypred1$`1` (the probability of attrition)
#would be the more standard quantity.
ypred1 <- ifelse(ypred1$`1`>ypred1$`0`,ypred1$`1`,ypred1$`0`)
#Confusion Matrix:
cm1=table(test_set[,1],y_cm1)
cm1
## y_cm1
## 0 1
## 0 722 18
## 1 108 34
#Hypothesis:
#Splitting stops when the null hypothesis, independence between the input variables and the response variable, can no longer be rejected.
#Encoding Categorical variables as factors:
train_nn<-training_set
test_nn<-test_set
train_nn$BusinessTravel <- as.numeric(factor(train_nn$BusinessTravel,
    levels=c('Non-Travel','Travel_Rarely','Travel_Frequently'), labels=c(1,2,3)))
train_nn$EducationField <- as.numeric(factor(train_nn$EducationField,
    levels=c('Human Resources','Life Sciences','Medical','Marketing','Technical Degree','Other'),
    labels=c(1,2,3,4,5,6)))
train_nn$Department <- as.numeric(factor(train_nn$Department,
    levels=c('Sales','Research & Development','Human Resources'), labels=c(1,2,3)))
train_nn$Gender <- as.numeric(factor(train_nn$Gender,
    levels=c('Female','Male'), labels=c(1,2)))
train_nn$JobRole <- as.numeric(factor(train_nn$JobRole,
    levels=c('Healthcare Representative','Human Resources','Laboratory Technician','Manager','Manufacturing Director','Research Director','Research Scientist','Sales Executive','Sales Representative'),
    labels=c(1,2,3,4,5,6,7,8,9)))
train_nn$MaritalStatus <- as.numeric(factor(train_nn$MaritalStatus,
    levels=c('Married','Single','Divorced'), labels=c(1,2,3)))
train_nn$OverTime <- as.numeric(factor(train_nn$OverTime,
    levels=c('Yes','No'), labels=c(1,2)))
test_nn$BusinessTravel <- as.numeric(factor(test_nn$BusinessTravel,
    levels=c('Non-Travel','Travel_Rarely','Travel_Frequently'), labels=c(1,2,3)))
test_nn$EducationField <- as.numeric(factor(test_nn$EducationField,
    levels=c('Human Resources','Life Sciences','Medical','Marketing','Technical Degree','Other'),
    labels=c(1,2,3,4,5,6)))
test_nn$Department <- as.numeric(factor(test_nn$Department,
    levels=c('Sales','Research & Development','Human Resources'), labels=c(1,2,3)))
test_nn$Gender <- as.numeric(factor(test_nn$Gender,
    levels=c('Female','Male'), labels=c(1,2)))
test_nn$JobRole <- as.numeric(factor(test_nn$JobRole,
    levels=c('Healthcare Representative','Human Resources','Laboratory Technician','Manager','Manufacturing Director','Research Director','Research Scientist','Sales Executive','Sales Representative'),
    labels=c(1,2,3,4,5,6,7,8,9)))
test_nn$MaritalStatus <- as.numeric(factor(test_nn$MaritalStatus,
    levels=c('Married','Single','Divorced'), labels=c(1,2,3)))
test_nn$OverTime <- as.numeric(factor(test_nn$OverTime,
    levels=c('Yes','No'), labels=c(1,2)))
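#The fourteen conversions above can be written once; a more compact equivalent
#(a sketch on fresh copies, so it can run without double-encoding train_nn/test_nn):
lvls <- list(
  BusinessTravel = c('Non-Travel','Travel_Rarely','Travel_Frequently'),
  EducationField = c('Human Resources','Life Sciences','Medical','Marketing','Technical Degree','Other'),
  Department = c('Sales','Research & Development','Human Resources'),
  Gender = c('Female','Male'),
  JobRole = c('Healthcare Representative','Human Resources','Laboratory Technician','Manager','Manufacturing Director','Research Director','Research Scientist','Sales Executive','Sales Representative'),
  MaritalStatus = c('Married','Single','Divorced'),
  OverTime = c('Yes','No'))
train_nn_alt <- training_set
test_nn_alt <- test_set
for (col in names(lvls)) {
  train_nn_alt[[col]] <- as.numeric(factor(train_nn_alt[[col]], levels = lvls[[col]]))
  test_nn_alt[[col]] <- as.numeric(factor(test_nn_alt[[col]], levels = lvls[[col]]))
}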
#For xgboost:
train_x<-train_nn
test_x<-test_nn
#Feature Scaling:
train_nn[,2:26]=scale(train_nn[,2:26])
test_nn[,2:26]=scale(test_nn[,2:26])
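#Note: the test set above is scaled with its own means and SDs; reusing the training
#set's centers and scales is the more common convention. A sketch of that variant on
#the unscaled copies (not used downstream):
sc <- scale(train_x[,2:26])
test_scaled_alt <- scale(test_x[,2:26],
                         center = attr(sc, "scaled:center"),
                         scale = attr(sc, "scaled:scale"))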
#Neural Network 1:
library(h2o)
## Warning: package 'h2o' was built under R version 3.4.4
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc, ||
h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 12 minutes 9 seconds
## H2O cluster timezone: Asia/Kolkata
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.8
## H2O cluster version age: 3 months and 3 days
## H2O cluster name: H2O_started_from_R_guptadeepak_tag976
## H2O cluster total nodes: 1
## H2O cluster total memory: 4.00 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.0 (2017-04-21)
feature_names<-names(train_nn)[2:26]
y1="Attrition"
train_h2o<-as.h2o(train_nn)
test_h2o<-as.h2o(test_nn)
train_h2o[,y1]<-as.factor(as.h2o(train_nn[,y1]))
test_h2o[,y1]<-as.factor(as.h2o(test_nn[,y1]))
nn1 <- h2o.deeplearning(x = feature_names,
                        y = y1,
                        training_frame = train_h2o,
                        activation = 'Rectifier',
                        hidden = c(13,13),
                        epochs = 100,
                        ignore_const_cols = FALSE,
                        stopping_metric = "logloss",
                        train_samples_per_iteration = -2)
#Predicting the test set results:
prob_pred=h2o.predict(nn1,newdata = test_h2o)
prob_pred<-as.data.frame(prob_pred)
probs<-ifelse(prob_pred$p1>prob_pred$p0,prob_pred$p1,prob_pred$p0)
#Confusion Matrix:
cm3<-table(test_set[ ,1],prob_pred$predict)
cm3
##
## 0 1
## 0 677 63
## 1 30 112
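#A threshold-free check of the same model using H2O's built-in metrics (run before shutdown):
perf_nn1 <- h2o.performance(nn1, newdata = test_h2o)
h2o.auc(perf_nn1)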
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?
#Neural Network 2:
library(neuralnet)
nn2 <- neuralnet(formula = train_nn$Attrition ~ Age + BusinessTravel + DailyRate + Department +
                     DistanceFromHome + Education + EducationField + EnvironmentSatisfaction +
                     Gender + HourlyRate + JobInvolvement + JobLevel + JobRole + JobSatisfaction +
                     MaritalStatus + MonthlyRate + NumCompaniesWorked + OverTime +
                     PercentSalaryHike + RelationshipSatisfaction + StockOptionLevel + TotalWorkingYears +
                     TrainingTimesLastYear + WorkLifeBalance + YearsWithCurrManager,
                 data = train_nn[,2:26],
                 hidden = 2,
                 err.fct = "sse",
                 linear.output = FALSE,
                 lifesign = "full",
                 lifesign.step = 10,
                 threshold = 0.1,
                 stepmax = 2000)
## hidden: 2 thresh: 0.1 rep: 1/1 steps: 10 min thresh: 34.81853807
## 20 min thresh: 2.36829556
## 30 min thresh: 2.34135322
## 40 min thresh: 0.9307287171
## 50 min thresh: 0.5955038536
## 60 min thresh: 0.4018842495
## 70 min thresh: 0.4018842495
## 80 min thresh: 0.3649846558
## 90 min thresh: 0.3264478901
## 100 min thresh: 0.3264478901
## 110 min thresh: 0.2467378533
## 120 min thresh: 0.2130719935
## 130 min thresh: 0.2115118285
## 140 min thresh: 0.2005050281
## 150 min thresh: 0.2005050281
## 160 min thresh: 0.2005050281
## 170 min thresh: 0.1577864061
## 180 min thresh: 0.1577864061
## 190 min thresh: 0.1577864061
## 200 min thresh: 0.1577864061
## 210 min thresh: 0.1577864061
## 220 min thresh: 0.1274443571
## 230 min thresh: 0.1274443571
## 240 min thresh: 0.1274443571
## 250 min thresh: 0.1274443571
## 260 min thresh: 0.1274443571
## 270 min thresh: 0.1274443571
## 280 min thresh: 0.1274443571
## 290 min thresh: 0.1274443571
## 300 min thresh: 0.1274443571
## 310 min thresh: 0.1274443571
## 320 min thresh: 0.1274443571
## 330 min thresh: 0.1274443571
## 340 min thresh: 0.1274443571
## 350 min thresh: 0.1274443571
## 360 min thresh: 0.1274443571
## 370 min thresh: 0.1274443571
## 380 min thresh: 0.1274443571
## 390 min thresh: 0.1274443571
## 400 min thresh: 0.1274443571
## 410 min thresh: 0.1274443571
## 420 min thresh: 0.1274443571
## 430 min thresh: 0.1274443571
## 440 min thresh: 0.1241596874
## 450 min thresh: 0.117438324
## 460 min thresh: 0.117438324
## 470 min thresh: 0.1107674926
## 480 min thresh: 0.1107674926
## 490 min thresh: 0.1107674926
## 500 min thresh: 0.1107674926
## 510 min thresh: 0.1107674926
## 520 min thresh: 0.108985775
## 530 min thresh: 0.108985775
## 540 min thresh: 0.108985775
## 550 min thresh: 0.108985775
## 560 min thresh: 0.108985775
## 570 min thresh: 0.108985775
## 580 min thresh: 0.108985775
## 583 error: 84.43503 time: 0.38 secs
plot(nn2)
#Predicting test set results:
ypred3<- compute(nn2,test_nn[ ,2:26])$net.result
y_cm4<-ifelse(ypred3>0.5,1,0)
#Confusion Matrix:
cm4<-table(test_nn[ ,1],y_cm4)
cm4
## y_cm4
## 0 1
## 0 633 107
## 1 116 26
#Optimization: Random Forest, Averaging & XGBoost:
#1.Averaging: combining the Decision Tree and Neural Network 2:
ensembleavg<-(ypred1+ypred3)/2
test_set$ensembleavg<-ensembleavg
ypredavg<-(ifelse(test_set$ensembleavg>0.5,1,0))
#Confusion Matrix:
cm_avg<-table(test_set[ ,1],ypredavg)
cm_avg
## ypredavg
## 0 1
## 0 543 197
## 1 102 40
#2.Using xgboost:
library(lattice)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
library(readr)
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.4.4
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
classifier <- xgboost(data = as.matrix(train_x[,2:26]),
                      label = train_x$Attrition,
                      max_depth = 5,
                      objective = "binary:logistic",
                      eval_metric = "auc",
                      nrounds = 10)
## [1] train-auc:0.836221
## [2] train-auc:0.885220
## [3] train-auc:0.915821
## [4] train-auc:0.923968
## [5] train-auc:0.933943
## [6] train-auc:0.945349
## [7] train-auc:0.959170
## [8] train-auc:0.963899
## [9] train-auc:0.969419
## [10] train-auc:0.973251
#Predicting on test set:
y_xgb<-predict(classifier,as.matrix(test_x[ ,2:26]))
#Confusion Matrix:
y_cm5<-ifelse(y_xgb>0.5,1,0)
cm5<-table(test_x[ ,1],y_cm5)
cm5
## y_cm5
## 0 1
## 0 731 9
## 1 79 63
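#A quick look at which features drive the xgboost model:
imp <- xgb.importance(feature_names = colnames(train_x[,2:26]), model = classifier)
head(imp)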
#3.Random Forest:
#Finding the optimal mtry value:
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
tclassifier <- tuneRF(x = training_set[,2:26],
                      y = training_set$Attrition,
                      mtryStart = 3,
                      ntreeTry = 100,
                      stepFactor = 1.5,
                      improve = 0.0001,
                      trace = TRUE,
                      plot = TRUE,
                      doBest = TRUE,
                      nodesize = 10,
                      importance = TRUE)
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 3 OOB error = 0.06417948007
## Searching left ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 2 OOB error = 0.06800508688
## -0.05960794337 0.0001
## Searching right ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 4 OOB error = 0.06055661666
## 0.0564489366 0.0001
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 6 OOB error = 0.05935067342
## 0.01991431006 0.0001
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 9 OOB error = 0.05764071982
## 0.0288110228 0.0001
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 13 OOB error = 0.05674237415
## 0.01558526111 0.0001
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 19 OOB error = 0.0557034709
## 0.01830912548 0.0001
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 25 OOB error = 0.05655065081
## -0.01520874547 0.0001
## Warning in randomForest.default(x, y, mtry = res[which.min(res[, 2]), 1], :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
#tuneRF's lowest OOB error above is at mtry = 19, but the improvement beyond mtry = 13 is marginal, so mtry = 13 is used below:
set.seed(1234)
rantree = randomForest(x = training_set[,2:26],
                       y = training_set$Attrition,
                       ntree = 100,
                       nodesize = 10,
                       mtry = 13,
                       importance = TRUE)
## Warning in randomForest.default(x = training_set[, 2:26], y =
## training_set$Attrition, : The response has five or fewer unique values. Are
## you sure you want to do regression?
print(rantree)
##
## Call:
## randomForest(x = training_set[, 2:26], y = training_set$Attrition, ntree = 100, mtry = 13, nodesize = 10, importance = TRUE)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 0.05648144459
## % Var explained: 58.25
# Predicting the Test set results
y_pred2 = predict(rantree, newdata = test_set[,2:26],type="class")
y_cm2<-ifelse(y_pred2>0.5,1,0)
#Confusion Matrix:
cm2=table(test_set[,1],y_cm2)
cm2
## y_cm2
## 0 1
## 0 730 10
## 1 36 106
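#The repeated "five or fewer unique values" warnings appear because Attrition is the
#numeric 0/1 target, so randomForest fits a regression forest. A classification-forest
#sketch (an alternative, with class probabilities via type = "prob"):
ranclass <- randomForest(x = training_set[,2:26],
                         y = as.factor(training_set$Attrition),
                         ntree = 100, nodesize = 10, mtry = 13)
p_class <- predict(ranclass, newdata = test_set[,2:26], type = "prob")[,2]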
#Comparison of the CART, Neural Network and Ensemble Models:
#1.Accuracy from the Confusion Matrix: (Correct Predictions/Total Predictions)*100
#2.Classification Error from the Confusion Matrix: ((False Positives+False Negatives)/Total Predictions)*100
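#A small helper implementing these two measures for the confusion matrices above:
metrics <- function(cm) {
  acc <- sum(diag(cm))/sum(cm)*100
  c(Accuracy = acc, ClassificationError = 100 - acc)
}
sapply(list(CART = cm1, RandomForest = cm2, NN1 = cm3, NN2 = cm4, XGBoost = cm5, Ensemble = cm_avg), metrics)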
#3.Others:
#Gini:
library(ineq)
gini_destree=ineq(ypred1, type="Gini")
gini_rantree=ineq(y_pred2,type="Gini")
gini_nn=ineq(ypred3,type="Gini")
gini_en=ineq(ypredavg,type="Gini")
gini_nn1=ineq(probs,type="Gini")
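#Collecting the Gini values for a side-by-side view:
data.frame(model = c("CART","RandomForest","NeuralNet2","Ensemble","NeuralNet1"),
           gini = c(gini_destree, gini_rantree, gini_nn, gini_en, gini_nn1))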