## Data Mining Group Assignment
##Loading Required Libraries
## Switch to the original author's project directory only when it exists on
## this machine. On any other machine the script keeps the current working
## directory instead of aborting, so it can be run from the folder that holds
## the CSV file.
project_dir <- "C:/Users/acer/Documents/PGPBABI/Data Mining/GrpAssignment"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project directory not found; using current working directory: ",
    getwd()
  )
}
library(rpart)
library(rpart.plot)
library(neuralnet)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(ggplot2)
library(grid)
library(gridExtra)
library(corrplot)
## corrplot 0.84 loaded
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Reading Data into a DataFrame
## stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv()
## keeps text columns as character by default, but this script needs them as
## factors (see the recorded str() output below) because the neural-net
## preparation later converts every column with as.integer(), which would
## yield NA for character columns.
HRData <- read.csv(
  file = "HR_Employee_Attrition_Data.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
## View the data structure
str(HRData)
## 'data.frame': 2940 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
## Per-column summary of HRData; the console output recorded when the script
## was run is pasted in the comment lines that follow.
summary(HRData)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :2466 Non-Travel : 300 Min. : 102.0
## 1st Qu.:30.00 Yes: 474 Travel_Frequently: 554 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :2086 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 126 Min. : 1.000 Min. :1.000
## Research & Development:1922 1st Qu.: 2.000 1st Qu.:2.000
## Sales : 892 Median : 7.000 Median :3.000
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 54 Min. :1 Min. : 1.0
## Life Sciences :1212 1st Qu.:1 1st Qu.: 735.8
## Marketing : 318 Median :1 Median :1470.5
## Medical : 928 Mean :1 Mean :1470.5
## Other : 164 3rd Qu.:1 3rd Qu.:2205.2
## Technical Degree: 264 Max. :1 Max. :2940.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## Min. :1.000 Female:1176 Min. : 30.00 Min. :1.00
## 1st Qu.:2.000 Male :1764 1st Qu.: 48.00 1st Qu.:2.00
## Median :3.000 Median : 66.00 Median :3.00
## Mean :2.722 Mean : 65.89 Mean :2.73
## 3rd Qu.:4.000 3rd Qu.: 84.00 3rd Qu.:3.00
## Max. :4.000 Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction
## Min. :1.000 Sales Executive :652 Min. :1.000
## 1st Qu.:1.000 Research Scientist :584 1st Qu.:2.000
## Median :2.000 Laboratory Technician :518 Median :3.000
## Mean :2.064 Manufacturing Director :290 Mean :2.729
## 3rd Qu.:3.000 Healthcare Representative:262 3rd Qu.:4.000
## Max. :5.000 Manager :204 Max. :4.000
## (Other) :430
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked
## Divorced: 654 Min. : 1009 Min. : 2094 Min. :0.000
## Married :1346 1st Qu.: 2911 1st Qu.: 8045 1st Qu.:1.000
## Single : 940 Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8380 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## Over18 OverTime PercentSalaryHike PerformanceRating
## Y:2940 No :2108 Min. :11.00 Min. :3.000
## Yes: 832 1st Qu.:12.00 1st Qu.:3.000
## Median :14.00 Median :3.000
## Mean :15.21 Mean :3.154
## 3rd Qu.:18.00 3rd Qu.:3.000
## Max. :25.00 Max. :4.000
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.712 Mean :80 Mean :0.7939 Mean :11.28
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.799 Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
## Drop the attributes that are not useful for the analysis:
## EmployeeCount (col 9), EmployeeNumber (col 10), Over18 (col 22) and
## StandardHours (col 27). The remaining 31 columns keep their order.
HRData1 <- HRData[, setdiff(1:35, c(9, 10, 22, 27)), drop = FALSE]
## Split into development (train) and holdout (test) samples:
## one uniform draw per row, rows with a draw <= 0.70 go to train.
set.seed(200)
HRData$Random <- runif(nrow(HRData1))
HRData.Train <- HRData1[HRData$Random <= 0.70, ]
HRData.Test <- HRData1[HRData$Random > 0.70, ]
## Base data sets for the neural net: take the 31 modelling columns of each
## sample and replace every column by its integer representation (factor
## columns become their integer level codes).
HRDataTrain.NN <- HRData.Train[, 1:31]
HRDataTrain.NN[] <- lapply(HRDataTrain.NN, as.integer)
##corrplot(cor(HRDataTrain.cor), method = "circle",tl.cex = 0.6)
HRDataTest.NN <- HRData.Test[, 1:31]
HRDataTest.NN[] <- lapply(HRDataTest.NN, as.integer)
##Scaling of variables
## Standardise the training data, then apply the SAME centring and scaling
## constants to the test data so both samples share one feature scale.
## Scaling the test sample with its own mean/sd would map identical raw
## values to different network inputs in train and test.
## (An earlier per-column scaling of DailyRate, HourlyRate, MonthlyRate and
## MonthlyIncome was superseded by scaling the whole data set.)
HRDataTrain.NNScaled <- scale(HRDataTrain.NN)
HRDataTest.NNScaled <- scale(
  HRDataTest.NN,
  center = attr(HRDataTrain.NNScaled, "scaled:center"),
  scale = attr(HRDataTrain.NNScaled, "scaled:scale")
)
## NOTE(review): scale() also standardises the Attrition column, which is the
## response of the network fitted below with linear.output = FALSE. Training
## behaviour is deliberately left unchanged here; confirm whether the target
## should instead be kept as a 0/1 indicator.
## NOTE(review): the test AUC/KS values recorded further down were produced
## with the test sample scaled by its own mean/sd and will differ slightly.
## Neural network model
## Right-hand side of the formula: every column of the scaled training matrix
## except Attrition (column 2), joined with " + ".
cn <- paste(colnames(HRDataTrain.NNScaled)[c(1, 3:31)], collapse = " + ")
## Attrition is the response.
form <- as.formula(paste("Attrition ~", cn))
## Fit the network: 3 hidden units, non-linear output, no progress printing.
HRDataNN <- neuralnet(
  formula = form,
  data = HRDataTrain.NNScaled,
  hidden = 3,
  threshold = 0.01,
  stepmax = 7000,
  lifesign = "none",
  lifesign.step = 10,
  linear.output = FALSE
)
plot(HRDataNN)
## Store the fitted network output of the first repetition as the score of
## each training row.
HRDataTrain.NN$prob <- HRDataNN$net.result[[1]]
##Model Performance in Train Data
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
##
## Attaching package: 'ROCR'
## The following object is masked from 'package:neuralnet':
##
## prediction
## ROCR prediction object: network scores against the observed Attrition codes
TrainNNpred <- prediction(HRDataTrain.NN$prob, HRDataTrain.NN$Attrition)
## True-positive rate against false-positive rate (ROC curve coordinates)
TrainNNperf <- performance(TrainNNpred, measure = "tpr", x.measure = "fpr")
##plot(TrainNNperf)
## AUC for the training sample, unwrapped from the performance object
AUCTrainNN <- as.numeric(performance(TrainNNpred, measure = "auc")@y.values)
AUCTrainNN
## [1] 0.856680807
## KS for the training sample: largest gap between TPR and FPR
KSNNTrain <- max(TrainNNperf@y.values[[1]] - TrainNNperf@x.values[[1]])
KSNNTrain
## [1] 0.6937123699
## Score the test sample with the fitted network, using the same predictor
## columns as in training (all columns except Attrition, column 2).
HRDataNNResults <- compute(
  HRDataNN,
  HRDataTest.NNScaled[, c(1, 3:31)],
  rep = 1
)
## Attach the network output to the test rows
HRDataTest.NN$prob <- HRDataNNResults$net.result
## ROCR prediction object: network scores against the observed Attrition codes
TestNNpred <- prediction(HRDataTest.NN$prob, HRDataTest.NN$Attrition)
## True-positive rate against false-positive rate (ROC curve coordinates)
TestNNperf <- performance(TestNNpred, measure = "tpr", x.measure = "fpr")
##plot(TestNNperf)
## AUC for the test sample, unwrapped from the performance object
AUCTestNN <- as.numeric(performance(TestNNpred, measure = "auc")@y.values)
AUCTestNN
## [1] 0.8126785714
## KS for the test sample: largest gap between TPR and FPR
KSNNTest <- max(TestNNperf@y.values[[1]] - TestNNperf@x.values[[1]])
KSNNTest
## [1] 0.5788095238