## Data Mining Group Assignment

##Loading Required Libraries
setwd("C:/Users/acer/Documents/PGPBABI/Data Mining/GrpAssignment")
library(rpart)
library(rpart.plot)
library(neuralnet)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(ggplot2)
library(grid)
library(gridExtra)
library(corrplot)
## corrplot 0.84 loaded
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Reading Data into a DataFrame
## stringsAsFactors = TRUE is spelled out on purpose: since R 4.0.0 read.csv()
## defaults to character columns, and the later as.integer() conversion of the
## categorical variables only yields level codes when they are factors
## (on character columns it would silently produce NA).
HRData <- read.csv(
  file = "HR_Employee_Attrition_Data.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

## View the data structure
str(HRData)
## 'data.frame':    2940 obs. of  35 variables:
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : int  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ MonthlyRate             : int  19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : int  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : int  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : int  5 7 0 0 2 6 0 0 8 7 ...
## Print summary statistics for every column of HRData
summary(HRData)
##       Age        Attrition            BusinessTravel   DailyRate     
##  Min.   :18.00   No :2466   Non-Travel       : 300   Min.   : 102.0  
##  1st Qu.:30.00   Yes: 474   Travel_Frequently: 554   1st Qu.: 465.0  
##  Median :36.00              Travel_Rarely    :2086   Median : 802.0  
##  Mean   :36.92                                       Mean   : 802.5  
##  3rd Qu.:43.00                                       3rd Qu.:1157.0  
##  Max.   :60.00                                       Max.   :1499.0  
##                                                                      
##                   Department   DistanceFromHome   Education    
##  Human Resources       : 126   Min.   : 1.000   Min.   :1.000  
##  Research & Development:1922   1st Qu.: 2.000   1st Qu.:2.000  
##  Sales                 : 892   Median : 7.000   Median :3.000  
##                                Mean   : 9.193   Mean   :2.913  
##                                3rd Qu.:14.000   3rd Qu.:4.000  
##                                Max.   :29.000   Max.   :5.000  
##                                                                
##           EducationField EmployeeCount EmployeeNumber  
##  Human Resources :  54   Min.   :1     Min.   :   1.0  
##  Life Sciences   :1212   1st Qu.:1     1st Qu.: 735.8  
##  Marketing       : 318   Median :1     Median :1470.5  
##  Medical         : 928   Mean   :1     Mean   :1470.5  
##  Other           : 164   3rd Qu.:1     3rd Qu.:2205.2  
##  Technical Degree: 264   Max.   :1     Max.   :2940.0  
##                                                        
##  EnvironmentSatisfaction    Gender       HourlyRate     JobInvolvement
##  Min.   :1.000           Female:1176   Min.   : 30.00   Min.   :1.00  
##  1st Qu.:2.000           Male  :1764   1st Qu.: 48.00   1st Qu.:2.00  
##  Median :3.000                         Median : 66.00   Median :3.00  
##  Mean   :2.722                         Mean   : 65.89   Mean   :2.73  
##  3rd Qu.:4.000                         3rd Qu.: 84.00   3rd Qu.:3.00  
##  Max.   :4.000                         Max.   :100.00   Max.   :4.00  
##                                                                       
##     JobLevel                          JobRole    JobSatisfaction
##  Min.   :1.000   Sales Executive          :652   Min.   :1.000  
##  1st Qu.:1.000   Research Scientist       :584   1st Qu.:2.000  
##  Median :2.000   Laboratory Technician    :518   Median :3.000  
##  Mean   :2.064   Manufacturing Director   :290   Mean   :2.729  
##  3rd Qu.:3.000   Healthcare Representative:262   3rd Qu.:4.000  
##  Max.   :5.000   Manager                  :204   Max.   :4.000  
##                  (Other)                  :430                  
##   MaritalStatus  MonthlyIncome    MonthlyRate    NumCompaniesWorked
##  Divorced: 654   Min.   : 1009   Min.   : 2094   Min.   :0.000     
##  Married :1346   1st Qu.: 2911   1st Qu.: 8045   1st Qu.:1.000     
##  Single  : 940   Median : 4919   Median :14236   Median :2.000     
##                  Mean   : 6503   Mean   :14313   Mean   :2.693     
##                  3rd Qu.: 8380   3rd Qu.:20462   3rd Qu.:4.000     
##                  Max.   :19999   Max.   :26999   Max.   :9.000     
##                                                                    
##  Over18   OverTime   PercentSalaryHike PerformanceRating
##  Y:2940   No :2108   Min.   :11.00     Min.   :3.000    
##           Yes: 832   1st Qu.:12.00     1st Qu.:3.000    
##                      Median :14.00     Median :3.000    
##                      Mean   :15.21     Mean   :3.154    
##                      3rd Qu.:18.00     3rd Qu.:3.000    
##                      Max.   :25.00     Max.   :4.000    
##                                                         
##  RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
##  Min.   :1.000            Min.   :80    Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000            Median :80    Median :1.0000   Median :10.00    
##  Mean   :2.712            Mean   :80    Mean   :0.7939   Mean   :11.28    
##  3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:15.00    
##  Max.   :4.000            Max.   :80    Max.   :3.0000   Max.   :40.00    
##                                                                           
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.799         Mean   :2.761   Mean   : 7.008   Mean   : 4.229    
##  3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :6.000         Max.   :4.000   Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000      
## 
## EmployeeCount (col 9), EmployeeNumber (col 10), Over18 (col 22) and
## StandardHours (col 27) are constants or row identifiers, so they are not
## useful for analysis. Keep every other one of the 35 columns, in order.
HRData1 <- HRData[, setdiff(1:35, c(9, 10, 22, 27))]

## Separating Data into Dev & Holdout (Train & Test)
## Each row gets a uniform random draw (stored on HRData); draws <= 0.70 go to
## the training sample, the rest to the test sample. The seed fixes the split.
set.seed(200)
HRData$Random <- runif(n = nrow(HRData1), min = 0, max = 1)
HRData.Train <- HRData1[HRData$Random <= 0.70, ]
HRData.Test <- HRData1[HRData$Random > 0.70, ]

## Creating base data sets for the Neural Net (first 31 columns of each split)
HRDataTrain.NN <- HRData.Train[, 1:31]
HRDataTest.NN <- HRData.Test[, 1:31]

## Converting all factor variables to their integer level codes
## (Attrition "No"/"Yes" becomes 1/2). lapply() over the columns replaces the
## 1:ncol() index loops and keeps the data frames' names and row names.
HRDataTrain.NN[] <- lapply(HRDataTrain.NN, as.integer)
HRDataTest.NN[] <- lapply(HRDataTest.NN, as.integer)
##corrplot(cor(HRDataTrain.cor), method = "circle",tl.cex = 0.6)

## Scaling of variables
## Only the predictors are standardised. The target must stay on a 0/1 scale
## because the network is fitted with a logistic output (linear.output = FALSE),
## whose range is (0, 1); a z-scored target can never be matched by it.
NNPredictors <- setdiff(colnames(HRDataTrain.NN), "Attrition")
TrainPredScaled <- scale(HRDataTrain.NN[, NNPredictors])

## The test predictors are standardised with the TRAINING means and standard
## deviations, so both samples live on the same scale and no test-set
## statistics leak into the preprocessing.
TestPredScaled <- scale(
  HRDataTest.NN[, NNPredictors],
  center = attr(TrainPredScaled, "scaled:center"),
  scale = attr(TrainPredScaled, "scaled:scale")
)

## Re-attach the target as 0/1 (level code minus one) and restore the original
## column order, so Attrition is still column 2 of both matrices.
## NOTE: model metrics recorded further down in this file were produced with
## the earlier scaling and will change once this script is re-run.
HRDataTrain.NNScaled <- cbind(
  TrainPredScaled,
  Attrition = HRDataTrain.NN$Attrition - 1L
)[, colnames(HRDataTrain.NN)]
HRDataTest.NNScaled <- cbind(
  TestPredScaled,
  Attrition = HRDataTest.NN$Attrition - 1L
)[, colnames(HRDataTest.NN)]

## Creating Neural Network Model
## Pasting all predictor column names in required format. The target is
## excluded by name rather than by position, so the formula stays correct even
## if the column order changes.
cn <- paste(
  setdiff(colnames(HRDataTrain.NNScaled), "Attrition"),
  collapse = " + "
)
## Creating the formula: Attrition ~ <all other columns>
form <- as.formula(paste("Attrition", "~", cn))

## Creating Neural Net: one hidden layer with 3 neurons and a logistic
## (non-linear) output node; training stops when the error gradient falls
## below `threshold` or after `stepmax` steps.
HRDataNN <- neuralnet(
  formula = form,
  data = HRDataTrain.NNScaled,
  hidden = 3,
  lifesign = "none",
  linear.output = FALSE,
  lifesign.step = 10,
  threshold = 0.01,
  stepmax = 7000
)

## neuralnet() only warns when it fails to converge and then returns an object
## without fitted results; stop here with a clear message instead of letting
## plot() or the scoring steps fail obscurely later on.
if (is.null(HRDataNN$net.result)) {
  stop(
    "neuralnet did not converge within stepmax = 7000; ",
    "increase stepmax or relax threshold and re-run.",
    call. = FALSE
  )
}
plot(HRDataNN)

## Assigning Probabilities to Train Sample
## net.result holds one element per repetition; the first (only) one contains
## the fitted network outputs for the training rows.
HRDataTrain.NN$prob <- HRDataNN[["net.result"]][[1L]]



##Model Performance in Train Data
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## Attaching package: 'ROCR'
## The following object is masked from 'package:neuralnet':
## 
##     prediction
## ROC inputs for the training sample: network scores vs. actual Attrition codes.
## ROCR:: is spelled out because ROCR::prediction masks neuralnet::prediction.
TrainNNpred <- ROCR::prediction(
  predictions = HRDataTrain.NN$prob,
  labels = HRDataTrain.NN$Attrition
)
TrainNNperf <- ROCR::performance(
  TrainNNpred,
  measure = "tpr",
  x.measure = "fpr"
)
##plot(TrainNNperf)
##AUC for Train Data
AUCTrainNN <- as.numeric(
  ROCR::performance(TrainNNpred, measure = "auc")@y.values
)
AUCTrainNN
## [1] 0.856680807
##KS for Training Data
## KS = largest gap between true-positive and false-positive rate on the ROC curve
KSNNTrain <- max(TrainNNperf@y.values[[1]] - TrainNNperf@x.values[[1]])
KSNNTrain
## [1] 0.6937123699
## Compute the NN Output for Test Data
## Column 2 (Attrition, the target) is dropped so that only the 30 predictor
## columns are fed to the fitted network.
HRDataNNResults <- neuralnet::compute(
  x = HRDataNN,
  covariate = HRDataTest.NNScaled[, -2],
  rep = 1
)

## Assign Probabilities to Test Sample
HRDataTest.NN$prob <- HRDataNNResults[["net.result"]]

## Model Performance in Test Data
## ROCR:: is spelled out because ROCR::prediction masks neuralnet::prediction.
TestNNpred <- ROCR::prediction(
  predictions = HRDataTest.NN$prob,
  labels = HRDataTest.NN$Attrition
)
TestNNperf <- ROCR::performance(
  TestNNpred,
  measure = "tpr",
  x.measure = "fpr"
)
##plot(TestNNperf)
##AUC for Test Data
AUCTestNN <- as.numeric(
  ROCR::performance(TestNNpred, measure = "auc")@y.values
)
AUCTestNN
## [1] 0.8126785714
##KS for Test Data
## KS = largest gap between true-positive and false-positive rate on the ROC curve
KSNNTest <- max(TestNNperf@y.values[[1]] - TestNNperf@x.values[[1]])
KSNNTest
## [1] 0.5788095238