# Load the HR attrition workbook: first sheet, first row used as column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
employee <- read.xlsx(
  "WA_Fn-UseC_-HR-Employee-Attrition.xlsx",
  sheetIndex = 1,
  header = TRUE
)
# Keep a second copy: `employee` is recoded to numeric further down, while
# `employee1` keeps the original factor columns for plotting and modelling.
employee1 <- employee
# Quick per-column summary of all the features
summary(employee)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 No :1233 Non-Travel : 150 Min. : 102.0
## 1st Qu.:30.00 Yes: 237 Travel_Frequently: 277 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :1043 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 63 Min. : 1.000 Min. :1.000
## Research & Development:961 1st Qu.: 2.000 1st Qu.:2.000
## Sales :446 Median : 7.000 Median :3.000
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 27 Min. :1 Min. : 1.0
## Life Sciences :606 1st Qu.:1 1st Qu.: 491.2
## Marketing :159 Median :1 Median :1020.5
## Medical :464 Mean :1 Mean :1024.9
## Other : 82 3rd Qu.:1 3rd Qu.:1555.8
## Technical Degree:132 Max. :1 Max. :2068.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## Min. :1.000 Female:588 Min. : 30.00 Min. :1.00
## 1st Qu.:2.000 Male :882 1st Qu.: 48.00 1st Qu.:2.00
## Median :3.000 Median : 66.00 Median :3.00
## Mean :2.722 Mean : 65.89 Mean :2.73
## 3rd Qu.:4.000 3rd Qu.: 83.75 3rd Qu.:3.00
## Max. :4.000 Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction
## Min. :1.000 Sales Executive :326 Min. :1.000
## 1st Qu.:1.000 Research Scientist :292 1st Qu.:2.000
## Median :2.000 Laboratory Technician :259 Median :3.000
## Mean :2.064 Manufacturing Director :145 Mean :2.729
## 3rd Qu.:3.000 Healthcare Representative:131 3rd Qu.:4.000
## Max. :5.000 Manager :102 Max. :4.000
## (Other) :215
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked
## Divorced:327 Min. : 1009 Min. : 2094 Min. :0.000
## Married :673 1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000
## Single :470 Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## Over18 OverTime PercentSalaryHike PerformanceRating
## Y:1470 No :1054 Min. :11.00 Min. :3.000
## Yes: 416 1st Qu.:12.00 1st Qu.:3.000
## Median :14.00 Median :3.000
## Mean :15.21 Mean :3.154
## 3rd Qu.:18.00 3rd Qu.:3.000
## Max. :25.00 Max. :4.000
##
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## Min. :1.000 Min. :80 Min. :0.0000 Min. : 0.00
## 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00
## Median :3.000 Median :80 Median :1.0000 Median :10.00
## Mean :2.712 Mean :80 Mean :0.7939 Mean :11.28
## 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00
## Max. :4.000 Max. :80 Max. :3.0000 Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.799 Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
# Inspect the storage type and the first few values of every column
str(employee)
## 'data.frame': 1470 obs. of 35 variables:
## $ Age : num 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : num 1102 279 1373 1392 591 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : num 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : num 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : num 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : num 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : num 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : num 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num 5 7 0 0 2 6 0 0 8 7 ...
# Part a) - Exploratory data plots
set.seed(1) # fix the RNG state for reproducibility
# Simplified parallel coordinate plot
# Replace every factor column with its integer level codes so the whole data
# frame is numeric. The column mask is computed once, and with vapply() so it
# is always a logical vector.
factor_cols <- vapply(employee, is.factor, logical(1))
employee[factor_cols] <- data.matrix(employee[factor_cols])
plotmd(
  employee,
  class = NULL,
  main = "Plot showing multivariate data for clusters as the parallel coordinates "
)
# Histogram of monthly income with a fitted normal curve overlaid.
# The column is always addressed by name; the original mixed
# `employee$MonthlyIncome` with the positional `employee[, 19]`, which would
# silently pick another variable if the column order changed.
histogram.curve <- hist(
  employee$MonthlyIncome,
  breaks = 10,
  col = "purple",
  xlab = "Monthly Income",
  main = "Histogram with Normal Curve"
)
# Normal density evaluated on 40 evenly spaced points across the observed range
xfit <- seq(
  min(employee$MonthlyIncome),
  max(employee$MonthlyIncome),
  length.out = 40
)
yfit <- dnorm(
  xfit,
  mean = mean(employee$MonthlyIncome),
  sd = sd(employee$MonthlyIncome)
)
# Rescale the density to the histogram's count axis: bin width times sample size
yfit <- yfit * diff(histogram.curve$mids[1:2]) * length(employee$MonthlyIncome)
lines(xfit, yfit, col = "black", lwd = 2)
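# The plot shows positively (right-) skewed data: the mean income (6503) exceeds the median (4919) and the long tail runs toward high incomes.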
# Scatter of salary hike against monthly income, split into a grid of panels:
# one row per Attrition value and one column per JobLevel.
# `employee1` is used because it still holds Attrition as a factor.
pl <- ggplot(employee1, aes(x = MonthlyIncome, y = PercentSalaryHike)) +
  geom_point(shape = 2) +
  labs(title = "Effect of Job Level(1-5), PercentSalaryHike and MonthlyIncome on Attrition(Y/N)")
pl + facet_grid(Attrition ~ JobLevel)
# as expected employees with low job level, less percent salary hike and low monthly income have the most attritions.
# Part b) - Detection of outliers
# Calculating Cook's distance to detect outliers
set.seed(1)
# Linear model of Attrition on every other column of the recoded data
mod <- lm(Attrition ~ ., data = employee)
cooksd <- cooks.distance(mod)
# Observations are flagged when their distance exceeds 5x the mean distance
cutoff <- 5 * mean(cooksd, na.rm = TRUE)
# Plotting Cook's distance. The three graphics calls run one after another for
# their side effects. They are deliberately not chained with a pipe: a pipe
# would hand each call's (useless) return value to the next call as a stray
# first argument.
plot(cooksd, pch = "*", cex = 2, main = "Outliers using Cooks Distance")
abline(h = cutoff, col = "black") # cut-off line
# Label only the flagged points, nudged one unit to the right of the marker
text(
  x = seq_along(cooksd) + 1,
  y = cooksd,
  labels = ifelse(cooksd > cutoff, names(cooksd), ""),
  col = "red"
)
# Row numbers with outliers: observations whose Cook's distance exceeds
# 5x the mean distance. which() keeps only comparisons that are TRUE, so an
# NA distance cannot put an NA into the row list.
cutoff <- 5 * mean(cooksd, na.rm = TRUE)
out.rows <- as.numeric(names(cooksd)[which(cooksd > cutoff)])
out.rows
## [1] 34 46 90 101 123 127 137 206 211 237 251 287 318 367
## [15] 416 437 441 454 470 483 496 505 569 596 608 609 637 661
## [29] 664 701 707 750 753 780 781 790 797 814 837 914 929 940
## [43] 967 1007 1008 1032 1034 1040 1086 1112 1163 1205 1223 1224 1247 1258
## [57] 1280 1291 1292 1299 1313 1334 1355 1391 1443 1445 1453
# Removing outlier rows as they create unwanted significant associations.
# The rows to keep are selected explicitly: `x[-out.rows, ]` returns zero rows
# when `out.rows` is empty, which would wipe both data frames.
employee <- employee[setdiff(seq_len(nrow(employee)), out.rows), ]
employee1 <- employee1[setdiff(seq_len(nrow(employee1)), out.rows), ]
# Part c) - Correlation/collinearity analysis
# Correlation
# EmployeeCount, Over18 and StandardHours hold one value for every employee,
# so their correlations would be NA. They are left out by name; these are the
# columns at positions 9, 22 and 27, and naming them keeps the step correct
# if the column order changes.
constant_cols <- c("EmployeeCount", "Over18", "StandardHours")
corr <- cor(employee[, setdiff(names(employee), constant_cols)])
corr
## Age Attrition BusinessTravel
## Age 1.000000e+00 -0.241235226 0.019636445
## Attrition -2.412352e-01 1.000000000 0.002098248
## BusinessTravel 1.963644e-02 0.002098248 1.000000000
## DailyRate 2.310476e-02 -0.058319995 -0.000815657
## Department -4.201805e-02 0.085492709 -0.007634083
## DistanceFromHome 6.266418e-03 0.068354557 -0.017803361
## Education 2.146926e-01 -0.038796966 0.002908125
## EducationField -4.480291e-02 0.029848172 0.015811896
## EmployeeNumber -9.702243e-03 -0.022634629 -0.016431969
## EnvironmentSatisfaction 1.469162e-02 -0.120241888 0.002453639
## Gender -4.122449e-02 0.039492323 -0.035947443
## HourlyRate 1.613261e-02 0.002572074 0.041196029
## JobInvolvement 3.871854e-02 -0.141030728 0.045739456
## JobLevel 5.239563e-01 -0.219497700 0.020296950
## JobRole -1.322736e-01 0.094286680 0.002831521
## JobSatisfaction -7.774664e-03 -0.114204328 -0.039164384
## MaritalStatus -1.064525e-01 0.208359011 0.023416102
## MonthlyIncome 5.088638e-01 -0.198923117 0.034144215
## MonthlyRate 3.016012e-02 0.014796272 -0.011028878
## NumCompaniesWorked 2.967921e-01 0.014202358 0.024705717
## OverTime 2.287736e-02 0.294476347 0.023679845
## PercentSalaryHike 9.864678e-04 -0.003293167 -0.038124864
## PerformanceRating 1.583229e-05 0.015586698 -0.030986915
## RelationshipSatisfaction 3.610487e-02 -0.051456831 -0.034317569
## StockOptionLevel 3.677617e-02 -0.186496581 -0.019275490
## TotalWorkingYears 6.880815e-01 -0.240099771 0.030888134
## TrainingTimesLastYear -1.941224e-02 -0.040992900 0.014250344
## WorkLifeBalance -2.427393e-02 -0.056972270 -0.023709286
## YearsAtCompany 3.050558e-01 -0.203731125 -0.028113293
## YearsInCurrentRole 2.107093e-01 -0.207953876 -0.023266247
## YearsSinceLastPromotion 2.082714e-01 -0.088742382 -0.040261666
## YearsWithCurrManager 2.037952e-01 -0.198888529 -0.029797694
## DailyRate Department DistanceFromHome
## Age 0.0231047610 -0.042018047 0.006266418
## Attrition -0.0583199950 0.085492709 0.068354557
## BusinessTravel -0.0008156570 -0.007634083 -0.017803361
## DailyRate 1.0000000000 0.002029361 -0.003525394
## Department 0.0020293615 1.000000000 0.024209259
## DistanceFromHome -0.0035253940 0.024209259 1.000000000
## Education -0.0189777008 0.016452257 0.022888165
## EducationField 0.0316413282 0.006454538 0.011384844
## EmployeeNumber -0.0549409525 -0.005473843 0.036025057
## EnvironmentSatisfaction 0.0291488963 -0.018813324 -0.015044993
## Gender -0.0190357932 -0.044979171 0.010758566
## HourlyRate 0.0223223491 -0.002801767 0.026240816
## JobInvolvement 0.0472827141 -0.021061850 0.003896336
## JobLevel -0.0008977702 0.088549690 0.008434338
## JobRole -0.0159384835 0.656711843 0.009654757
## JobSatisfaction 0.0347407676 0.014461880 0.003890915
## MaritalStatus -0.0804617141 0.051451243 -0.015707123
## MonthlyIncome 0.0061972952 0.042057830 -0.010942523
## MonthlyRate -0.0324882144 0.019866429 0.029712236
## NumCompaniesWorked 0.0372170524 -0.030122220 -0.019232157
## OverTime 0.0087315802 0.020033010 0.029637371
## PercentSalaryHike 0.0262917302 -0.009466719 0.033048239
## PerformanceRating 0.0031410418 -0.023319523 0.020795807
## RelationshipSatisfaction 0.0157215199 -0.023629422 0.008668048
## StockOptionLevel 0.0547204997 -0.015568682 0.044635608
## TotalWorkingYears 0.0200988352 -0.026826131 0.010349834
## TrainingTimesLastYear 0.0163429387 0.022975228 -0.030464697
## WorkLifeBalance -0.0481454260 0.029358048 -0.014495145
## YearsAtCompany -0.0250310357 0.018213796 0.019387322
## YearsInCurrentRole 0.0179730236 0.048258092 0.028096643
## YearsSinceLastPromotion -0.0215009506 0.035529507 0.014560296
## YearsWithCurrManager -0.0225905812 0.027870683 0.019781222
## Education EducationField EmployeeNumber
## Age 0.2146926081 -0.0448029097 -0.009702243
## Attrition -0.0387969657 0.0298481715 -0.022634629
## BusinessTravel 0.0029081248 0.0158118956 -0.016431969
## DailyRate -0.0189777008 0.0316413282 -0.054940953
## Department 0.0164522571 0.0064545383 -0.005473843
## DistanceFromHome 0.0228881650 0.0113848442 0.036025057
## Education 1.0000000000 -0.0336744731 0.045101638
## EducationField -0.0336744731 1.0000000000 -0.001623660
## EmployeeNumber 0.0451016375 -0.0016236599 1.000000000
## EnvironmentSatisfaction -0.0238703063 0.0386173604 0.014450765
## Gender -0.0144213401 -0.0075878311 0.020857824
## HourlyRate 0.0123159988 -0.0144524940 0.023628316
## JobInvolvement 0.0269079421 -0.0013451940 -0.009192887
## JobLevel 0.1023230535 -0.0506793463 -0.013019091
## JobRole -0.0008458784 0.0135761763 -0.002640955
## JobSatisfaction -0.0006136880 -0.0521276121 -0.046512690
## MaritalStatus 0.0053035230 0.0079194775 -0.010995947
## MonthlyIncome 0.0958874368 -0.0463107472 -0.008861990
## MonthlyRate -0.0306881444 -0.0225801068 0.019077097
## NumCompaniesWorked 0.1313135568 -0.0025932950 -0.007102560
## OverTime -0.0234470121 0.0105761273 -0.021695289
## PercentSalaryHike -0.0120362978 -0.0062307376 -0.005568529
## PerformanceRating -0.0229583541 0.0003541062 -0.016418022
## RelationshipSatisfaction -0.0151478121 0.0000697670 -0.067313962
## StockOptionLevel 0.0206305626 -0.0065788492 0.060263738
## TotalWorkingYears 0.1463242155 -0.0334772543 -0.010257302
## TrainingTimesLastYear -0.0248864201 0.0482077581 0.029429843
## WorkLifeBalance 0.0102399469 0.0414885831 0.014637932
## YearsAtCompany 0.0660656281 -0.0283056386 -0.010669690
## YearsInCurrentRole 0.0676415247 -0.0203814956 -0.007081814
## YearsSinceLastPromotion 0.0538988593 -0.0046341349 -0.004174551
## YearsWithCurrManager 0.0696125207 -0.0102769855 -0.011989551
## EnvironmentSatisfaction Gender
## Age 0.0146916164 -0.041224486
## Attrition -0.1202418880 0.039492323
## BusinessTravel 0.0024536391 -0.035947443
## DailyRate 0.0291488963 -0.019035793
## Department -0.0188133241 -0.044979171
## DistanceFromHome -0.0150449928 0.010758566
## Education -0.0238703063 -0.014421340
## EducationField 0.0386173604 -0.007587831
## EmployeeNumber 0.0144507652 0.020857824
## EnvironmentSatisfaction 1.0000000000 -0.001696106
## Gender -0.0016961060 1.000000000
## HourlyRate -0.0498142258 0.001655487
## JobInvolvement -0.0136146280 0.024109178
## JobLevel 0.0124521190 -0.035010291
## JobRole -0.0231222962 -0.044404301
## JobSatisfaction -0.0031327988 0.034341907
## MaritalStatus -0.0048023120 -0.045677815
## MonthlyIncome 0.0045789639 -0.027848933
## MonthlyRate 0.0415924990 -0.055934891
## NumCompaniesWorked 0.0213068457 -0.056162873
## OverTime 0.0703015173 -0.039197537
## PercentSalaryHike -0.0378422520 0.007508565
## PerformanceRating -0.0317644462 -0.011998275
## RelationshipSatisfaction 0.0064354574 0.021074097
## StockOptionLevel 0.0211301706 0.016718935
## TotalWorkingYears 0.0019383244 -0.043684637
## TrainingTimesLastYear -0.0256852358 -0.052455306
## WorkLifeBalance 0.0506216914 -0.003991358
## YearsAtCompany 0.0001709952 -0.015745729
## YearsInCurrentRole 0.0194891303 -0.027327397
## YearsSinceLastPromotion 0.0070097149 -0.019356602
## YearsWithCurrManager -0.0076318203 -0.017030727
## HourlyRate JobInvolvement JobLevel
## Age 0.0161326138 0.038718542 0.5239562560
## Attrition 0.0025720743 -0.141030728 -0.2194977004
## BusinessTravel 0.0411960286 0.045739456 0.0202969498
## DailyRate 0.0223223491 0.047282714 -0.0008977702
## Department -0.0028017665 -0.021061850 0.0885496898
## DistanceFromHome 0.0262408156 0.003896336 0.0084343378
## Education 0.0123159988 0.026907942 0.1023230535
## EducationField -0.0144524940 -0.001345194 -0.0506793463
## EmployeeNumber 0.0236283161 -0.009192887 -0.0130190913
## EnvironmentSatisfaction -0.0498142258 -0.013614628 0.0124521190
## Gender 0.0016554869 0.024109178 -0.0350102905
## HourlyRate 1.0000000000 0.035934307 -0.0358828684
## JobInvolvement 0.0359343073 1.000000000 -0.0017770047
## JobLevel -0.0358828684 -0.001777005 1.0000000000
## JobRole -0.0133158468 0.004715372 -0.0925911338
## JobSatisfaction -0.0620497153 -0.012133053 0.0001012888
## MaritalStatus -0.0234166488 -0.045352277 -0.0922862704
## MonthlyIncome -0.0197716482 -0.004188563 0.9504530481
## MonthlyRate -0.0075871031 -0.013528111 0.0305521165
## NumCompaniesWorked 0.0336919452 0.018504978 0.1488872115
## OverTime -0.0092643281 -0.010960646 0.0022840840
## PercentSalaryHike -0.0055634040 -0.025049993 -0.0266438988
## PerformanceRating 0.0027413009 -0.035838034 -0.0147970990
## RelationshipSatisfaction -0.0001790547 0.034570544 0.0118489146
## StockOptionLevel 0.0559621563 0.019529748 0.0199108184
## TotalWorkingYears -0.0069644058 0.004595564 0.7888010566
## TrainingTimesLastYear -0.0159184366 -0.020924551 -0.0131568752
## WorkLifeBalance -0.0039903119 -0.026534673 0.0374544986
## YearsAtCompany -0.0258502546 -0.005085375 0.5322274942
## YearsInCurrentRole -0.0191355168 0.015949201 0.3813053410
## YearsSinceLastPromotion -0.0225901106 -0.024658219 0.3555828341
## YearsWithCurrManager -0.0178939956 0.035292723 0.3747403236
## JobRole JobSatisfaction MaritalStatus
## Age -0.1322736333 -0.0077746644 -0.106452490
## Attrition 0.0942866802 -0.1142043276 0.208359011
## BusinessTravel 0.0028315211 -0.0391643840 0.023416102
## DailyRate -0.0159384835 0.0347407676 -0.080461714
## Department 0.6567118435 0.0144618800 0.051451243
## DistanceFromHome 0.0096547571 0.0038909155 -0.015707123
## Education -0.0008458784 -0.0006136880 0.005303523
## EducationField 0.0135761763 -0.0521276121 0.007919478
## EmployeeNumber -0.0026409545 -0.0465126897 -0.010995947
## EnvironmentSatisfaction -0.0231222962 -0.0031327988 -0.004802312
## Gender -0.0444043008 0.0343419072 -0.045677815
## HourlyRate -0.0133158468 -0.0620497153 -0.023416649
## JobInvolvement 0.0047153716 -0.0121330533 -0.045352277
## JobLevel -0.0925911338 0.0001012888 -0.092286270
## JobRole 1.0000000000 0.0167890512 0.067907097
## JobSatisfaction 0.0167890512 1.0000000000 0.027825252
## MaritalStatus 0.0679070967 0.0278252519 1.000000000
## MonthlyIncome -0.0992000792 -0.0058601375 -0.089279698
## MonthlyRate 0.0001118104 -0.0042075978 0.027429678
## NumCompaniesWorked -0.0568287829 -0.0524370651 -0.021433148
## OverTime 0.0465877563 0.0311789723 -0.012049470
## PercentSalaryHike -0.0028575848 0.0309858821 0.005476102
## PerformanceRating -0.0207639261 0.0097129903 0.004796253
## RelationshipSatisfaction -0.0166132878 -0.0155536785 0.029975223
## StockOptionLevel -0.0259231089 0.0006706405 -0.672821557
## TotalWorkingYears -0.1507431251 -0.0201923394 -0.087269456
## TrainingTimesLastYear -0.0122381451 -0.0096470558 0.007328126
## WorkLifeBalance 0.0318996304 -0.0286460096 0.011755395
## YearsAtCompany -0.0785332701 -0.0090430452 -0.074421767
## YearsInCurrentRole -0.0248936788 -0.0013594036 -0.081654023
## YearsSinceLastPromotion -0.0502783241 -0.0261099023 -0.042581170
## YearsWithCurrManager -0.0405299009 -0.0219930738 -0.055524457
## MonthlyIncome MonthlyRate NumCompaniesWorked
## Age 0.508863763 0.0301601239 0.296792079
## Attrition -0.198923117 0.0147962718 0.014202358
## BusinessTravel 0.034144215 -0.0110288776 0.024705717
## DailyRate 0.006197295 -0.0324882144 0.037217052
## Department 0.042057830 0.0198664291 -0.030122220
## DistanceFromHome -0.010942523 0.0297122359 -0.019232157
## Education 0.095887437 -0.0306881444 0.131313557
## EducationField -0.046310747 -0.0225801068 -0.002593295
## EmployeeNumber -0.008861990 0.0190770967 -0.007102560
## EnvironmentSatisfaction 0.004578964 0.0415924990 0.021306846
## Gender -0.027848933 -0.0559348910 -0.056162873
## HourlyRate -0.019771648 -0.0075871031 0.033691945
## JobInvolvement -0.004188563 -0.0135281109 0.018504978
## JobLevel 0.950453048 0.0305521165 0.148887211
## JobRole -0.099200079 0.0001118104 -0.056828783
## JobSatisfaction -0.005860137 -0.0042075978 -0.052437065
## MaritalStatus -0.089279698 0.0274296775 -0.021433148
## MonthlyIncome 1.000000000 0.0260869043 0.153465709
## MonthlyRate 0.026086904 1.0000000000 0.014641404
## NumCompaniesWorked 0.153465709 0.0146414037 1.000000000
## OverTime 0.003715759 0.0187146490 -0.034688388
## PercentSalaryHike -0.020461847 -0.0125556313 -0.005704167
## PerformanceRating -0.010667695 -0.0232817107 -0.016408194
## RelationshipSatisfaction 0.014833942 -0.0051338443 0.045151771
## StockOptionLevel 0.011418110 -0.0287286714 0.007190490
## TotalWorkingYears 0.778051117 0.0222044642 0.240701974
## TrainingTimesLastYear -0.015742666 0.0014956357 -0.063375104
## WorkLifeBalance 0.026783146 0.0059768742 -0.016092762
## YearsAtCompany 0.509622233 -0.0242591182 -0.128561053
## YearsInCurrentRole 0.352950217 -0.0136375516 -0.093938392
## YearsSinceLastPromotion 0.344207352 0.0030161680 -0.044193769
## YearsWithCurrManager 0.342195690 -0.0305885965 -0.111025318
## OverTime PercentSalaryHike PerformanceRating
## Age 0.022877358 0.0009864678 1.583229e-05
## Attrition 0.294476347 -0.0032931669 1.558670e-02
## BusinessTravel 0.023679845 -0.0381248643 -3.098692e-02
## DailyRate 0.008731580 0.0262917302 3.141042e-03
## Department 0.020033010 -0.0094667189 -2.331952e-02
## DistanceFromHome 0.029637371 0.0330482392 2.079581e-02
## Education -0.023447012 -0.0120362978 -2.295835e-02
## EducationField 0.010576127 -0.0062307376 3.541062e-04
## EmployeeNumber -0.021695289 -0.0055685294 -1.641802e-02
## EnvironmentSatisfaction 0.070301517 -0.0378422520 -3.176445e-02
## Gender -0.039197537 0.0075085645 -1.199827e-02
## HourlyRate -0.009264328 -0.0055634040 2.741301e-03
## JobInvolvement -0.010960646 -0.0250499926 -3.583803e-02
## JobLevel 0.002284084 -0.0266438988 -1.479710e-02
## JobRole 0.046587756 -0.0028575848 -2.076393e-02
## JobSatisfaction 0.031178972 0.0309858821 9.712990e-03
## MaritalStatus -0.012049470 0.0054761017 4.796253e-03
## MonthlyIncome 0.003715759 -0.0204618467 -1.066770e-02
## MonthlyRate 0.018714649 -0.0125556313 -2.328171e-02
## NumCompaniesWorked -0.034688388 -0.0057041671 -1.640819e-02
## OverTime 1.000000000 -0.0061237227 5.055235e-03
## PercentSalaryHike -0.006123723 1.0000000000 7.752059e-01
## PerformanceRating 0.005055235 0.7752058871 1.000000e+00
## RelationshipSatisfaction 0.048662479 -0.0401537684 -3.397377e-02
## StockOptionLevel -0.011625668 0.0138140123 3.415102e-03
## TotalWorkingYears 0.007939681 -0.0129033951 9.577344e-03
## TrainingTimesLastYear -0.073015320 -0.0113671248 -1.566635e-02
## WorkLifeBalance -0.031521995 -0.0087864625 -2.909185e-03
## YearsAtCompany -0.007088169 -0.0261976925 1.387656e-02
## YearsInCurrentRole -0.028518772 0.0063355128 4.155506e-02
## YearsSinceLastPromotion -0.010481661 -0.0064953485 3.403031e-02
## YearsWithCurrManager -0.042310383 -0.0092123602 2.500311e-02
## RelationshipSatisfaction StockOptionLevel
## Age 0.0361048738 0.0367761664
## Attrition -0.0514568313 -0.1864965809
## BusinessTravel -0.0343175689 -0.0192754897
## DailyRate 0.0157215199 0.0547204997
## Department -0.0236294221 -0.0155686822
## DistanceFromHome 0.0086680478 0.0446356078
## Education -0.0151478121 0.0206305626
## EducationField 0.0000697670 -0.0065788492
## EmployeeNumber -0.0673139618 0.0602637376
## EnvironmentSatisfaction 0.0064354574 0.0211301706
## Gender 0.0210740975 0.0167189350
## HourlyRate -0.0001790547 0.0559621563
## JobInvolvement 0.0345705438 0.0195297484
## JobLevel 0.0118489146 0.0199108184
## JobRole -0.0166132878 -0.0259231089
## JobSatisfaction -0.0155536785 0.0006706405
## MaritalStatus 0.0299752234 -0.6728215570
## MonthlyIncome 0.0148339422 0.0114181100
## MonthlyRate -0.0051338443 -0.0287286714
## NumCompaniesWorked 0.0451517708 0.0071904904
## OverTime 0.0486624787 -0.0116256675
## PercentSalaryHike -0.0401537684 0.0138140123
## PerformanceRating -0.0339737731 0.0034151022
## RelationshipSatisfaction 1.0000000000 -0.0479248925
## StockOptionLevel -0.0479248925 1.0000000000
## TotalWorkingYears 0.0117373620 0.0146367497
## TrainingTimesLastYear 0.0061291010 0.0086376356
## WorkLifeBalance 0.0237329952 -0.0107843273
## YearsAtCompany 0.0110440046 0.0243968665
## YearsInCurrentRole -0.0219658801 0.0665099557
## YearsSinceLastPromotion 0.0278727903 0.0271476406
## YearsWithCurrManager 0.0035068027 0.0337954615
## TotalWorkingYears TrainingTimesLastYear
## Age 0.6880815297 -0.019412238
## Attrition -0.2400997709 -0.040992900
## BusinessTravel 0.0308881340 0.014250344
## DailyRate 0.0200988352 0.016342939
## Department -0.0268261305 0.022975228
## DistanceFromHome 0.0103498343 -0.030464697
## Education 0.1463242155 -0.024886420
## EducationField -0.0334772543 0.048207758
## EmployeeNumber -0.0102573015 0.029429843
## EnvironmentSatisfaction 0.0019383244 -0.025685236
## Gender -0.0436846367 -0.052455306
## HourlyRate -0.0069644058 -0.015918437
## JobInvolvement 0.0045955642 -0.020924551
## JobLevel 0.7888010566 -0.013156875
## JobRole -0.1507431251 -0.012238145
## JobSatisfaction -0.0201923394 -0.009647056
## MaritalStatus -0.0872694565 0.007328126
## MonthlyIncome 0.7780511171 -0.015742666
## MonthlyRate 0.0222044642 0.001495636
## NumCompaniesWorked 0.2407019744 -0.063375104
## OverTime 0.0079396814 -0.073015320
## PercentSalaryHike -0.0129033951 -0.011367125
## PerformanceRating 0.0095773439 -0.015666347
## RelationshipSatisfaction 0.0117373620 0.006129101
## StockOptionLevel 0.0146367497 0.008637636
## TotalWorkingYears 1.0000000000 -0.029071802
## TrainingTimesLastYear -0.0290718016 1.000000000
## WorkLifeBalance -0.0002674499 0.023441833
## YearsAtCompany 0.6163595405 0.008867461
## YearsInCurrentRole 0.4540586911 0.003822341
## YearsSinceLastPromotion 0.3970743736 0.007722920
## YearsWithCurrManager 0.4601606175 0.002206372
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Age -0.0242739281 0.3050557627 0.210709268
## Attrition -0.0569722697 -0.2037311255 -0.207953876
## BusinessTravel -0.0237092855 -0.0281132930 -0.023266247
## DailyRate -0.0481454260 -0.0250310357 0.017973024
## Department 0.0293580481 0.0182137956 0.048258092
## DistanceFromHome -0.0144951446 0.0193873224 0.028096643
## Education 0.0102399469 0.0660656281 0.067641525
## EducationField 0.0414885831 -0.0283056386 -0.020381496
## EmployeeNumber 0.0146379323 -0.0106696896 -0.007081814
## EnvironmentSatisfaction 0.0506216914 0.0001709952 0.019489130
## Gender -0.0039913579 -0.0157457294 -0.027327397
## HourlyRate -0.0039903119 -0.0258502546 -0.019135517
## JobInvolvement -0.0265346733 -0.0050853745 0.015949201
## JobLevel 0.0374544986 0.5322274942 0.381305341
## JobRole 0.0318996304 -0.0785332701 -0.024893679
## JobSatisfaction -0.0286460096 -0.0090430452 -0.001359404
## MaritalStatus 0.0117553950 -0.0744217666 -0.081654023
## MonthlyIncome 0.0267831462 0.5096222334 0.352950217
## MonthlyRate 0.0059768742 -0.0242591182 -0.013637552
## NumCompaniesWorked -0.0160927618 -0.1285610532 -0.093938392
## OverTime -0.0315219945 -0.0070881690 -0.028518772
## PercentSalaryHike -0.0087864625 -0.0261976925 0.006335513
## PerformanceRating -0.0029091848 0.0138765650 0.041555057
## RelationshipSatisfaction 0.0237329952 0.0110440046 -0.021965880
## StockOptionLevel -0.0107843273 0.0243968665 0.066509956
## TotalWorkingYears -0.0002674499 0.6163595405 0.454058691
## TrainingTimesLastYear 0.0234418326 0.0088674608 0.003822341
## WorkLifeBalance 1.0000000000 0.0178191280 0.046796548
## YearsAtCompany 0.0178191280 1.0000000000 0.760942474
## YearsInCurrentRole 0.0467965479 0.7609424740 1.000000000
## YearsSinceLastPromotion 0.0199432436 0.6131197286 0.538446283
## YearsWithCurrManager 0.0097478105 0.7803833444 0.717532190
## YearsSinceLastPromotion YearsWithCurrManager
## Age 0.208271410 0.203795174
## Attrition -0.088742382 -0.198888529
## BusinessTravel -0.040261666 -0.029797694
## DailyRate -0.021500951 -0.022590581
## Department 0.035529507 0.027870683
## DistanceFromHome 0.014560296 0.019781222
## Education 0.053898859 0.069612521
## EducationField -0.004634135 -0.010276986
## EmployeeNumber -0.004174551 -0.011989551
## EnvironmentSatisfaction 0.007009715 -0.007631820
## Gender -0.019356602 -0.017030727
## HourlyRate -0.022590111 -0.017893996
## JobInvolvement -0.024658219 0.035292723
## JobLevel 0.355582834 0.374740324
## JobRole -0.050278324 -0.040529901
## JobSatisfaction -0.026109902 -0.021993074
## MaritalStatus -0.042581170 -0.055524457
## MonthlyIncome 0.344207352 0.342195690
## MonthlyRate 0.003016168 -0.030588596
## NumCompaniesWorked -0.044193769 -0.111025318
## OverTime -0.010481661 -0.042310383
## PercentSalaryHike -0.006495348 -0.009212360
## PerformanceRating 0.034030310 0.025003111
## RelationshipSatisfaction 0.027872790 0.003506803
## StockOptionLevel 0.027147641 0.033795462
## TotalWorkingYears 0.397074374 0.460160618
## TrainingTimesLastYear 0.007722920 0.002206372
## WorkLifeBalance 0.019943244 0.009747810
## YearsAtCompany 0.613119729 0.780383344
## YearsInCurrentRole 0.538446283 0.717532190
## YearsSinceLastPromotion 1.000000000 0.507459792
## YearsWithCurrManager 0.507459792 1.000000000
# Collinearity: draw the correlation matrix as a corrgram with the variables
# reordered, shaded cells below the diagonal and pie glyphs above it.
corrgram(
  corr,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie
)
# The plot shows that there are a lot of irrelevant features, so they need to be removed before building the classification models
# Removing features that hold the same value in every row
# (EmployeeCount is always 1, Over18 always "Y", StandardHours always 80).
# Dropping by name instead of by position c(9, 22, 27) makes the step safe to
# re-run: a second positional drop would remove three unrelated columns.
constant_cols <- c("EmployeeCount", "Over18", "StandardHours")
employee1 <- employee1[setdiff(names(employee1), constant_cols)]
# Separate copy that receives the artificial missing values below
employee2 <- employee1
# Part a) - Imputation of missing values
# Count missing cells across the whole data frame before any NAs are injected
sum(is.na(employee2))
## [1] 0
# No NAs in the dataset
# Creating 117 random NAs
set.seed(1)
n_missing <- 117
# Draw one random (row, col) position per NA to inject.
# NOTE(review): rows and columns are sampled independently with replacement,
# so the same cell can in principle be drawn twice (giving fewer than
# n_missing NAs), and the response column can be hit too. The count check at
# the end of this block confirms how many cells were actually blanked.
y <- data.frame(
  row = sample(nrow(employee2), size = n_missing, replace = TRUE),
  col = sample(ncol(employee2), size = n_missing, replace = TRUE)
)
# A two-column (row, col) matrix index addresses individual cells
employee2[as.matrix(y)] <- NA
sum(is.na(employee2)) # verifying
## [1] 117
# Tabulate the missing-data patterns in the dataset: one row per distinct
# pattern (1 = observed, 0 = missing), with per-column NA counts in the last row
md.pattern(employee2)
## Age PercentSalaryHike TotalWorkingYears Attrition Department
## 1289 1 1 1 1 1
## 2 1 1 1 0 1
## 3 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 0
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 6 1 1 1 1 1
## 4 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 10 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 6 1 1 1 1 1
## 1 1 0 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 0 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 6 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 0 1 1 2 2
## EducationField HourlyRate JobLevel JobSatisfaction PerformanceRating
## 1289 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 2 0 1 1 1 1
## 6 1 1 1 1 1
## 4 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 0 1 1 1
## 3 1 1 1 1 1
## 2 1 1 0 1 1
## 5 1 1 1 1 1
## 2 1 1 1 0 1
## 10 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 6 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 0
## 5 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 6 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 0 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 2 2 2 2
## YearsWithCurrManager BusinessTravel JobInvolvement MonthlyIncome
## 1289 1 1 1 1
## 2 1 1 1 1
## 3 1 0 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 6 1 1 1 1
## 4 1 1 1 1
## 5 1 1 1 1
## 1 1 1 1 1
## 3 1 1 0 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 10 1 1 1 1
## 2 1 1 1 0
## 2 1 1 1 1
## 5 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 3 1 1 1 1
## 6 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 0 1 1 1
## 1 1 1 1 1
## 1 1 1 1 0
## 1 1 1 1 1
## 2 3 3 3
## MonthlyRate TrainingTimesLastYear WorkLifeBalance YearsInCurrentRole
## 1289 1 1 1 1
## 2 1 1 1 1
## 3 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 6 1 1 1 1
## 4 1 1 1 1
## 5 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 10 1 1 1 1
## 2 1 1 1 1
## 2 0 1 1 1
## 5 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 3 1 0 1 1
## 3 1 1 0 1
## 6 1 1 1 1
## 2 1 1 1 0
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 1 1 1
## 1 1 1 1 0
## 3 3 3 3
## EnvironmentSatisfaction StockOptionLevel DailyRate DistanceFromHome
## 1289 1 1 1 1
## 2 1 1 1 1
## 3 1 1 1 1
## 5 1 1 0 1
## 2 1 1 1 1
## 5 1 1 1 0
## 5 1 1 1 1
## 2 1 1 1 1
## 6 1 1 1 1
## 4 0 1 1 1
## 5 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 10 1 1 1 1
## 2 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 4 1 0 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 3 1 1 1 1
## 6 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 4 4 5 5
## Education JobRole NumCompaniesWorked RelationshipSatisfaction
## 1289 1 1 1 1
## 2 1 1 1 1
## 3 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 5 0 1 1 1
## 2 1 1 1 1
## 6 1 1 1 1
## 4 1 1 1 1
## 5 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 2 1 1 1 1
## 5 1 0 1 1
## 2 1 1 1 1
## 10 1 1 1 1
## 2 1 1 1 1
## 2 1 1 1 1
## 5 1 1 0 1
## 6 1 1 1 1
## 1 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 0
## 4 1 1 1 1
## 1 1 1 1 1
## 3 1 1 1 1
## 3 1 1 1 1
## 6 1 1 1 1
## 2 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 5 5 5 5
## YearsSinceLastPromotion Gender OverTime YearsAtCompany EmployeeNumber
## 1289 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 6 1 1 1 1 0
## 4 1 1 1 1 1
## 5 1 0 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 10 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 6 1 1 0 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 6 1 1 1 0 1
## 2 1 1 1 1 1
## 5 0 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 0
## 1 1 1 1 1 1
## 1 1 0 1 1 1
## 5 6 6 6 7
## MaritalStatus
## 1289 1 0
## 2 1 1
## 3 1 1
## 5 1 1
## 2 1 1
## 5 1 1
## 5 1 1
## 2 1 1
## 6 1 1
## 4 1 1
## 5 1 1
## 1 1 1
## 3 1 1
## 2 1 1
## 5 1 1
## 2 1 1
## 10 0 1
## 2 1 1
## 2 1 1
## 5 1 1
## 6 1 1
## 1 1 1
## 2 1 1
## 5 1 1
## 4 1 1
## 1 1 1
## 3 1 1
## 3 1 1
## 6 1 1
## 2 1 1
## 5 1 1
## 2 1 1
## 1 1 2
## 1 1 2
## 1 1 2
## 10 117
# Verifying for NAs once again
# NOTE(review): no imputation call appears between md.pattern() and this check
# in this file, yet the recorded result is 0 - presumably the imputation step
# was omitted from the listing; confirm before relying on employee2.
sum(is.na(employee2))
## [1] 0
# Note: See section named 'other' for comparing accuracies with imputed dataset vs original dataset at the bottom of the code/page.
# Part b) and c) - Dummy codes and normalization/standardization of features
# Numeric codes for all the factor predictors; the response, Attrition
# (column 2), is left as a factor.
# NOTE: data.matrix() replaces each factor with its integer level codes
# (label encoding); it does not create one-hot dummy columns.
set.seed(1)
factor_predictors <- setdiff(
  names(employee1)[vapply(employee1, is.factor, logical(1))],
  "Attrition"
)
# Skip the assignment when nothing is left to convert (e.g. on a re-run)
if (length(factor_predictors) > 0) {
  employee1[factor_predictors] <- data.matrix(employee1[factor_predictors])
}
# Normalization
# Min-max rescale a numeric vector onto [0, 1].
#
# x      numeric vector to rescale.
# na.rm  if TRUE, NAs are ignored when finding the range and stay NA in the
#        result. The default FALSE keeps the original behaviour, where any NA
#        in x makes the whole result NA.
#
# Returns a numeric vector the same length as x. A constant vector has zero
# range; it is mapped to 0 instead of the NaN that 0/0 would produce.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # Zero range: divide by 1 so every element becomes 0 rather than NaN
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - lo) / span
}
# Rescale every predictor (all columns except column 2, the response) and
# collect the results back into a data frame.
predictors_scaled <- lapply(employee1[, -2], normalize)
# Put the untouched response in front of the scaled predictors.
# NOTE(review): cbind() names that first column after the expression that
# produced it ("employee1$Attrition"); kept as is because later code may
# refer to it by that name or by position.
employee.n <- cbind(employee1$Attrition, as.data.frame(predictors_scaled))
#Verifying one of the features
summary(employee.n$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2857 0.4286 0.4465 0.5714 1.0000
part d)- Feature engineering-PCA
# Principal Component Analysis on the predictor columns 2-32
# (column 1 holds the Attrition response)
pca <- prcomp(employee.n[, 2:32], scale. = TRUE)
# Standard deviations of the components (square roots of the eigenvalues)
pca$sdev
## [1] 2.1647948 1.3826349 1.3455357 1.3094111 1.2603586 1.0916380 1.0779099
## [8] 1.0680234 1.0518422 1.0389891 1.0150137 1.0099269 0.9989128 0.9905508
## [15] 0.9787930 0.9721271 0.9676461 0.9437598 0.9371604 0.9155697 0.9105246
## [22] 0.8519604 0.7386985 0.7169205 0.5714016 0.5611696 0.5266126 0.4712836
## [29] 0.4362052 0.3750620 0.2148756
# Scree plot
fviz_eig(pca)
# Note that the elbow is at 2 dimensions
# So, only the first 2 of the 31 principal dimensions are examined below
# Circle of correlations
fviz_pca_var(
  pca,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
# Rotation values (called loadings in some methods), used to select features
# for dimension 1: sorted ascending so the top 5 and bottom 5 of the list
# give contributions in opposite directions (the circle of correlations can
# also be used to verify this)
pc1_rotation <- pca$rotation[, 1]
pc1_rotation[order(pc1_rotation)]
## TotalWorkingYears YearsAtCompany JobLevel
## -0.4025337444 -0.3871538079 -0.3828129161
## MonthlyIncome YearsInCurrentRole YearsWithCurrManager
## -0.3737297815 -0.3310094178 -0.3285591888
## YearsSinceLastPromotion Age Education
## -0.2920206353 -0.2804944941 -0.0783450760
## NumCompaniesWorked StockOptionLevel WorkLifeBalance
## -0.0493835921 -0.0342325831 -0.0122944552
## DistanceFromHome RelationshipSatisfaction JobInvolvement
## -0.0093213955 -0.0081754856 -0.0064832230
## Department PerformanceRating EnvironmentSatisfaction
## -0.0062809651 -0.0061742201 -0.0052079207
## MonthlyRate DailyRate BusinessTravel
## -0.0036171666 -0.0017954140 0.0007696971
## EmployeeNumber OverTime TrainingTimesLastYear
## 0.0056997083 0.0058668933 0.0066309936
## PercentSalaryHike JobSatisfaction HourlyRate
## 0.0088169474 0.0097840834 0.0108770719
## Gender EducationField JobRole
## 0.0208039036 0.0239458907 0.0652706405
## MaritalStatus
## 0.0670899520
# for dimension 2 (again selecting the top 5 and bottom 5 of the sorted list)
pc2_rotation <- pca$rotation[, 2]
pc2_rotation[order(pc2_rotation)]
## YearsInCurrentRole YearsWithCurrManager JobRole
## -0.308535600 -0.308068262 -0.293929060
## Department YearsAtCompany YearsSinceLastPromotion
## -0.268780231 -0.251755297 -0.235079683
## PerformanceRating PercentSalaryHike MaritalStatus
## -0.197156220 -0.179008833 -0.143085244
## TrainingTimesLastYear WorkLifeBalance JobSatisfaction
## -0.057492879 -0.055894247 -0.040312477
## DistanceFromHome EducationField Gender
## -0.036724395 -0.036544074 -0.004765982
## EmployeeNumber OverTime EnvironmentSatisfaction
## 0.010060889 0.012210029 0.038777578
## MonthlyRate RelationshipSatisfaction JobInvolvement
## 0.041024308 0.050836624 0.056707297
## HourlyRate DailyRate BusinessTravel
## 0.060063555 0.062760587 0.087842393
## StockOptionLevel Education JobLevel
## 0.098859230 0.132041432 0.172756610
## TotalWorkingYears MonthlyIncome Age
## 0.190960013 0.192154519 0.328434360
## NumCompaniesWorked
## 0.409461497
# now union of features selected from dimensions 1 and 2 can be used to reduce overall number of features in dataset
part e) - new derived features
# We saw from PCA that education level does not contribute much by itself, but it is an important criterion in people analytics
# To include this, calculating monthly income per degree or income per education level
set.seed(1)
# Min-max rescaling helper for the new derived feature
normalize <- function(x) {
  lo <- min(x)
  (x - lo) / (max(x) - lo)
}
# Derived feature: monthly income divided by education level, rescaled
IncomePerDegree <- data.frame(
  Incomeperdegree = normalize(employee1$MonthlyIncome / employee1$Education)
)
# Drop (by column position) the variables judged redundant or insignificant
# from the PCA results, then append the derived feature
dropped_cols <- c(3, 4, 5, 6, 7, 8, 13, 16, 19, 21, 22, 23, 24, 25, 27, 28, 31)
employee.n <- cbind(employee.n[, -dropped_cols], IncomePerDegree)
Part a) creation of training & validation datasets
# 75% of the rows go to training and the rest to testing; the 3/4 share was
# chosen because the dataset is noisy. The split is drawn on the response,
# which is the first column of employee.n.
set.seed(1)
index <- createDataPartition(y = employee.n[[1]], p = 0.75, list = FALSE)
employee_train <- employee.n[index, ]
employee_test <- employee.n[-index, ]
Part b),c),d),e)
Model 1-KNN
# Holdout method: a single fit on the training split, no resampling folds
trctr <- trainControl(method = "none")
model_knn <- train(
  x = employee_train[, 2:16],
  y = employee_train[, 1],
  method = "knn",
  trControl = trctr
)
# Predictions for the held-out rows and their class counts
pred_knn <- predict(model_knn, newdata = employee_test[, 2:16])
table(pred_knn)
## pred_knn
## No Yes
## 329 21
# Accuracy (Holdout method): predictions against the true test labels
confusionMatrix(data = pred_knn, reference = employee_test[, 1]) # Accuracy=86.57%
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 295 34
## Yes 13 8
##
## Accuracy : 0.8657
## 95% CI : (0.8255, 0.8996)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.818287
##
## Kappa : 0.1891
## Mcnemar's Test P-Value : 0.003531
##
## Sensitivity : 0.9578
## Specificity : 0.1905
## Pos Pred Value : 0.8967
## Neg Pred Value : 0.3810
## Prevalence : 0.8800
## Detection Rate : 0.8429
## Detection Prevalence : 0.9400
## Balanced Accuracy : 0.5741
##
## 'Positive' Class : No
##
# The response (Attrition) is categorical, so the fit is judged from the confusion matrix; RMSE and similar regression metrics do not apply
# kNN with 10-fold cross-validation repeated 3 times; tuneLength = 10 sets
# how many candidate tuning values caret evaluates
# NOTE(review): no set.seed() directly before this resampled fit, so the
# folds depend on the RNG state left by earlier code
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knn_fit <- train(
  `employee1$Attrition` ~ .,
  data = employee_train,
  method = "knn",
  trControl = trctrl,
  tuneLength = 10
)
# Predictions on the test predictors (response column dropped)
pred_knn_cv <- predict(knn_fit, newdata = employee_test[, -1])
table(pred_knn_cv)
## pred_knn_cv
## No Yes
## 345 5
# Accuracy (CV method): predictions against the true test labels
confusionMatrix(data = pred_knn_cv, reference = employee_test[, 1]) # Accuracy = 88.29%
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 306 39
## Yes 2 3
##
## Accuracy : 0.8829
## 95% CI : (0.8445, 0.9146)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.4755
##
## Kappa : 0.1048
## Mcnemar's Test P-Value : 1.885e-08
##
## Sensitivity : 0.99351
## Specificity : 0.07143
## Pos Pred Value : 0.88696
## Neg Pred Value : 0.60000
## Prevalence : 0.88000
## Detection Rate : 0.87429
## Detection Prevalence : 0.98571
## Balanced Accuracy : 0.53247
##
## 'Positive' Class : No
##
# repeated cv sampling instead of no sampling with k=15, number=10, repeats=3 increases accuracy from 86.57% to 88.29%.
Model 2-svm
# Holdout method: linear-kernel SVM fitted once on the training split
# (cross = 0 requests no cross-validation)
set.seed(1)
model_svm <- ksvm(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  kernel = "vanilladot",
  cross = 0
)
## Setting default kernel parameters
# Predictions for the held-out rows and their class counts
pred_svm <- predict(model_svm, employee_test[, 2:16])
table(pred_svm)
## pred_svm
## No Yes
## 350 0
# Accuracy (Holdout method): predictions against the true test labels
confusionMatrix(data = pred_svm, reference = employee_test[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 308 42
## Yes 0 0
##
## Accuracy : 0.88
## 95% CI : (0.8413, 0.9121)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.541
##
## Kappa : 0
## Mcnemar's Test P-Value : 2.509e-10
##
## Sensitivity : 1.00
## Specificity : 0.00
## Pos Pred Value : 0.88
## Neg Pred Value : NaN
## Prevalence : 0.88
## Detection Rate : 0.88
## Detection Prevalence : 1.00
## Balanced Accuracy : 0.50
##
## 'Positive' Class : No
##
# I tried changing the inner product in feature space between the two vector arguments by using different kernels
# vanilladot, rbfdot, polydot, laplacedot, anovadot gave accuracies of 88%,
# splinedot of 80.86%
# tanhdot and besseldot of 77.14%
# Accuracy (CV): the same linear-kernel SVM, with cross = 10 requesting a
# 10-fold cross-validation on the training data
model_svm_cv <- ksvm(
  `employee1$Attrition` ~ .,
  data = employee_train,
  kernel = "vanilladot",
  cross = 10
)
## Setting default kernel parameters
pred_svm_cv <- predict(model_svm_cv, employee_test[, 2:16])
confusionMatrix(data = pred_svm_cv, reference = employee_test[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 308 42
## Yes 0 0
##
## Accuracy : 0.88
## 95% CI : (0.8413, 0.9121)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.541
##
## Kappa : 0
## Mcnemar's Test P-Value : 2.509e-10
##
## Sensitivity : 1.00
## Specificity : 0.00
## Pos Pred Value : 0.88
## Neg Pred Value : NaN
## Prevalence : 0.88
## Detection Rate : 0.88
## Detection Prevalence : 1.00
## Balanced Accuracy : 0.50
##
## 'Positive' Class : No
##
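# This gives the same test accuracy as the holdout method (88%): cross = 10 only adds a cross-validated error estimate on the training data, while the test predictions still come from the model fitted on all of employee_train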
Model 3- Decision Tree
# rpart classification tree on predictor columns 2-16
set.seed(1)
rtree_fit <- rpart(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  method = "class"
)
rpart.plot(rtree_fit)
# Class labels for the test rows, then the confusion matrix
pred_rtree <- predict(rtree_fit, newdata = employee_test[, 2:16], type = "class")
confusionMatrix(data = pred_rtree, reference = employee_test[, 1]) # 88.57%
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 298 30
## Yes 10 12
##
## Accuracy : 0.8857
## 95% CI : (0.8476, 0.9171)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.410232
##
## Kappa : 0.3188
## Mcnemar's Test P-Value : 0.002663
##
## Sensitivity : 0.9675
## Specificity : 0.2857
## Pos Pred Value : 0.9085
## Neg Pred Value : 0.5455
## Prevalence : 0.8800
## Detection Rate : 0.8514
## Detection Prevalence : 0.9371
## Balanced Accuracy : 0.6266
##
## 'Positive' Class : No
##
# printing cp table
printcp(rtree_fit)
##
## Classification tree:
## rpart(formula = employee_train[, 1] ~ ., data = employee_train[,
## 2:16], method = "class")
##
## Variables actually used in tree construction:
## [1] Age EmployeeNumber EnvironmentSatisfaction
## [4] HourlyRate MaritalStatus NumCompaniesWorked
## [7] TotalWorkingYears YearsInCurrentRole
##
## Root node error: 128/1053 = 0.12156
##
## n= 1053
##
## CP nsplit rel error xerror xstd
## 1 0.031250 0 1.00000 1.0000 0.082842
## 2 0.023438 2 0.93750 1.0391 0.084216
## 3 0.015625 3 0.91406 1.0625 0.085022
## 4 0.010417 7 0.83594 1.1562 0.088111
## 5 0.010000 10 0.80469 1.1641 0.088359
# plotting cross-validation results
plotcp(rtree_fit)
# the plot shows that dividing trees into more nodes increases relative validation errors
# for this reason, this model is not so good
# Decision tree using C50 (no bias like rpart)
# also, this method does not require pruning
set.seed(1)
# trials = 10 turns on boosting with 10 trials; other values of trials
# were tried and 10 gave the best accuracy
fit <- C5.0(
  x = employee_train[, 2:16],
  y = employee_train[, 1],
  trials = 10
)
summary(fit)
##
## Call:
## C5.0.default(x = employee_train[, 2:16], y = employee_train[, 1], trials
## = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Apr 22 16:48:15 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 1053 cases (16 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## JobLevel > 0:
## :...JobRole <= 0.75: No (431/9)
## : JobRole > 0.75:
## : :...MaritalStatus <= 0.5: No (162/10)
## : MaritalStatus > 0.5:
## : :...EnvironmentSatisfaction > 0: No (58/12)
## : EnvironmentSatisfaction <= 0:
## : :...YearsInCurrentRole > 0.1666667: No (7/1)
## : YearsInCurrentRole <= 0.1666667:
## : :...Age <= 0.9047619: Yes (8)
## : Age > 0.9047619: No (2)
## JobLevel <= 0:
## :...MaritalStatus <= 0.5:
## :...TotalWorkingYears > 0.02631579: No (203/21)
## : TotalWorkingYears <= 0.02631579:
## : :...EnvironmentSatisfaction > 0.6666667: No (15/1)
## : EnvironmentSatisfaction <= 0.6666667:
## : :...YearsAtCompany <= 0: Yes (2)
## : YearsAtCompany > 0:
## : :...JobRole > 0.5: No (10/2)
## : JobRole <= 0.5:
## : :...Gender <= 0: Yes (4)
## : Gender > 0:
## : :...EnvironmentSatisfaction <= 0: Yes (2)
## : EnvironmentSatisfaction > 0: No (6/1)
## MaritalStatus > 0.5:
## :...NumCompaniesWorked > 0.5555556: Yes (11/2)
## NumCompaniesWorked <= 0.5555556:
## :...Incomeperdegree <= 0.008630767: Yes (7)
## Incomeperdegree > 0.008630767:
## :...Age > 0.3571429: No (42/4)
## Age <= 0.3571429:
## :...EmployeeNumber > 0.9245283: No (8)
## EmployeeNumber <= 0.9245283:
## :...HourlyRate <= 0.2: No (12/1)
## HourlyRate > 0.2:
## :...YearsInCurrentRole > 0.2222222: No (8/1)
## YearsInCurrentRole <= 0.2222222:
## :...JobRole <= 0.5: Yes (19/5)
## JobRole > 0.5:
## :...MonthlyIncome <= 0.07704055: Yes (19/4)
## MonthlyIncome > 0.07704055: No (17/4)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## YearsInCurrentRole > 0.3888889: No (160.2/11.6)
## YearsInCurrentRole <= 0.3888889:
## :...MaritalStatus <= 0: No (165/29.4)
## MaritalStatus > 0:
## :...JobRole > 0.75:
## :...TotalWorkingYears <= 0.02631579: Yes (29.4/5.4)
## : TotalWorkingYears > 0.02631579:
## : :...HourlyRate <= 0.2142857: No (27/0.8)
## : HourlyRate > 0.2142857:
## : :...HourlyRate > 0.8: No (33.2/5.4)
## : HourlyRate <= 0.8:
## : :...MaritalStatus <= 0.5: No (74.1/34.9)
## : MaritalStatus > 0.5: Yes (82.7/23.9)
## JobRole <= 0.75:
## :...Age > 0.7142857: No (44.7)
## Age <= 0.7142857:
## :...JobRole <= 0: No (37/3.9)
## JobRole > 0:
## :...HourlyRate <= 0.1: No (26.2/0.8)
## HourlyRate > 0.1:
## :...EnvironmentSatisfaction <= 0:
## :...Incomeperdegree > 0.1344445: No (12.3)
## : Incomeperdegree <= 0.1344445:
## : :...NumCompaniesWorked <= 0: No (3.8)
## : NumCompaniesWorked > 0: Yes (66.5/20.8)
## EnvironmentSatisfaction > 0:
## :...Gender <= 0: No (98.8/20.1)
## Gender > 0:
## :...EnvironmentSatisfaction > 0.6666667: No (72.5/17)
## EnvironmentSatisfaction <= 0.6666667:
## :...NumCompaniesWorked <= 0.4444444: No (95.7/34.8)
## NumCompaniesWorked > 0.4444444: Yes (24/4.6)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## YearsWithCurrManager > 0.4117647: No (148.8/16.4)
## YearsWithCurrManager <= 0.4117647:
## :...JobRole <= 0.125: No (69.6/10.7)
## JobRole > 0.125:
## :...Incomeperdegree <= 0.009876952: Yes (32.3/6.3)
## Incomeperdegree > 0.009876952:
## :...TotalWorkingYears > 0.4736842: No (66.6/3.2)
## TotalWorkingYears <= 0.4736842:
## :...EmployeeNumber > 0.934688: No (37.8/3.8)
## EmployeeNumber <= 0.934688:
## :...EnvironmentSatisfaction > 0.6666667:
## :...MaritalStatus <= 0: No (28.8)
## : MaritalStatus > 0:
## : :...NumCompaniesWorked > 0.6666667: No (13.9)
## : NumCompaniesWorked <= 0.6666667:
## : :...YearsWithCurrManager <= 0: Yes (52.4/20.8)
## : YearsWithCurrManager > 0: No (94.5/28.4)
## EnvironmentSatisfaction <= 0.6666667:
## :...NumCompaniesWorked <= 0.2222222:
## :...NumCompaniesWorked > 0.1111111: No (28.3/0.6)
## : NumCompaniesWorked <= 0.1111111:
## : :...JobLevel > 0:
## : :...TotalWorkingYears <= 0.3947369: No (74.3/15.1)
## : : TotalWorkingYears > 0.3947369: Yes (5)
## : JobLevel <= 0:
## : :...MonthlyIncome > 0.1524487: No (9.5)
## : MonthlyIncome <= 0.1524487:
## : :...MaritalStatus <= 0.5: No (78.9/32.9)
## : MaritalStatus > 0.5:
## : :...EmployeeNumber <= 0.1185293: No (17.7/3.2)
## : EmployeeNumber > 0.1185293: Yes (64.4/18.9)
## NumCompaniesWorked > 0.2222222:
## :...EnvironmentSatisfaction > 0.3333333:
## :...TotalWorkingYears <= 0.2368421: Yes (47.9/20.2)
## : TotalWorkingYears > 0.2368421: No (22/0.6)
## EnvironmentSatisfaction <= 0.3333333:
## :...JobRole > 0.75: Yes (73/16.5)
## JobRole <= 0.75:
## :...Age > 0.6190476: No (11.4)
## Age <= 0.6190476:
## :...NumCompaniesWorked > 0.7777778: No (5.7)
## NumCompaniesWorked <= 0.7777778:
## :...MaritalStatus <= 0.5: Yes (54.2/12)
## MaritalStatus > 0.5: No (15.8/4.4)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## YearsInCurrentRole > 0.5: No (40.4)
## YearsInCurrentRole <= 0.5:
## :...MonthlyIncome > 0.6680358: No (49.2)
## MonthlyIncome <= 0.6680358:
## :...TotalWorkingYears <= 0.05263158:
## :...NumCompaniesWorked <= 0: Yes (9.7/1)
## : NumCompaniesWorked > 0:
## : :...HourlyRate <= 0.3285714: Yes (49.6/14)
## : HourlyRate > 0.3285714:
## : :...Incomeperdegree <= 0.008630767: Yes (5.7)
## : Incomeperdegree > 0.008630767: No (85.3/29.1)
## TotalWorkingYears > 0.05263158:
## :...HourlyRate <= 0.2142857: No (118.5/16.9)
## HourlyRate > 0.2142857:
## :...MaritalStatus <= 0: No (102.9/24.5)
## MaritalStatus > 0:
## :...Gender <= 0:
## :...TotalWorkingYears > 0.2368421: No (63.8)
## : TotalWorkingYears <= 0.2368421:
## : :...Age > 0.3095238: No (63.6/17.9)
## : Age <= 0.3095238:
## : :...NumCompaniesWorked <= 0.6666667: Yes (64.2/20.7)
## : NumCompaniesWorked > 0.6666667: No (9.7/2.1)
## Gender > 0:
## :...YearsWithCurrManager > 0.2941177:
## :...JobRole <= 0.125: No (7.6)
## : JobRole > 0.125:
## : :...Age <= 0.2142857: No (5.8)
## : Age > 0.2142857:
## : :...EmployeeNumber <= 0.06337687: No (6.7)
## : EmployeeNumber > 0.06337687: Yes (98.4/30.3)
## YearsWithCurrManager <= 0.2941177:
## :...YearsInCurrentRole > 0.2222222: No (30.8)
## YearsInCurrentRole <= 0.2222222:
## :...EmployeeNumber <= 0.1601355: Yes (44.7/7.9)
## EmployeeNumber > 0.1601355:
## :...JobRole <= 0.125: No (10.2)
## JobRole > 0.125:
## :...YearsInCurrentRole <= 0.05555556: No (36/4.2)
## YearsInCurrentRole > 0.05555556:
## :...TotalWorkingYears <= 0.07894737: No (8.7)
## TotalWorkingYears > 0.07894737: [S1]
##
## SubTree [S1]
##
## YearsInCurrentRole <= 0.1666667: No (115.9/44.7)
## YearsInCurrentRole > 0.1666667: Yes (25.7/8.8)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## YearsWithCurrManager > 0.4117647: No (128.9/16.7)
## YearsWithCurrManager <= 0.4117647:
## :...TotalWorkingYears <= 0.02631579:
## :...Age > 0.3333333: No (19.6/3.8)
## : Age <= 0.3333333:
## : :...TotalWorkingYears <= 0: No (17/5.6)
## : TotalWorkingYears > 0: Yes (91.8/27.1)
## TotalWorkingYears > 0.02631579:
## :...NumCompaniesWorked > 0.4444444:
## :...EnvironmentSatisfaction > 0.6666667: No (33.8/2.6)
## : EnvironmentSatisfaction <= 0.6666667:
## : :...Gender <= 0:
## : :...MaritalStatus <= 0.5: No (42.1/8.3)
## : : MaritalStatus > 0.5: Yes (25/8.3)
## : Gender > 0:
## : :...YearsAtCompany <= 0.08108108: Yes (75.8/16.1)
## : YearsAtCompany > 0.08108108: No (38.6/14.1)
## NumCompaniesWorked <= 0.4444444:
## :...EmployeeNumber > 0.9351717: No (31.3)
## EmployeeNumber <= 0.9351717:
## :...JobLevel > 0.25: No (77.3/8.8)
## JobLevel <= 0.25:
## :...MaritalStatus <= 0: No (51/10)
## MaritalStatus > 0:
## :...HourlyRate <= 0.2142857: No (53.2/7.4)
## HourlyRate > 0.2142857:
## :...YearsInCurrentRole > 0.2222222: No (89.4/23.4)
## YearsInCurrentRole <= 0.2222222:
## :...Age > 0.7380952: Yes (15.5/2.1)
## Age <= 0.7380952:
## :...TotalWorkingYears > 0.2631579: No (18.5/1.3)
## TotalWorkingYears <= 0.2631579:
## :...TotalWorkingYears > 0.2368421: Yes (18.9/5.5)
## TotalWorkingYears <= 0.2368421:
## :...MaritalStatus <= 0.5:
## :...Age > 0.5: No (11.9)
## : Age <= 0.5:
## : :...HourlyRate > 0.7857143: No (19.9)
## : HourlyRate <= 0.7857143: [S1]
## MaritalStatus > 0.5:
## :...Incomeperdegree > 0.1726026: Yes (5.8)
## Incomeperdegree <= 0.1726026: [S2]
##
## SubTree [S1]
##
## TotalWorkingYears > 0.2105263: No (4.4)
## TotalWorkingYears <= 0.2105263:
## :...Age <= 0.3571429: No (49.3/18.8)
## Age > 0.3571429: Yes (27.4/3.8)
##
## SubTree [S2]
##
## EmployeeNumber > 0.6806967: No (20.1/2.1)
## EmployeeNumber <= 0.6806967:
## :...EmployeeNumber > 0.6303822: Yes (8.7)
## EmployeeNumber <= 0.6303822:
## :...HourlyRate <= 0.7: Yes (48.2/16.7)
## HourlyRate > 0.7: No (29.6/7.2)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## MonthlyIncome > 0.6680358: No (41.4)
## MonthlyIncome <= 0.6680358:
## :...JobRole <= 0: No (44.2/4)
## JobRole > 0:
## :...EnvironmentSatisfaction <= 0:
## :...Incomeperdegree > 0.1570805: No (16.5)
## : Incomeperdegree <= 0.1570805:
## : :...MonthlyIncome > 0.4799895: Yes (18/2.1)
## : MonthlyIncome <= 0.4799895:
## : :...YearsInCurrentRole > 0.1666667: No (53.5/15.5)
## : YearsInCurrentRole <= 0.1666667:
## : :...TotalWorkingYears <= 0.1842105: Yes (85.5/26.1)
## : TotalWorkingYears > 0.1842105: No (51.9/18.4)
## EnvironmentSatisfaction > 0:
## :...YearsWithCurrManager > 0.4117647: No (62.6/6.2)
## YearsWithCurrManager <= 0.4117647:
## :...TotalWorkingYears > 0.4210526: No (32.2/4.1)
## TotalWorkingYears <= 0.4210526:
## :...Age > 0.6190476: No (49.3/9)
## Age <= 0.6190476:
## :...Incomeperdegree <= 0.009876952: Yes (26.6/5.9)
## Incomeperdegree > 0.009876952:
## :...YearsWithCurrManager > 0.3529412:
## :...Age > 0.4285714: No (11.8)
## : Age <= 0.4285714:
## : :...YearsInCurrentRole <= 0.05555556: No (4.7)
## : YearsInCurrentRole > 0.05555556:
## : :...EmployeeNumber <= 0.8703435: Yes (80.2/25.6)
## : EmployeeNumber > 0.8703435: No (5.5)
## YearsWithCurrManager <= 0.3529412:
## :...YearsInCurrentRole > 0.2222222: No (44.1/6.2)
## YearsInCurrentRole <= 0.2222222:
## :...MaritalStatus > 0.5:
## :...EmployeeNumber <= 0.01306241: Yes (11.2/0.3)
## : EmployeeNumber > 0.01306241:
## : :...TotalWorkingYears > 0.2368421: No (13.4)
## : TotalWorkingYears <= 0.2368421:
## : :...YearsAtCompany > 0.1351351: Yes (19.8/5.5)
## : YearsAtCompany <= 0.1351351: [S1]
## MaritalStatus <= 0.5:
## :...YearsInCurrentRole > 0.1111111: No (36.3/3.3)
## YearsInCurrentRole <= 0.1111111: [S2]
##
## SubTree [S1]
##
## YearsInCurrentRole <= 0: Yes (59.4/25)
## YearsInCurrentRole > 0: No (96.1/34.6)
##
## SubTree [S2]
##
## YearsWithCurrManager > 0.1764706: Yes (14.1/3.9)
## YearsWithCurrManager <= 0.1764706:
## :...MonthlyIncome <= 0.06824645: No (34.9/2.8)
## MonthlyIncome > 0.06824645:
## :...YearsWithCurrManager > 0.1176471: No (8.4)
## YearsWithCurrManager <= 0.1176471:
## :...YearsInCurrentRole <= 0: No (55.9/16.1)
## YearsInCurrentRole > 0:
## :...Age <= 0.2142857: No (9.7)
## Age > 0.2142857:
## :...Incomeperdegree <= 0.1149708: Yes (57.2/18.6)
## Incomeperdegree > 0.1149708: No (8.7)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## YearsWithCurrManager > 0.5294118: No (26.3)
## YearsWithCurrManager <= 0.5294118:
## :...MonthlyIncome > 0.6680358: No (26.2)
## MonthlyIncome <= 0.6680358:
## :...MaritalStatus > 0.5:
## :...Gender <= 0:
## : :...JobRole <= 0.875: No (121.6/32)
## : : JobRole > 0.875: Yes (27.3/6.3)
## : Gender > 0:
## : :...JobRole <= 0.25:
## : :...JobRole <= 0.125: No (5)
## : : JobRole > 0.125: Yes (64/20.1)
## : JobRole > 0.25:
## : :...JobRole <= 0.625: No (10.9)
## : JobRole > 0.625:
## : :...YearsInCurrentRole > 0.4444444: Yes (14.3/0.6)
## : YearsInCurrentRole <= 0.4444444:
## : :...MonthlyIncome > 0.4136914: Yes (18.9/1.7)
## : MonthlyIncome <= 0.4136914:
## : :...Incomeperdegree > 0.1554888: No (12.3)
## : Incomeperdegree <= 0.1554888:
## : :...NumCompaniesWorked <= 0.5555556: No (110.1/38.3)
## : NumCompaniesWorked > 0.5555556: Yes (28.4/9.4)
## MaritalStatus <= 0.5:
## :...YearsInCurrentRole > 0.4444444: No (26)
## YearsInCurrentRole <= 0.4444444:
## :...EnvironmentSatisfaction > 0.3333333:
## :...Age <= 0.1904762: Yes (47.8/22.1)
## : Age > 0.1904762: No (247.7/42.6)
## EnvironmentSatisfaction <= 0.3333333:
## :...NumCompaniesWorked <= 0: No (20.9)
## NumCompaniesWorked > 0:
## :...YearsInCurrentRole > 0.2777778: Yes (57.5/19.1)
## YearsInCurrentRole <= 0.2777778:
## :...YearsInCurrentRole > 0.1666667: No (11.2)
## YearsInCurrentRole <= 0.1666667:
## :...JobRole > 0.875: Yes (19.5/4.6)
## JobRole <= 0.875:
## :...NumCompaniesWorked <= 0.2222222: No (60.8/12.9)
## NumCompaniesWorked > 0.2222222:
## :...NumCompaniesWorked > 0.7777778: No (7.5)
## NumCompaniesWorked <= 0.7777778:
## :...YearsAtCompany <= 0: No (6.2)
## YearsAtCompany > 0:
## :...Gender <= 0: No (23.5/6.5)
## Gender > 0: Yes (59.3/17.9)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## YearsInCurrentRole > 0.3888889: No (102.7/15.5)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears > 0.4736842: No (46.4/6.1)
## TotalWorkingYears <= 0.4736842:
## :...MaritalStatus <= 0.5:
## :...EnvironmentSatisfaction > 0.6666667:
## : :...NumCompaniesWorked <= 0: Yes (21.6/7.6)
## : : NumCompaniesWorked > 0: No (95.3/11.3)
## : EnvironmentSatisfaction <= 0.6666667:
## : :...NumCompaniesWorked <= 0: No (36.9/2.3)
## : NumCompaniesWorked > 0:
## : :...HourlyRate > 0.4142857:
## : :...YearsWithCurrManager > 0.4117647: Yes (12.6/3.7)
## : : YearsWithCurrManager <= 0.4117647:
## : : :...JobLevel > 0.25: No (13.6)
## : : JobLevel <= 0.25:
## : : :...HourlyRate <= 0.5285714: No (32.7/1.3)
## : : HourlyRate > 0.5285714:
## : : :...HourlyRate <= 0.5857143: Yes (32.4/9.9)
## : : HourlyRate > 0.5857143: No (147.6/40.4)
## : HourlyRate <= 0.4142857:
## : :...YearsAtCompany > 0.3243243: Yes (7.7)
## : YearsAtCompany <= 0.3243243:
## : :...YearsAtCompany > 0.1891892: No (14.6)
## : YearsAtCompany <= 0.1891892:
## : :...EmployeeNumber <= 0.1132075: No (8.3)
## : EmployeeNumber > 0.1132075:
## : :...MonthlyIncome > 0.1246445: No (33.7/11.9)
## : MonthlyIncome <= 0.1246445:
## : :...MonthlyIncome <= 0.07035282: No (17.8/5.5)
## : MonthlyIncome > 0.07035282: Yes (51.3/4.3)
## MaritalStatus > 0.5:
## :...TotalWorkingYears > 0.2894737: No (20.1/3.1)
## TotalWorkingYears <= 0.2894737:
## :...JobRole > 0.75:
## :...MonthlyIncome <= 0.06966825: Yes (15.7)
## : MonthlyIncome > 0.06966825:
## : :...MonthlyIncome <= 0.08788836: No (10.5)
## : MonthlyIncome > 0.08788836:
## : :...YearsInCurrentRole <= 0.05555556: Yes (18.6/3.2)
## : YearsInCurrentRole > 0.05555556:
## : :...HourlyRate <= 0.2142857: No (11.1)
## : HourlyRate > 0.2142857:
## : :...Incomeperdegree <= 0.06218617: No (23.6/5.2)
## : Incomeperdegree > 0.06218617:
## : :...Incomeperdegree <= 0.1468215: Yes (58.4/12)
## : Incomeperdegree > 0.1468215: No (7.9)
## JobRole <= 0.75:
## :...JobLevel > 0: No (35/4.5)
## JobLevel <= 0:
## :...NumCompaniesWorked > 0.5555556: Yes (17.7/3.4)
## NumCompaniesWorked <= 0.5555556:
## :...YearsInCurrentRole > 0.2222222: No (11.1)
## YearsInCurrentRole <= 0.2222222:
## :...HourlyRate <= 0.2: No (15.5/1.7)
## HourlyRate > 0.2:
## :...MonthlyIncome > 0.1631385: No (6.6)
## MonthlyIncome <= 0.1631385:
## :...NumCompaniesWorked <= 0: Yes (25/3.1)
## NumCompaniesWorked > 0:
## :...Incomeperdegree > 0.09834931: Yes (12.3/1.7)
## Incomeperdegree <= 0.09834931: [S1]
##
## SubTree [S1]
##
## EnvironmentSatisfaction <= 0: Yes (14.9/3.2)
## EnvironmentSatisfaction > 0: No (73.7/25.2)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## YearsInCurrentRole > 0.3888889: No (70.8)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears <= 0.05263158:
## :...MaritalStatus <= 0: No (25.1/8)
## : MaritalStatus > 0:
## : :...NumCompaniesWorked <= 0: Yes (8.7/0.6)
## : NumCompaniesWorked > 0:
## : :...Age > 0.3571429: No (9.1)
## : Age <= 0.3571429:
## : :...Incomeperdegree <= 0.008630767: Yes (15.4)
## : Incomeperdegree > 0.008630767:
## : :...MonthlyIncome <= 0.0249605: No (12.8/1.4)
## : MonthlyIncome > 0.0249605:
## : :...EmployeeNumber > 0.8234156: Yes (28.6/3.8)
## : EmployeeNumber <= 0.8234156:
## : :...JobRole <= 0.5: Yes (36.4/8.9)
## : JobRole > 0.5: No (47/18.2)
## TotalWorkingYears > 0.05263158:
## :...HourlyRate <= 0.2142857: No (110.1/13.2)
## HourlyRate > 0.2142857:
## :...NumCompaniesWorked > 0.2222222:
## :...EnvironmentSatisfaction <= 0.3333333:
## : :...JobRole > 0.75:
## : : :...YearsInCurrentRole <= 0: No (9.8)
## : : : YearsInCurrentRole > 0:
## : : : :...Incomeperdegree <= 0.1714566: Yes (66.7/9)
## : : : Incomeperdegree > 0.1714566: No (5.5)
## : : JobRole <= 0.75:
## : : :...EmployeeNumber > 0.8064829: No (11)
## : : EmployeeNumber <= 0.8064829:
## : : :...Incomeperdegree <= 0.02843391: Yes (33/5.5)
## : : Incomeperdegree > 0.02843391:
## : : :...Gender <= 0: No (20.8)
## : : Gender > 0: Yes (54.2/24.9)
## : EnvironmentSatisfaction > 0.3333333:
## : :...YearsAtCompany > 0.1621622: No (20.3)
## : YearsAtCompany <= 0.1621622:
## : :...JobLevel > 0.25: No (7.6)
## : JobLevel <= 0.25:
## : :...NumCompaniesWorked > 0.7777778: No (10.6)
## : NumCompaniesWorked <= 0.7777778:
## : :...MaritalStatus <= 0.5: No (58.2/17.1)
## : MaritalStatus > 0.5: Yes (42.1/14.6)
## NumCompaniesWorked <= 0.2222222:
## :...MaritalStatus <= 0: No (42.7)
## MaritalStatus > 0:
## :...Incomeperdegree <= 0.01425728: Yes (18/3.2)
## Incomeperdegree > 0.01425728:
## :...NumCompaniesWorked > 0.1111111: No (27.9/3.9)
## NumCompaniesWorked <= 0.1111111:
## :...JobLevel <= 0:
## :...Age > 0.4761905: Yes (24.1/6.3)
## : Age <= 0.4761905:
## : :...Age > 0.3333333: No (26.5)
## : Age <= 0.3333333:
## : :...MonthlyIncome <= 0.07440758: Yes (34.9/12.1)
## : MonthlyIncome > 0.07440758: No (57.3/11.2)
## JobLevel > 0:
## :...MonthlyIncome <= 0.1891522: No (23.7)
## MonthlyIncome > 0.1891522:
## :...HourlyRate > 0.8285714: No (14)
## HourlyRate <= 0.8285714:
## :...MonthlyIncome <= 0.1902054: Yes (4.4)
## MonthlyIncome > 0.1902054: No (70.6/21.9)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## YearsInCurrentRole > 0.3888889: No (58.6)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears <= 0.05263158:
## :...Incomeperdegree > 0.119402: No (9.7)
## : Incomeperdegree <= 0.119402:
## : :...HourlyRate <= 0.7285714: Yes (137.5/49)
## : HourlyRate > 0.7285714: No (34.1/10.1)
## TotalWorkingYears > 0.05263158:
## :...MaritalStatus <= 0.5:
## :...HourlyRate <= 0.2142857: No (53.2)
## : HourlyRate > 0.2142857:
## : :...HourlyRate <= 0.4142857:
## : :...EnvironmentSatisfaction > 0.6666667: No (11.3)
## : : EnvironmentSatisfaction <= 0.6666667:
## : : :...JobLevel > 0.5: No (7)
## : : JobLevel <= 0.5:
## : : :...EmployeeNumber <= 0.516207: Yes (61.1/15.8)
## : : EmployeeNumber > 0.516207: No (23.2/3.7)
## : HourlyRate > 0.4142857:
## : :...EnvironmentSatisfaction > 0.3333333: No (114.9/6.5)
## : EnvironmentSatisfaction <= 0.3333333:
## : :...HourlyRate <= 0.5285714: No (31.1)
## : HourlyRate > 0.5285714:
## : :...Gender <= 0: No (42.7/3.1)
## : Gender > 0:
## : :...JobLevel > 0.5: No (8.5)
## : JobLevel <= 0.5:
## : :...EmployeeNumber <= 0.2597968: Yes (37.4/12.8)
## : EmployeeNumber > 0.2597968: No (61.6/13.3)
## MaritalStatus > 0.5:
## :...EnvironmentSatisfaction > 0.6666667:
## :...JobLevel > 0.25: Yes (13.1/4)
## : JobLevel <= 0.25:
## : :...Incomeperdegree <= 0.1678063: No (97.4/17)
## : Incomeperdegree > 0.1678063: Yes (5.8)
## EnvironmentSatisfaction <= 0.6666667:
## :...EmployeeNumber <= 0.09143686: Yes (26.1/5.1)
## EmployeeNumber > 0.09143686:
## :...Gender <= 0: No (66.7/18.6)
## Gender > 0:
## :...JobRole <= 0.125: No (7.5)
## JobRole > 0.125:
## :...HourlyRate <= 0.5: No (56.6/14.4)
## HourlyRate > 0.5:
## :...Age > 0.9285714: No (5)
## Age <= 0.9285714:
## :...EmployeeNumber <= 0.1306241: No (4.9)
## EmployeeNumber > 0.1306241: Yes (62.2/16.2)
##
##
## Evaluation on training data (1053 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 22 78( 7.4%)
## 1 17 136(12.9%)
## 2 23 154(14.6%)
## 3 23 169(16.0%)
## 4 27 127(12.1%)
## 5 29 177(16.8%)
## 6 24 174(16.5%)
## 7 33 114(10.8%)
## 8 33 117(11.1%)
## 9 25 146(13.9%)
## boost 30( 2.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 923 2 (a): class No
## 28 100 (b): class Yes
##
##
## Attribute usage:
##
## 100.00% JobLevel
## 100.00% JobRole
## 100.00% MonthlyIncome
## 100.00% YearsInCurrentRole
## 100.00% YearsWithCurrManager
## 94.40% MaritalStatus
## 93.16% EnvironmentSatisfaction
## 93.16% TotalWorkingYears
## 90.69% HourlyRate
## 88.22% NumCompaniesWorked
## 82.72% EmployeeNumber
## 82.05% Incomeperdegree
## 81.96% Age
## 69.14% Gender
## 40.74% YearsAtCompany
##
##
## Time: 0.1 secs
#print(fit)
#plot(fit)
# Score the held-out rows with `fit`: columns 2-16 of employee_test are passed
# as predictors and column 1 holds the reference labels.
pred_c.50tree <- predict(fit, newdata = employee_test[, 2:16])
# Cross-tabulate predicted vs. reference classes (accuracy 88.57%)
confusionMatrix(data = pred_c.50tree, reference = employee_test[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 300 32
## Yes 8 10
##
## Accuracy : 0.8857
## 95% CI : (0.8476, 0.9171)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.4102318
##
## Kappa : 0.2816
## Mcnemar's Test P-Value : 0.0002762
##
## Sensitivity : 0.9740
## Specificity : 0.2381
## Pos Pred Value : 0.9036
## Neg Pred Value : 0.5556
## Prevalence : 0.8800
## Detection Rate : 0.8571
## Detection Prevalence : 0.9486
## Balanced Accuracy : 0.6061
##
## 'Positive' Class : No
##
Model 4-Random Forest
set.seed(1)
# Baseline forest with default ntree/mtry: column 1 of employee_train is the
# response, columns 2-16 are the predictors.
model_rf <- randomForest(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  importance = TRUE
)
# Tuned forest; mtry = 4 with ntree = 500 was found to give the best fit
model_rf2 <- randomForest(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  ntree = 500,
  mtry = 4,
  importance = TRUE
)
# Class predictions on the test predictors, then accuracy against the labels
pred_rf2 <- predict(model_rf2, employee_test[, 2:16], type = "class")
confusionMatrix(pred_rf2, employee_test[, 1]) # 88.29%
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 302 35
## Yes 6 7
##
## Accuracy : 0.8829
## 95% CI : (0.8445, 0.9146)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.4755
##
## Kappa : 0.2097
## Mcnemar's Test P-Value : 1.226e-05
##
## Sensitivity : 0.9805
## Specificity : 0.1667
## Pos Pred Value : 0.8961
## Neg Pred Value : 0.5385
## Prevalence : 0.8800
## Detection Rate : 0.8629
## Detection Prevalence : 0.9629
## Balanced Accuracy : 0.5736
##
## 'Positive' Class : No
##
# 10-fold cross-validation of the random forest on the training predictors
# (columns 2-16) against the training labels (column 1)
model_rf_cv <- rfcv(
  trainx = employee_train[, 2:16],
  trainy = employee_train[, 1],
  cv.fold = 10
)
# Cross-validated error rate for each number of predictors tried
model_rf_cv$error.cv
## 15 8 4 1
## 0.1168091 0.1206078 0.1405508 0.1272555
# accuracy using 15 features = 88.32%
part f) comparison of models
# Summary table of model accuracies (percent), one row per model.
# NOTE(review): the Random Forest entry (88.32) matches the cross-validated
# figure reported for rfcv, while its test-set confusion matrix gave 88.29 --
# confirm which metric this table is meant to report.
Model <- c("Decision Tree-C5.0", "Random Forest", "kNN", "SVM-vanilladot")
Accuracy_percent <- c(88.57, 88.32, 88.29, 88)
mytable <- data.frame(Model, Accuracy_percent)
# Draw the table as a grob on an otherwise empty plot (no lines, no text)
qplot(1:10, 1:10, geom = "blank") +
  theme(line = element_blank(), text = element_blank()) +
  annotation_custom(grob = tableGrob(mytable))
# The decision tree, followed by the random forest, gave the best accuracies
set.seed(1)
# Plotting the ROC curves for the four models on one set of axes.
# The first call creates the plot; the remaining calls pass add = TRUE so each
# curve is drawn onto the existing axes. (Overlaying with par(new = TRUE)
# starts a fresh plot every time and redraws axes and labels on top of the
# previous ones.)
# NOTE(review): the predictors are class predictions coerced with
# as.numeric(), so each curve is built from only two distinct scores;
# consider passing class probabilities for a full ROC curve.
plot(roc(employee_test[, 1], as.numeric(pred_knn_cv)), col = "red")
plot(roc(employee_test[, 1], as.numeric(pred_svm_cv)), col = "green", add = TRUE)
plot(roc(employee_test[, 1], as.numeric(pred_c.50tree)), col = "blue", add = TRUE)
plot(roc(employee_test[, 1], as.numeric(pred_rf2)), col = "pink", add = TRUE)
legend(
  "bottomright",
  c("knn", "svm", "decision tree", "random forest"),
  fill = c("red", "green", "blue", "pink"),
  title = "Model"
)
# As expected, the decision tree, followed by the random forest, gave the better ROC curves
part g) Interpretation of results/prediction with interval
# Using knn model to predict Employee Attrition for a new test case
# Hand-written test case: one value per predictor column of data_new
t1 <- c(
  25, 2063, 2, 1, 72,
  2, 5, 4, 9992, 4,
  8, 15, 3, 2, 5000
)
# Predictor-only copy of employee.n (its first column, the response, is dropped)
data_new <- employee.n[, -1]
# Append the test case as one extra row after the existing rows
data_new[nrow(employee.n) + 1, ] <- t1
# Normalizing new testrow
set.seed(1)
# Min-max rescaling: the smallest value of x maps to 0 and the largest to 1.
# A constant vector yields NaN (0 / 0), and NA values propagate.
normalize <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  (x - lowest) / (highest - lowest)
}
# Rescale every column of data_new (existing rows plus the appended test case)
# NOTE(review): data_new is built from employee.n, which the comments above
# describe as the already-normalized dataset, while the appended test case
# holds raw values. Re-scaling them together would let the raw values set each
# column's range -- confirm the test case is put on the same scale as the
# training rows before relying on this prediction.
data_new.n <- as.data.frame(lapply(data_new, normalize))
# Row positions are derived from employee.n instead of being hard-coded
# (previously 1:1403 and 1404), so the code keeps working if the number of
# rows changes. The test case sits in the row appended after the labelled rows.
n_labelled_rows <- nrow(employee.n)
new_case_row <- n_labelled_rows + 1
# saving normalized test case
t1 <- data_new.n[new_case_row, ]
# train test and labels
data_new.train <- data.frame(data_new.n[seq_len(n_labelled_rows), ])
data_new.test1 <- data.frame(data_new.n[new_case_row, ])
data_labels <- employee.n[, 1]
# model: k-nearest neighbours with k = 15, labelled rows as the training set
test_pred_1 <- knn(train = data_new.train, test = data_new.test1, cl = data_labels, k = 15)
# prediction
test_pred_1 # 86.57 accurate in 95% CI
## [1] No
## Levels: No Yes
part h) construction of stacked ensemble model
set.seed(1)
# Combining prediction from four models: one column per base model plus the
# true test labels in `y`
predCom <- data.frame(
  pred_knn_cv, pred_rf2, pred_svm, pred_c.50tree,
  y = employee_test[, 1],
  stringsAsFactors = FALSE
)
# Training the ensemble model using random forest
# NOTE(review): the stack is trained on predictions for the test rows and then
# scored on those same rows, so the accuracy below is not an out-of-sample
# estimate; consider fitting the stack on held-out predictions instead.
modelStack <- train(y ~ ., data = predCom, method = "rf")
# The stacked model's predictors are the base-model prediction columns of
# predCom, so those columns are what must be supplied as newdata. (Passing the
# raw feature columns employee_test[, 2:16] presumably only worked because the
# prediction vectors were still present in the workspace.)
stack_inputs <- predCom[, setdiff(names(predCom), "y"), drop = FALSE]
predStack <- predict(modelStack, newdata = stack_inputs) # predictions
confusionMatrix(predStack, employee_test[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 304 36
## Yes 4 6
##
## Accuracy : 0.8857
## 95% CI : (0.8476, 0.9171)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.4102
##
## Kappa : 0.1935
## Mcnemar's Test P-Value : 9.509e-07
##
## Sensitivity : 0.9870
## Specificity : 0.1429
## Pos Pred Value : 0.8941
## Neg Pred Value : 0.6000
## Prevalence : 0.8800
## Detection Rate : 0.8686
## Detection Prevalence : 0.9714
## Balanced Accuracy : 0.5649
##
## 'Positive' Class : No
##
# Accuracy of the stacked ensemble model is 88.57%, which is the same as the accuracy of the decision tree
set.seed(1)
# Repeating the pipeline on employee2 (dataset with imputed missing values) to
# compare accuracies.
# Recode every factor column other than column 2 as numeric codes. The factor
# mask is computed once with vapply(), which always returns a logical vector.
factor_cols <- vapply(employee2[, -2], is.factor, logical(1))
employee2[, -2][factor_cols] <- data.matrix(employee2[, -2][factor_cols])
# Normalization as usual: min-max rescaling of each column to [0, 1].
# Defined once here and reused for the derived feature below.
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
employee.n.m <- as.data.frame(lapply(employee2[, -2], normalize))
# Put the response back as the first column; cbind() names it
# `employee2$Attrition`.
employee.n.m <- cbind(employee2$Attrition, employee.n.m)
# adding the new derived column: monthly income divided by education level,
# rescaled like the other predictors
IncomePerDegree <- data.frame(employee2$MonthlyIncome / employee2$Education)
IncomePerDegree <- as.data.frame(lapply(IncomePerDegree, normalize))
IncomePerDegree <- setNames(IncomePerDegree, "Incomeperdegree")
# removing redundant and insignificant variables from results of pca and adding derived feature
employee.n.m <- employee.n.m[, -c(3, 4, 5, 6, 7, 8, 13, 16, 19, 21, 22, 23, 24, 25, 27, 28, 31)]
employee.n.m <- cbind(employee.n.m, IncomePerDegree)
# splitting up train and test datasets (75% / 25%, partitioned on the response)
index <- createDataPartition(employee.n.m[, 1], p = 0.75, list = FALSE)
employee_train_m <- employee.n.m[index, ]
employee_test_m <- employee.n.m[-index, ]
# verifying model 1 - knn
# Resampling scheme: 10-fold cross-validation repeated 3 times
trctr.m <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Fit kNN on the training split; the response is the first column, which is
# named `employee2$Attrition`, and 10 candidate tuning values are tried
knn_fit_m <- train(
  `employee2$Attrition` ~ .,
  data = employee_train_m,
  method = "knn",
  trControl = trctr.m,
  tuneLength = 10
)
# Predict on the test predictors (response column dropped) and score
pred_knn.m <- predict(object = knn_fit_m, employee_test_m[, -1])
confusionMatrix(pred_knn.m, employee_test_m[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 307 41
## Yes 1 1
##
## Accuracy : 0.88
## 95% CI : (0.8413, 0.9121)
## No Information Rate : 0.88
## P-Value [Acc > NIR] : 0.541
##
## Kappa : 0.0349
## Mcnemar's Test P-Value : 1.768e-09
##
## Sensitivity : 0.99675
## Specificity : 0.02381
## Pos Pred Value : 0.88218
## Neg Pred Value : 0.50000
## Prevalence : 0.88000
## Detection Rate : 0.87714
## Detection Prevalence : 0.99429
## Balanced Accuracy : 0.51028
##
## 'Positive' Class : No
##
# Accuracy remains almost the same,
# although the confusion matrix for the original dataset was slightly different:
#Prediction
#No Yes
#No 306 39
#Yes 2 3