Data Acquisition

IBM Watson Dataset

Project Rubric

Youtube Video Link

# Read the first sheet of the IBM Watson HR attrition workbook; the first
# row holds the column names.
# NOTE(review): read.xlsx() is not defined in this file -- presumably it comes
# from the xlsx package loaded elsewhere; confirm.
employee <- read.xlsx(
  "WA_Fn-UseC_-HR-Employee-Attrition.xlsx",
  sheetIndex = 1,
  header = TRUE
)
# Keep a copy of the data exactly as read; `employee` itself is modified
# later in the analysis.
employee1 <- employee

Data Exploration

# Quick per-column summary of every feature: quantiles and mean for the
# numeric columns, level counts for the factor columns
summary(employee)
##       Age        Attrition            BusinessTravel   DailyRate     
##  Min.   :18.00   No :1233   Non-Travel       : 150   Min.   : 102.0  
##  1st Qu.:30.00   Yes: 237   Travel_Frequently: 277   1st Qu.: 465.0  
##  Median :36.00              Travel_Rarely    :1043   Median : 802.0  
##  Mean   :36.92                                       Mean   : 802.5  
##  3rd Qu.:43.00                                       3rd Qu.:1157.0  
##  Max.   :60.00                                       Max.   :1499.0  
##                                                                      
##                   Department  DistanceFromHome   Education    
##  Human Resources       : 63   Min.   : 1.000   Min.   :1.000  
##  Research & Development:961   1st Qu.: 2.000   1st Qu.:2.000  
##  Sales                 :446   Median : 7.000   Median :3.000  
##                               Mean   : 9.193   Mean   :2.913  
##                               3rd Qu.:14.000   3rd Qu.:4.000  
##                               Max.   :29.000   Max.   :5.000  
##                                                               
##           EducationField EmployeeCount EmployeeNumber  
##  Human Resources : 27    Min.   :1     Min.   :   1.0  
##  Life Sciences   :606    1st Qu.:1     1st Qu.: 491.2  
##  Marketing       :159    Median :1     Median :1020.5  
##  Medical         :464    Mean   :1     Mean   :1024.9  
##  Other           : 82    3rd Qu.:1     3rd Qu.:1555.8  
##  Technical Degree:132    Max.   :1     Max.   :2068.0  
##                                                        
##  EnvironmentSatisfaction    Gender      HourlyRate     JobInvolvement
##  Min.   :1.000           Female:588   Min.   : 30.00   Min.   :1.00  
##  1st Qu.:2.000           Male  :882   1st Qu.: 48.00   1st Qu.:2.00  
##  Median :3.000                        Median : 66.00   Median :3.00  
##  Mean   :2.722                        Mean   : 65.89   Mean   :2.73  
##  3rd Qu.:4.000                        3rd Qu.: 83.75   3rd Qu.:3.00  
##  Max.   :4.000                        Max.   :100.00   Max.   :4.00  
##                                                                      
##     JobLevel                          JobRole    JobSatisfaction
##  Min.   :1.000   Sales Executive          :326   Min.   :1.000  
##  1st Qu.:1.000   Research Scientist       :292   1st Qu.:2.000  
##  Median :2.000   Laboratory Technician    :259   Median :3.000  
##  Mean   :2.064   Manufacturing Director   :145   Mean   :2.729  
##  3rd Qu.:3.000   Healthcare Representative:131   3rd Qu.:4.000  
##  Max.   :5.000   Manager                  :102   Max.   :4.000  
##                  (Other)                  :215                  
##   MaritalStatus MonthlyIncome    MonthlyRate    NumCompaniesWorked
##  Divorced:327   Min.   : 1009   Min.   : 2094   Min.   :0.000     
##  Married :673   1st Qu.: 2911   1st Qu.: 8047   1st Qu.:1.000     
##  Single  :470   Median : 4919   Median :14236   Median :2.000     
##                 Mean   : 6503   Mean   :14313   Mean   :2.693     
##                 3rd Qu.: 8379   3rd Qu.:20462   3rd Qu.:4.000     
##                 Max.   :19999   Max.   :26999   Max.   :9.000     
##                                                                   
##  Over18   OverTime   PercentSalaryHike PerformanceRating
##  Y:1470   No :1054   Min.   :11.00     Min.   :3.000    
##           Yes: 416   1st Qu.:12.00     1st Qu.:3.000    
##                      Median :14.00     Median :3.000    
##                      Mean   :15.21     Mean   :3.154    
##                      3rd Qu.:18.00     3rd Qu.:3.000    
##                      Max.   :25.00     Max.   :4.000    
##                                                         
##  RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
##  Min.   :1.000            Min.   :80    Min.   :0.0000   Min.   : 0.00    
##  1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000   1st Qu.: 6.00    
##  Median :3.000            Median :80    Median :1.0000   Median :10.00    
##  Mean   :2.712            Mean   :80    Mean   :0.7939   Mean   :11.28    
##  3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000   3rd Qu.:15.00    
##  Max.   :4.000            Max.   :80    Max.   :3.0000   Max.   :40.00    
##                                                                           
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         Min.   :1.000   Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         Median :3.000   Median : 5.000   Median : 3.000    
##  Mean   :2.799         Mean   :2.761   Mean   : 7.008   Mean   : 4.229    
##  3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :6.000         Max.   :4.000   Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000      
## 
# Inspect the structure of the data frame: its dimensions plus the type and
# first few values of each feature
str(employee)
## 'data.frame':    1470 obs. of  35 variables:
##  $ Age                     : num  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : num  1102 279 1373 1392 591 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : num  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EmployeeCount           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : num  1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : num  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : num  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ JobSatisfaction         : num  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : num  5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num  19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : num  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : num  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num  5 7 0 0 2 6 0 0 8 7 ...

part a)- exploratory data plots

set.seed(1) # setting seed for reproducibility
# Simplified parallel coordinate plot
# Replace every factor column by its integer level codes so that all
# features are numeric. The column mask is computed once, and vapply()
# guarantees it is a logical vector.
factor.cols <- vapply(employee, is.factor, logical(1))
if (any(factor.cols)) {
  employee[factor.cols] <- data.matrix(employee[factor.cols])
}
# NOTE(review): plotmd() is not defined in this file -- presumably
# EMCluster::plotmd; confirm.
plotmd(
  employee,
  class = NULL,
  main = "Plot showing multivariate data for clusters as the parallel coordinates "
)

# Histogram of monthly income with a fitted normal curve drawn on top.
# The column is referenced by name once instead of mixing `$MonthlyIncome`
# with the positional index 19.
income <- employee$MonthlyIncome
histogram.curve <- hist(
  income,
  breaks = 10,
  col = "purple",
  xlab = "Monthly Income",
  main = "Histogram with Normal Curve"
)
# Normal density with the sample mean and standard deviation, evaluated at
# 40 evenly spaced points across the observed income range
xfit <- seq(min(income), max(income), length.out = 40)
yfit <- dnorm(xfit, mean = mean(income), sd = sd(income))
# Rescale the density to the histogram's count axis:
# density * bin width * number of observations
yfit <- yfit * diff(histogram.curve$mids[1:2]) * length(income)
lines(xfit, yfit, col = "black", lwd = 2)

# The histogram shows positively (right) skewed data: most incomes are low, with a long tail of high earners (mean 6503 > median 4919)
# Scatter plot of PercentSalaryHike against MonthlyIncome, drawn as a grid of
# panels: one row per Attrition value and one column per JobLevel.
# Built from employee1, the copy of the data taken right after reading.
pl <- ggplot(
  data = employee1,
  mapping = aes(x = MonthlyIncome, y = PercentSalaryHike)
) +
  geom_point(shape = 2) +
  ggtitle("Effect of Job Level(1-5), PercentSalaryHike and MonthlyIncome on Attrition(Y/N)")
pl + facet_grid(Attrition ~ JobLevel)

# As expected, employees with a low job level, a small percent salary hike and a low monthly income account for most of the attrition.

part b)- detection of outlier

# Cook's distance from a linear model of Attrition on all other features,
# used to flag influential observations (outliers).
# NOTE(review): Attrition is only numeric here if the earlier factor-to-code
# conversion of `employee` has run; confirm the chunk order when re-running.
set.seed(1)
mod <- lm(Attrition ~ ., data = employee) # model
cooksd <- cooks.distance(mod) # distance per observation
# Cut-off: five times the mean Cook's distance, computed once
cooks.cutoff <- 5 * mean(cooksd, na.rm = TRUE)
# The plotting calls are run as separate statements: each is called for its
# side effect on the graphics device and returns NULL, so piping one into
# the next only worked by accident of argument matching.
plot(cooksd, pch = "*", cex = 2, main = "Outliers using Cooks Distance")
abline(h = cooks.cutoff, col = "black") # cut-off line
# Label only the points above the cut-off, shifted one unit to the right
text(
  x = seq_along(cooksd) + 1,
  y = cooksd,
  labels = ifelse(cooksd > cooks.cutoff, names(cooksd), ""),
  col = "red"
)

# Row numbers of the observations whose Cook's distance exceeds five times
# the mean distance. which() discards NA comparisons, so an undefined
# distance cannot leak an NA into the row numbers.
out.rows <- as.numeric(
  names(cooksd)[which(cooksd > 5 * mean(cooksd, na.rm = TRUE))]
)
out.rows
##  [1]   34   46   90  101  123  127  137  206  211  237  251  287  318  367
## [15]  416  437  441  454  470  483  496  505  569  596  608  609  637  661
## [29]  664  701  707  750  753  780  781  790  797  814  837  914  929  940
## [43]  967 1007 1008 1032 1034 1040 1086 1112 1163 1205 1223 1224 1247 1258
## [57] 1280 1291 1292 1299 1313 1334 1355 1391 1443 1445 1453
# Remove the outlier rows from both copies of the data, as they can create
# unwanted (spurious) significant associations.
# Guard against an empty out.rows: indexing with -numeric(0) selects zero
# rows, which would silently empty both data frames.
if (length(out.rows) > 0) {
  employee <- employee[-out.rows, ]
  employee1 <- employee1[-out.rows, ]
}

part c) correlation/collinearity analysis

# Correlation matrix of all features.
# EmployeeCount, Over18 and StandardHours (columns 9, 22 and 27) hold a
# single value each, so their correlations would be NA. They are excluded
# by name rather than by position so the code survives a column reordering.
constant.cols <- c("EmployeeCount", "Over18", "StandardHours")
corr <- cor(employee[, !(names(employee) %in% constant.cols)])
corr
##                                    Age    Attrition BusinessTravel
## Age                       1.000000e+00 -0.241235226    0.019636445
## Attrition                -2.412352e-01  1.000000000    0.002098248
## BusinessTravel            1.963644e-02  0.002098248    1.000000000
## DailyRate                 2.310476e-02 -0.058319995   -0.000815657
## Department               -4.201805e-02  0.085492709   -0.007634083
## DistanceFromHome          6.266418e-03  0.068354557   -0.017803361
## Education                 2.146926e-01 -0.038796966    0.002908125
## EducationField           -4.480291e-02  0.029848172    0.015811896
## EmployeeNumber           -9.702243e-03 -0.022634629   -0.016431969
## EnvironmentSatisfaction   1.469162e-02 -0.120241888    0.002453639
## Gender                   -4.122449e-02  0.039492323   -0.035947443
## HourlyRate                1.613261e-02  0.002572074    0.041196029
## JobInvolvement            3.871854e-02 -0.141030728    0.045739456
## JobLevel                  5.239563e-01 -0.219497700    0.020296950
## JobRole                  -1.322736e-01  0.094286680    0.002831521
## JobSatisfaction          -7.774664e-03 -0.114204328   -0.039164384
## MaritalStatus            -1.064525e-01  0.208359011    0.023416102
## MonthlyIncome             5.088638e-01 -0.198923117    0.034144215
## MonthlyRate               3.016012e-02  0.014796272   -0.011028878
## NumCompaniesWorked        2.967921e-01  0.014202358    0.024705717
## OverTime                  2.287736e-02  0.294476347    0.023679845
## PercentSalaryHike         9.864678e-04 -0.003293167   -0.038124864
## PerformanceRating         1.583229e-05  0.015586698   -0.030986915
## RelationshipSatisfaction  3.610487e-02 -0.051456831   -0.034317569
## StockOptionLevel          3.677617e-02 -0.186496581   -0.019275490
## TotalWorkingYears         6.880815e-01 -0.240099771    0.030888134
## TrainingTimesLastYear    -1.941224e-02 -0.040992900    0.014250344
## WorkLifeBalance          -2.427393e-02 -0.056972270   -0.023709286
## YearsAtCompany            3.050558e-01 -0.203731125   -0.028113293
## YearsInCurrentRole        2.107093e-01 -0.207953876   -0.023266247
## YearsSinceLastPromotion   2.082714e-01 -0.088742382   -0.040261666
## YearsWithCurrManager      2.037952e-01 -0.198888529   -0.029797694
##                              DailyRate   Department DistanceFromHome
## Age                       0.0231047610 -0.042018047      0.006266418
## Attrition                -0.0583199950  0.085492709      0.068354557
## BusinessTravel           -0.0008156570 -0.007634083     -0.017803361
## DailyRate                 1.0000000000  0.002029361     -0.003525394
## Department                0.0020293615  1.000000000      0.024209259
## DistanceFromHome         -0.0035253940  0.024209259      1.000000000
## Education                -0.0189777008  0.016452257      0.022888165
## EducationField            0.0316413282  0.006454538      0.011384844
## EmployeeNumber           -0.0549409525 -0.005473843      0.036025057
## EnvironmentSatisfaction   0.0291488963 -0.018813324     -0.015044993
## Gender                   -0.0190357932 -0.044979171      0.010758566
## HourlyRate                0.0223223491 -0.002801767      0.026240816
## JobInvolvement            0.0472827141 -0.021061850      0.003896336
## JobLevel                 -0.0008977702  0.088549690      0.008434338
## JobRole                  -0.0159384835  0.656711843      0.009654757
## JobSatisfaction           0.0347407676  0.014461880      0.003890915
## MaritalStatus            -0.0804617141  0.051451243     -0.015707123
## MonthlyIncome             0.0061972952  0.042057830     -0.010942523
## MonthlyRate              -0.0324882144  0.019866429      0.029712236
## NumCompaniesWorked        0.0372170524 -0.030122220     -0.019232157
## OverTime                  0.0087315802  0.020033010      0.029637371
## PercentSalaryHike         0.0262917302 -0.009466719      0.033048239
## PerformanceRating         0.0031410418 -0.023319523      0.020795807
## RelationshipSatisfaction  0.0157215199 -0.023629422      0.008668048
## StockOptionLevel          0.0547204997 -0.015568682      0.044635608
## TotalWorkingYears         0.0200988352 -0.026826131      0.010349834
## TrainingTimesLastYear     0.0163429387  0.022975228     -0.030464697
## WorkLifeBalance          -0.0481454260  0.029358048     -0.014495145
## YearsAtCompany           -0.0250310357  0.018213796      0.019387322
## YearsInCurrentRole        0.0179730236  0.048258092      0.028096643
## YearsSinceLastPromotion  -0.0215009506  0.035529507      0.014560296
## YearsWithCurrManager     -0.0225905812  0.027870683      0.019781222
##                              Education EducationField EmployeeNumber
## Age                       0.2146926081  -0.0448029097   -0.009702243
## Attrition                -0.0387969657   0.0298481715   -0.022634629
## BusinessTravel            0.0029081248   0.0158118956   -0.016431969
## DailyRate                -0.0189777008   0.0316413282   -0.054940953
## Department                0.0164522571   0.0064545383   -0.005473843
## DistanceFromHome          0.0228881650   0.0113848442    0.036025057
## Education                 1.0000000000  -0.0336744731    0.045101638
## EducationField           -0.0336744731   1.0000000000   -0.001623660
## EmployeeNumber            0.0451016375  -0.0016236599    1.000000000
## EnvironmentSatisfaction  -0.0238703063   0.0386173604    0.014450765
## Gender                   -0.0144213401  -0.0075878311    0.020857824
## HourlyRate                0.0123159988  -0.0144524940    0.023628316
## JobInvolvement            0.0269079421  -0.0013451940   -0.009192887
## JobLevel                  0.1023230535  -0.0506793463   -0.013019091
## JobRole                  -0.0008458784   0.0135761763   -0.002640955
## JobSatisfaction          -0.0006136880  -0.0521276121   -0.046512690
## MaritalStatus             0.0053035230   0.0079194775   -0.010995947
## MonthlyIncome             0.0958874368  -0.0463107472   -0.008861990
## MonthlyRate              -0.0306881444  -0.0225801068    0.019077097
## NumCompaniesWorked        0.1313135568  -0.0025932950   -0.007102560
## OverTime                 -0.0234470121   0.0105761273   -0.021695289
## PercentSalaryHike        -0.0120362978  -0.0062307376   -0.005568529
## PerformanceRating        -0.0229583541   0.0003541062   -0.016418022
## RelationshipSatisfaction -0.0151478121   0.0000697670   -0.067313962
## StockOptionLevel          0.0206305626  -0.0065788492    0.060263738
## TotalWorkingYears         0.1463242155  -0.0334772543   -0.010257302
## TrainingTimesLastYear    -0.0248864201   0.0482077581    0.029429843
## WorkLifeBalance           0.0102399469   0.0414885831    0.014637932
## YearsAtCompany            0.0660656281  -0.0283056386   -0.010669690
## YearsInCurrentRole        0.0676415247  -0.0203814956   -0.007081814
## YearsSinceLastPromotion   0.0538988593  -0.0046341349   -0.004174551
## YearsWithCurrManager      0.0696125207  -0.0102769855   -0.011989551
##                          EnvironmentSatisfaction       Gender
## Age                                 0.0146916164 -0.041224486
## Attrition                          -0.1202418880  0.039492323
## BusinessTravel                      0.0024536391 -0.035947443
## DailyRate                           0.0291488963 -0.019035793
## Department                         -0.0188133241 -0.044979171
## DistanceFromHome                   -0.0150449928  0.010758566
## Education                          -0.0238703063 -0.014421340
## EducationField                      0.0386173604 -0.007587831
## EmployeeNumber                      0.0144507652  0.020857824
## EnvironmentSatisfaction             1.0000000000 -0.001696106
## Gender                             -0.0016961060  1.000000000
## HourlyRate                         -0.0498142258  0.001655487
## JobInvolvement                     -0.0136146280  0.024109178
## JobLevel                            0.0124521190 -0.035010291
## JobRole                            -0.0231222962 -0.044404301
## JobSatisfaction                    -0.0031327988  0.034341907
## MaritalStatus                      -0.0048023120 -0.045677815
## MonthlyIncome                       0.0045789639 -0.027848933
## MonthlyRate                         0.0415924990 -0.055934891
## NumCompaniesWorked                  0.0213068457 -0.056162873
## OverTime                            0.0703015173 -0.039197537
## PercentSalaryHike                  -0.0378422520  0.007508565
## PerformanceRating                  -0.0317644462 -0.011998275
## RelationshipSatisfaction            0.0064354574  0.021074097
## StockOptionLevel                    0.0211301706  0.016718935
## TotalWorkingYears                   0.0019383244 -0.043684637
## TrainingTimesLastYear              -0.0256852358 -0.052455306
## WorkLifeBalance                     0.0506216914 -0.003991358
## YearsAtCompany                      0.0001709952 -0.015745729
## YearsInCurrentRole                  0.0194891303 -0.027327397
## YearsSinceLastPromotion             0.0070097149 -0.019356602
## YearsWithCurrManager               -0.0076318203 -0.017030727
##                             HourlyRate JobInvolvement      JobLevel
## Age                       0.0161326138    0.038718542  0.5239562560
## Attrition                 0.0025720743   -0.141030728 -0.2194977004
## BusinessTravel            0.0411960286    0.045739456  0.0202969498
## DailyRate                 0.0223223491    0.047282714 -0.0008977702
## Department               -0.0028017665   -0.021061850  0.0885496898
## DistanceFromHome          0.0262408156    0.003896336  0.0084343378
## Education                 0.0123159988    0.026907942  0.1023230535
## EducationField           -0.0144524940   -0.001345194 -0.0506793463
## EmployeeNumber            0.0236283161   -0.009192887 -0.0130190913
## EnvironmentSatisfaction  -0.0498142258   -0.013614628  0.0124521190
## Gender                    0.0016554869    0.024109178 -0.0350102905
## HourlyRate                1.0000000000    0.035934307 -0.0358828684
## JobInvolvement            0.0359343073    1.000000000 -0.0017770047
## JobLevel                 -0.0358828684   -0.001777005  1.0000000000
## JobRole                  -0.0133158468    0.004715372 -0.0925911338
## JobSatisfaction          -0.0620497153   -0.012133053  0.0001012888
## MaritalStatus            -0.0234166488   -0.045352277 -0.0922862704
## MonthlyIncome            -0.0197716482   -0.004188563  0.9504530481
## MonthlyRate              -0.0075871031   -0.013528111  0.0305521165
## NumCompaniesWorked        0.0336919452    0.018504978  0.1488872115
## OverTime                 -0.0092643281   -0.010960646  0.0022840840
## PercentSalaryHike        -0.0055634040   -0.025049993 -0.0266438988
## PerformanceRating         0.0027413009   -0.035838034 -0.0147970990
## RelationshipSatisfaction -0.0001790547    0.034570544  0.0118489146
## StockOptionLevel          0.0559621563    0.019529748  0.0199108184
## TotalWorkingYears        -0.0069644058    0.004595564  0.7888010566
## TrainingTimesLastYear    -0.0159184366   -0.020924551 -0.0131568752
## WorkLifeBalance          -0.0039903119   -0.026534673  0.0374544986
## YearsAtCompany           -0.0258502546   -0.005085375  0.5322274942
## YearsInCurrentRole       -0.0191355168    0.015949201  0.3813053410
## YearsSinceLastPromotion  -0.0225901106   -0.024658219  0.3555828341
## YearsWithCurrManager     -0.0178939956    0.035292723  0.3747403236
##                                JobRole JobSatisfaction MaritalStatus
## Age                      -0.1322736333   -0.0077746644  -0.106452490
## Attrition                 0.0942866802   -0.1142043276   0.208359011
## BusinessTravel            0.0028315211   -0.0391643840   0.023416102
## DailyRate                -0.0159384835    0.0347407676  -0.080461714
## Department                0.6567118435    0.0144618800   0.051451243
## DistanceFromHome          0.0096547571    0.0038909155  -0.015707123
## Education                -0.0008458784   -0.0006136880   0.005303523
## EducationField            0.0135761763   -0.0521276121   0.007919478
## EmployeeNumber           -0.0026409545   -0.0465126897  -0.010995947
## EnvironmentSatisfaction  -0.0231222962   -0.0031327988  -0.004802312
## Gender                   -0.0444043008    0.0343419072  -0.045677815
## HourlyRate               -0.0133158468   -0.0620497153  -0.023416649
## JobInvolvement            0.0047153716   -0.0121330533  -0.045352277
## JobLevel                 -0.0925911338    0.0001012888  -0.092286270
## JobRole                   1.0000000000    0.0167890512   0.067907097
## JobSatisfaction           0.0167890512    1.0000000000   0.027825252
## MaritalStatus             0.0679070967    0.0278252519   1.000000000
## MonthlyIncome            -0.0992000792   -0.0058601375  -0.089279698
## MonthlyRate               0.0001118104   -0.0042075978   0.027429678
## NumCompaniesWorked       -0.0568287829   -0.0524370651  -0.021433148
## OverTime                  0.0465877563    0.0311789723  -0.012049470
## PercentSalaryHike        -0.0028575848    0.0309858821   0.005476102
## PerformanceRating        -0.0207639261    0.0097129903   0.004796253
## RelationshipSatisfaction -0.0166132878   -0.0155536785   0.029975223
## StockOptionLevel         -0.0259231089    0.0006706405  -0.672821557
## TotalWorkingYears        -0.1507431251   -0.0201923394  -0.087269456
## TrainingTimesLastYear    -0.0122381451   -0.0096470558   0.007328126
## WorkLifeBalance           0.0318996304   -0.0286460096   0.011755395
## YearsAtCompany           -0.0785332701   -0.0090430452  -0.074421767
## YearsInCurrentRole       -0.0248936788   -0.0013594036  -0.081654023
## YearsSinceLastPromotion  -0.0502783241   -0.0261099023  -0.042581170
## YearsWithCurrManager     -0.0405299009   -0.0219930738  -0.055524457
##                          MonthlyIncome   MonthlyRate NumCompaniesWorked
## Age                        0.508863763  0.0301601239        0.296792079
## Attrition                 -0.198923117  0.0147962718        0.014202358
## BusinessTravel             0.034144215 -0.0110288776        0.024705717
## DailyRate                  0.006197295 -0.0324882144        0.037217052
## Department                 0.042057830  0.0198664291       -0.030122220
## DistanceFromHome          -0.010942523  0.0297122359       -0.019232157
## Education                  0.095887437 -0.0306881444        0.131313557
## EducationField            -0.046310747 -0.0225801068       -0.002593295
## EmployeeNumber            -0.008861990  0.0190770967       -0.007102560
## EnvironmentSatisfaction    0.004578964  0.0415924990        0.021306846
## Gender                    -0.027848933 -0.0559348910       -0.056162873
## HourlyRate                -0.019771648 -0.0075871031        0.033691945
## JobInvolvement            -0.004188563 -0.0135281109        0.018504978
## JobLevel                   0.950453048  0.0305521165        0.148887211
## JobRole                   -0.099200079  0.0001118104       -0.056828783
## JobSatisfaction           -0.005860137 -0.0042075978       -0.052437065
## MaritalStatus             -0.089279698  0.0274296775       -0.021433148
## MonthlyIncome              1.000000000  0.0260869043        0.153465709
## MonthlyRate                0.026086904  1.0000000000        0.014641404
## NumCompaniesWorked         0.153465709  0.0146414037        1.000000000
## OverTime                   0.003715759  0.0187146490       -0.034688388
## PercentSalaryHike         -0.020461847 -0.0125556313       -0.005704167
## PerformanceRating         -0.010667695 -0.0232817107       -0.016408194
## RelationshipSatisfaction   0.014833942 -0.0051338443        0.045151771
## StockOptionLevel           0.011418110 -0.0287286714        0.007190490
## TotalWorkingYears          0.778051117  0.0222044642        0.240701974
## TrainingTimesLastYear     -0.015742666  0.0014956357       -0.063375104
## WorkLifeBalance            0.026783146  0.0059768742       -0.016092762
## YearsAtCompany             0.509622233 -0.0242591182       -0.128561053
## YearsInCurrentRole         0.352950217 -0.0136375516       -0.093938392
## YearsSinceLastPromotion    0.344207352  0.0030161680       -0.044193769
## YearsWithCurrManager       0.342195690 -0.0305885965       -0.111025318
##                              OverTime PercentSalaryHike PerformanceRating
## Age                       0.022877358      0.0009864678      1.583229e-05
## Attrition                 0.294476347     -0.0032931669      1.558670e-02
## BusinessTravel            0.023679845     -0.0381248643     -3.098692e-02
## DailyRate                 0.008731580      0.0262917302      3.141042e-03
## Department                0.020033010     -0.0094667189     -2.331952e-02
## DistanceFromHome          0.029637371      0.0330482392      2.079581e-02
## Education                -0.023447012     -0.0120362978     -2.295835e-02
## EducationField            0.010576127     -0.0062307376      3.541062e-04
## EmployeeNumber           -0.021695289     -0.0055685294     -1.641802e-02
## EnvironmentSatisfaction   0.070301517     -0.0378422520     -3.176445e-02
## Gender                   -0.039197537      0.0075085645     -1.199827e-02
## HourlyRate               -0.009264328     -0.0055634040      2.741301e-03
## JobInvolvement           -0.010960646     -0.0250499926     -3.583803e-02
## JobLevel                  0.002284084     -0.0266438988     -1.479710e-02
## JobRole                   0.046587756     -0.0028575848     -2.076393e-02
## JobSatisfaction           0.031178972      0.0309858821      9.712990e-03
## MaritalStatus            -0.012049470      0.0054761017      4.796253e-03
## MonthlyIncome             0.003715759     -0.0204618467     -1.066770e-02
## MonthlyRate               0.018714649     -0.0125556313     -2.328171e-02
## NumCompaniesWorked       -0.034688388     -0.0057041671     -1.640819e-02
## OverTime                  1.000000000     -0.0061237227      5.055235e-03
## PercentSalaryHike        -0.006123723      1.0000000000      7.752059e-01
## PerformanceRating         0.005055235      0.7752058871      1.000000e+00
## RelationshipSatisfaction  0.048662479     -0.0401537684     -3.397377e-02
## StockOptionLevel         -0.011625668      0.0138140123      3.415102e-03
## TotalWorkingYears         0.007939681     -0.0129033951      9.577344e-03
## TrainingTimesLastYear    -0.073015320     -0.0113671248     -1.566635e-02
## WorkLifeBalance          -0.031521995     -0.0087864625     -2.909185e-03
## YearsAtCompany           -0.007088169     -0.0261976925      1.387656e-02
## YearsInCurrentRole       -0.028518772      0.0063355128      4.155506e-02
## YearsSinceLastPromotion  -0.010481661     -0.0064953485      3.403031e-02
## YearsWithCurrManager     -0.042310383     -0.0092123602      2.500311e-02
##                          RelationshipSatisfaction StockOptionLevel
## Age                                  0.0361048738     0.0367761664
## Attrition                           -0.0514568313    -0.1864965809
## BusinessTravel                      -0.0343175689    -0.0192754897
## DailyRate                            0.0157215199     0.0547204997
## Department                          -0.0236294221    -0.0155686822
## DistanceFromHome                     0.0086680478     0.0446356078
## Education                           -0.0151478121     0.0206305626
## EducationField                       0.0000697670    -0.0065788492
## EmployeeNumber                      -0.0673139618     0.0602637376
## EnvironmentSatisfaction              0.0064354574     0.0211301706
## Gender                               0.0210740975     0.0167189350
## HourlyRate                          -0.0001790547     0.0559621563
## JobInvolvement                       0.0345705438     0.0195297484
## JobLevel                             0.0118489146     0.0199108184
## JobRole                             -0.0166132878    -0.0259231089
## JobSatisfaction                     -0.0155536785     0.0006706405
## MaritalStatus                        0.0299752234    -0.6728215570
## MonthlyIncome                        0.0148339422     0.0114181100
## MonthlyRate                         -0.0051338443    -0.0287286714
## NumCompaniesWorked                   0.0451517708     0.0071904904
## OverTime                             0.0486624787    -0.0116256675
## PercentSalaryHike                   -0.0401537684     0.0138140123
## PerformanceRating                   -0.0339737731     0.0034151022
## RelationshipSatisfaction             1.0000000000    -0.0479248925
## StockOptionLevel                    -0.0479248925     1.0000000000
## TotalWorkingYears                    0.0117373620     0.0146367497
## TrainingTimesLastYear                0.0061291010     0.0086376356
## WorkLifeBalance                      0.0237329952    -0.0107843273
## YearsAtCompany                       0.0110440046     0.0243968665
## YearsInCurrentRole                  -0.0219658801     0.0665099557
## YearsSinceLastPromotion              0.0278727903     0.0271476406
## YearsWithCurrManager                 0.0035068027     0.0337954615
##                          TotalWorkingYears TrainingTimesLastYear
## Age                           0.6880815297          -0.019412238
## Attrition                    -0.2400997709          -0.040992900
## BusinessTravel                0.0308881340           0.014250344
## DailyRate                     0.0200988352           0.016342939
## Department                   -0.0268261305           0.022975228
## DistanceFromHome              0.0103498343          -0.030464697
## Education                     0.1463242155          -0.024886420
## EducationField               -0.0334772543           0.048207758
## EmployeeNumber               -0.0102573015           0.029429843
## EnvironmentSatisfaction       0.0019383244          -0.025685236
## Gender                       -0.0436846367          -0.052455306
## HourlyRate                   -0.0069644058          -0.015918437
## JobInvolvement                0.0045955642          -0.020924551
## JobLevel                      0.7888010566          -0.013156875
## JobRole                      -0.1507431251          -0.012238145
## JobSatisfaction              -0.0201923394          -0.009647056
## MaritalStatus                -0.0872694565           0.007328126
## MonthlyIncome                 0.7780511171          -0.015742666
## MonthlyRate                   0.0222044642           0.001495636
## NumCompaniesWorked            0.2407019744          -0.063375104
## OverTime                      0.0079396814          -0.073015320
## PercentSalaryHike            -0.0129033951          -0.011367125
## PerformanceRating             0.0095773439          -0.015666347
## RelationshipSatisfaction      0.0117373620           0.006129101
## StockOptionLevel              0.0146367497           0.008637636
## TotalWorkingYears             1.0000000000          -0.029071802
## TrainingTimesLastYear        -0.0290718016           1.000000000
## WorkLifeBalance              -0.0002674499           0.023441833
## YearsAtCompany                0.6163595405           0.008867461
## YearsInCurrentRole            0.4540586911           0.003822341
## YearsSinceLastPromotion       0.3970743736           0.007722920
## YearsWithCurrManager          0.4601606175           0.002206372
##                          WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Age                        -0.0242739281   0.3050557627        0.210709268
## Attrition                  -0.0569722697  -0.2037311255       -0.207953876
## BusinessTravel             -0.0237092855  -0.0281132930       -0.023266247
## DailyRate                  -0.0481454260  -0.0250310357        0.017973024
## Department                  0.0293580481   0.0182137956        0.048258092
## DistanceFromHome           -0.0144951446   0.0193873224        0.028096643
## Education                   0.0102399469   0.0660656281        0.067641525
## EducationField              0.0414885831  -0.0283056386       -0.020381496
## EmployeeNumber              0.0146379323  -0.0106696896       -0.007081814
## EnvironmentSatisfaction     0.0506216914   0.0001709952        0.019489130
## Gender                     -0.0039913579  -0.0157457294       -0.027327397
## HourlyRate                 -0.0039903119  -0.0258502546       -0.019135517
## JobInvolvement             -0.0265346733  -0.0050853745        0.015949201
## JobLevel                    0.0374544986   0.5322274942        0.381305341
## JobRole                     0.0318996304  -0.0785332701       -0.024893679
## JobSatisfaction            -0.0286460096  -0.0090430452       -0.001359404
## MaritalStatus               0.0117553950  -0.0744217666       -0.081654023
## MonthlyIncome               0.0267831462   0.5096222334        0.352950217
## MonthlyRate                 0.0059768742  -0.0242591182       -0.013637552
## NumCompaniesWorked         -0.0160927618  -0.1285610532       -0.093938392
## OverTime                   -0.0315219945  -0.0070881690       -0.028518772
## PercentSalaryHike          -0.0087864625  -0.0261976925        0.006335513
## PerformanceRating          -0.0029091848   0.0138765650        0.041555057
## RelationshipSatisfaction    0.0237329952   0.0110440046       -0.021965880
## StockOptionLevel           -0.0107843273   0.0243968665        0.066509956
## TotalWorkingYears          -0.0002674499   0.6163595405        0.454058691
## TrainingTimesLastYear       0.0234418326   0.0088674608        0.003822341
## WorkLifeBalance             1.0000000000   0.0178191280        0.046796548
## YearsAtCompany              0.0178191280   1.0000000000        0.760942474
## YearsInCurrentRole          0.0467965479   0.7609424740        1.000000000
## YearsSinceLastPromotion     0.0199432436   0.6131197286        0.538446283
## YearsWithCurrManager        0.0097478105   0.7803833444        0.717532190
##                          YearsSinceLastPromotion YearsWithCurrManager
## Age                                  0.208271410          0.203795174
## Attrition                           -0.088742382         -0.198888529
## BusinessTravel                      -0.040261666         -0.029797694
## DailyRate                           -0.021500951         -0.022590581
## Department                           0.035529507          0.027870683
## DistanceFromHome                     0.014560296          0.019781222
## Education                            0.053898859          0.069612521
## EducationField                      -0.004634135         -0.010276986
## EmployeeNumber                      -0.004174551         -0.011989551
## EnvironmentSatisfaction              0.007009715         -0.007631820
## Gender                              -0.019356602         -0.017030727
## HourlyRate                          -0.022590111         -0.017893996
## JobInvolvement                      -0.024658219          0.035292723
## JobLevel                             0.355582834          0.374740324
## JobRole                             -0.050278324         -0.040529901
## JobSatisfaction                     -0.026109902         -0.021993074
## MaritalStatus                       -0.042581170         -0.055524457
## MonthlyIncome                        0.344207352          0.342195690
## MonthlyRate                          0.003016168         -0.030588596
## NumCompaniesWorked                  -0.044193769         -0.111025318
## OverTime                            -0.010481661         -0.042310383
## PercentSalaryHike                   -0.006495348         -0.009212360
## PerformanceRating                    0.034030310          0.025003111
## RelationshipSatisfaction             0.027872790          0.003506803
## StockOptionLevel                     0.027147641          0.033795462
## TotalWorkingYears                    0.397074374          0.460160618
## TrainingTimesLastYear                0.007722920          0.002206372
## WorkLifeBalance                      0.019943244          0.009747810
## YearsAtCompany                       0.613119729          0.780383344
## YearsInCurrentRole                   0.538446283          0.717532190
## YearsSinceLastPromotion              1.000000000          0.507459792
## YearsWithCurrManager                 0.507459792          1.000000000
# Collinearity: draw the correlation matrix `corr` as a corrgram, with shaded
# cells in the lower triangle and pie glyphs in the upper triangle.
corrgram(
  corr,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie
)

# The plot shows that there are a lot of irrelevant features, so they need to
# be removed before building the classification models.
# Dropping the features that hold the same value in every case; they are
# selected by column position (9, 22 and 27).
employee1 <- employee1[, -c(9, 22, 27)]
# Working copy used for the missing-value experiment below.
employee2 <- employee1

Data Cleaning & Shaping

Part a) - Imputation of missing values

sum(is.na(employee2))
## [1] 0
# No NAs in the dataset, so missing values are injected artificially below.
# Creating 117 random NAs
set.seed(1)
n_missing <- 117
# Draw random (row, col) cell coordinates. Rows and columns are sampled
# independently with replacement, so two draws could land on the same cell and
# produce fewer than n_missing NAs; the count is verified after the assignment.
y <- data.frame(
  row = sample(nrow(employee2), size = n_missing, replace = TRUE),
  col = sample(ncol(employee2), size = n_missing, replace = TRUE)
)
# A two-column matrix index addresses exactly one cell per (row, col) pair.
employee2[as.matrix(y)] <- NA
sum(is.na(employee2)) # verifying
## [1] 117
# Looking at the pattern of NAs generated in dataset
md.pattern(employee2)
##      Age PercentSalaryHike TotalWorkingYears Attrition Department
## 1289   1                 1                 1         1          1
##    2   1                 1                 1         0          1
##    3   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    2   1                 1                 1         1          0
##    5   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    6   1                 1                 1         1          1
##    4   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    1   1                 1                 1         1          1
##    3   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##   10   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    6   1                 1                 1         1          1
##    1   1                 0                 1         1          1
##    2   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    4   1                 1                 1         1          1
##    1   1                 1                 0         1          1
##    3   1                 1                 1         1          1
##    3   1                 1                 1         1          1
##    6   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    5   1                 1                 1         1          1
##    2   1                 1                 1         1          1
##    1   1                 1                 1         1          1
##    1   1                 1                 1         1          1
##    1   1                 1                 1         1          1
##        0                 1                 1         2          2
##      EducationField HourlyRate JobLevel JobSatisfaction PerformanceRating
## 1289              1          1        1               1                 1
##    2              1          1        1               1                 1
##    3              1          1        1               1                 1
##    5              1          1        1               1                 1
##    2              1          1        1               1                 1
##    5              1          1        1               1                 1
##    5              1          1        1               1                 1
##    2              0          1        1               1                 1
##    6              1          1        1               1                 1
##    4              1          1        1               1                 1
##    5              1          1        1               1                 1
##    1              1          0        1               1                 1
##    3              1          1        1               1                 1
##    2              1          1        0               1                 1
##    5              1          1        1               1                 1
##    2              1          1        1               0                 1
##   10              1          1        1               1                 1
##    2              1          1        1               1                 1
##    2              1          1        1               1                 1
##    5              1          1        1               1                 1
##    6              1          1        1               1                 1
##    1              1          1        1               1                 1
##    2              1          1        1               1                 0
##    5              1          1        1               1                 1
##    4              1          1        1               1                 1
##    1              1          1        1               1                 1
##    3              1          1        1               1                 1
##    3              1          1        1               1                 1
##    6              1          1        1               1                 1
##    2              1          1        1               1                 1
##    5              1          1        1               1                 1
##    2              1          1        1               1                 1
##    1              1          0        1               1                 1
##    1              1          1        1               1                 1
##    1              1          1        1               1                 1
##                   2          2        2               2                 2
##      YearsWithCurrManager BusinessTravel JobInvolvement MonthlyIncome
## 1289                    1              1              1             1
##    2                    1              1              1             1
##    3                    1              0              1             1
##    5                    1              1              1             1
##    2                    1              1              1             1
##    5                    1              1              1             1
##    5                    1              1              1             1
##    2                    1              1              1             1
##    6                    1              1              1             1
##    4                    1              1              1             1
##    5                    1              1              1             1
##    1                    1              1              1             1
##    3                    1              1              0             1
##    2                    1              1              1             1
##    5                    1              1              1             1
##    2                    1              1              1             1
##   10                    1              1              1             1
##    2                    1              1              1             0
##    2                    1              1              1             1
##    5                    1              1              1             1
##    6                    1              1              1             1
##    1                    1              1              1             1
##    2                    1              1              1             1
##    5                    1              1              1             1
##    4                    1              1              1             1
##    1                    1              1              1             1
##    3                    1              1              1             1
##    3                    1              1              1             1
##    6                    1              1              1             1
##    2                    1              1              1             1
##    5                    1              1              1             1
##    2                    0              1              1             1
##    1                    1              1              1             1
##    1                    1              1              1             0
##    1                    1              1              1             1
##                         2              3              3             3
##      MonthlyRate TrainingTimesLastYear WorkLifeBalance YearsInCurrentRole
## 1289           1                     1               1                  1
##    2           1                     1               1                  1
##    3           1                     1               1                  1
##    5           1                     1               1                  1
##    2           1                     1               1                  1
##    5           1                     1               1                  1
##    5           1                     1               1                  1
##    2           1                     1               1                  1
##    6           1                     1               1                  1
##    4           1                     1               1                  1
##    5           1                     1               1                  1
##    1           1                     1               1                  1
##    3           1                     1               1                  1
##    2           1                     1               1                  1
##    5           1                     1               1                  1
##    2           1                     1               1                  1
##   10           1                     1               1                  1
##    2           1                     1               1                  1
##    2           0                     1               1                  1
##    5           1                     1               1                  1
##    6           1                     1               1                  1
##    1           1                     1               1                  1
##    2           1                     1               1                  1
##    5           1                     1               1                  1
##    4           1                     1               1                  1
##    1           1                     1               1                  1
##    3           1                     0               1                  1
##    3           1                     1               0                  1
##    6           1                     1               1                  1
##    2           1                     1               1                  0
##    5           1                     1               1                  1
##    2           1                     1               1                  1
##    1           1                     1               1                  1
##    1           0                     1               1                  1
##    1           1                     1               1                  0
##                3                     3               3                  3
##      EnvironmentSatisfaction StockOptionLevel DailyRate DistanceFromHome
## 1289                       1                1         1                1
##    2                       1                1         1                1
##    3                       1                1         1                1
##    5                       1                1         0                1
##    2                       1                1         1                1
##    5                       1                1         1                0
##    5                       1                1         1                1
##    2                       1                1         1                1
##    6                       1                1         1                1
##    4                       0                1         1                1
##    5                       1                1         1                1
##    1                       1                1         1                1
##    3                       1                1         1                1
##    2                       1                1         1                1
##    5                       1                1         1                1
##    2                       1                1         1                1
##   10                       1                1         1                1
##    2                       1                1         1                1
##    2                       1                1         1                1
##    5                       1                1         1                1
##    6                       1                1         1                1
##    1                       1                1         1                1
##    2                       1                1         1                1
##    5                       1                1         1                1
##    4                       1                0         1                1
##    1                       1                1         1                1
##    3                       1                1         1                1
##    3                       1                1         1                1
##    6                       1                1         1                1
##    2                       1                1         1                1
##    5                       1                1         1                1
##    2                       1                1         1                1
##    1                       1                1         1                1
##    1                       1                1         1                1
##    1                       1                1         1                1
##                            4                4         5                5
##      Education JobRole NumCompaniesWorked RelationshipSatisfaction
## 1289         1       1                  1                        1
##    2         1       1                  1                        1
##    3         1       1                  1                        1
##    5         1       1                  1                        1
##    2         1       1                  1                        1
##    5         1       1                  1                        1
##    5         0       1                  1                        1
##    2         1       1                  1                        1
##    6         1       1                  1                        1
##    4         1       1                  1                        1
##    5         1       1                  1                        1
##    1         1       1                  1                        1
##    3         1       1                  1                        1
##    2         1       1                  1                        1
##    5         1       0                  1                        1
##    2         1       1                  1                        1
##   10         1       1                  1                        1
##    2         1       1                  1                        1
##    2         1       1                  1                        1
##    5         1       1                  0                        1
##    6         1       1                  1                        1
##    1         1       1                  1                        1
##    2         1       1                  1                        1
##    5         1       1                  1                        0
##    4         1       1                  1                        1
##    1         1       1                  1                        1
##    3         1       1                  1                        1
##    3         1       1                  1                        1
##    6         1       1                  1                        1
##    2         1       1                  1                        1
##    5         1       1                  1                        1
##    2         1       1                  1                        1
##    1         1       1                  1                        1
##    1         1       1                  1                        1
##    1         1       1                  1                        1
##              5       5                  5                        5
##      YearsSinceLastPromotion Gender OverTime YearsAtCompany EmployeeNumber
## 1289                       1      1        1              1              1
##    2                       1      1        1              1              1
##    3                       1      1        1              1              1
##    5                       1      1        1              1              1
##    2                       1      1        1              1              1
##    5                       1      1        1              1              1
##    5                       1      1        1              1              1
##    2                       1      1        1              1              1
##    6                       1      1        1              1              0
##    4                       1      1        1              1              1
##    5                       1      0        1              1              1
##    1                       1      1        1              1              1
##    3                       1      1        1              1              1
##    2                       1      1        1              1              1
##    5                       1      1        1              1              1
##    2                       1      1        1              1              1
##   10                       1      1        1              1              1
##    2                       1      1        1              1              1
##    2                       1      1        1              1              1
##    5                       1      1        1              1              1
##    6                       1      1        0              1              1
##    1                       1      1        1              1              1
##    2                       1      1        1              1              1
##    5                       1      1        1              1              1
##    4                       1      1        1              1              1
##    1                       1      1        1              1              1
##    3                       1      1        1              1              1
##    3                       1      1        1              1              1
##    6                       1      1        1              0              1
##    2                       1      1        1              1              1
##    5                       0      1        1              1              1
##    2                       1      1        1              1              1
##    1                       1      1        1              1              0
##    1                       1      1        1              1              1
##    1                       1      0        1              1              1
##                            5      6        6              6              7
##      MaritalStatus    
## 1289             1   0
##    2             1   1
##    3             1   1
##    5             1   1
##    2             1   1
##    5             1   1
##    5             1   1
##    2             1   1
##    6             1   1
##    4             1   1
##    5             1   1
##    1             1   1
##    3             1   1
##    2             1   1
##    5             1   1
##    2             1   1
##   10             0   1
##    2             1   1
##    2             1   1
##    5             1   1
##    6             1   1
##    1             1   1
##    2             1   1
##    5             1   1
##    4             1   1
##    1             1   1
##    3             1   1
##    3             1   1
##    6             1   1
##    2             1   1
##    5             1   1
##    2             1   1
##    1             1   2
##    1             1   2
##    1             1   2
##                 10 117
# Verifying for NAs once again: count the missing cells left in employee2.
# NOTE(review): the count drops from 117 to 0 here, but the imputation call
# that fills the NAs is not visible in this rendered output - confirm which
# method was applied to employee2 before this check.
sum(is.na(employee2)) 
## [1] 0
# Note: see the section named 'other' at the bottom of the code/page for a
# comparison of accuracies with the imputed dataset vs the original dataset.

Part b) and c) - Dummy codes and normalization/standardization of features

# "Dummy codes": replace every factor predictor by its integer level code.
# Column 2 (Attrition, the response) is excluded and keeps its original type;
# non-factor predictors pass through unchanged.
set.seed(1)
employee1[, -2] <- lapply(employee1[, -2], function(column) {
  if (is.factor(column)) as.integer(column) else column
})
# Normalization
# Min-max rescaling of a numeric vector to the [0, 1] interval.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  Ignore NAs when computing the range? Defaults to FALSE, in which
#          case any NA in `x` makes the whole result NA (original behaviour).
#
# Returns a vector of the same length as `x`. A vector whose values are all
# equal maps to 0 instead of NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Constant input: avoid dividing by a zero range.
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Rescale every predictor (all columns except column 2) with normalize() and
# bind the untouched response back in front as the first column.
employee.n <- cbind(
  employee1$Attrition,
  as.data.frame(lapply(employee1[-2], normalize))
)
# Verifying one of the features: after min-max scaling it must span 0 to 1.
summary(employee.n[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2857  0.4286  0.4465  0.5714  1.0000

part d)- Feature engineering-PCA

# Principal Component Analysis on the 31 predictors (columns 2 to 32),
# rescaled to unit variance by prcomp()
pca <- prcomp(employee.n[2:32], scale. = TRUE)
# Standard deviations of the components (square roots of the eigenvalues)
pca$sdev
##  [1] 2.1647948 1.3826349 1.3455357 1.3094111 1.2603586 1.0916380 1.0779099
##  [8] 1.0680234 1.0518422 1.0389891 1.0150137 1.0099269 0.9989128 0.9905508
## [15] 0.9787930 0.9721271 0.9676461 0.9437598 0.9371604 0.9155697 0.9105246
## [22] 0.8519604 0.7386985 0.7169205 0.5714016 0.5611696 0.5266126 0.4712836
## [29] 0.4362052 0.3750620 0.2148756
# Scree plot
fviz_eig(pca)

# The elbow of the scree plot is at 2 dimensions,
# so the dataset is reduced from 10 dimensions to 2.
# Circle of correlations, variables coloured by their contribution
fviz_pca_var(
  pca,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

# Inspect the rotation values (called loadings in some methods) to select
# features.
# Dimension 1, sorted ascending: take the top 5 and bottom 5 entries so that
# contributions in opposite directions are both covered (the circle of
# correlations can be used to verify this).
sort(pca$rotation[, 1])
##        TotalWorkingYears           YearsAtCompany                 JobLevel 
##            -0.4025337444            -0.3871538079            -0.3828129161 
##            MonthlyIncome       YearsInCurrentRole     YearsWithCurrManager 
##            -0.3737297815            -0.3310094178            -0.3285591888 
##  YearsSinceLastPromotion                      Age                Education 
##            -0.2920206353            -0.2804944941            -0.0783450760 
##       NumCompaniesWorked         StockOptionLevel          WorkLifeBalance 
##            -0.0493835921            -0.0342325831            -0.0122944552 
##         DistanceFromHome RelationshipSatisfaction           JobInvolvement 
##            -0.0093213955            -0.0081754856            -0.0064832230 
##               Department        PerformanceRating  EnvironmentSatisfaction 
##            -0.0062809651            -0.0061742201            -0.0052079207 
##              MonthlyRate                DailyRate           BusinessTravel 
##            -0.0036171666            -0.0017954140             0.0007696971 
##           EmployeeNumber                 OverTime    TrainingTimesLastYear 
##             0.0056997083             0.0058668933             0.0066309936 
##        PercentSalaryHike          JobSatisfaction               HourlyRate 
##             0.0088169474             0.0097840834             0.0108770719 
##                   Gender           EducationField                  JobRole 
##             0.0208039036             0.0239458907             0.0652706405 
##            MaritalStatus 
##             0.0670899520
# Dimension 2, sorted ascending: again select the top 5 and bottom 5 entries
sort(pca$rotation[, 2])
##       YearsInCurrentRole     YearsWithCurrManager                  JobRole 
##             -0.308535600             -0.308068262             -0.293929060 
##               Department           YearsAtCompany  YearsSinceLastPromotion 
##             -0.268780231             -0.251755297             -0.235079683 
##        PerformanceRating        PercentSalaryHike            MaritalStatus 
##             -0.197156220             -0.179008833             -0.143085244 
##    TrainingTimesLastYear          WorkLifeBalance          JobSatisfaction 
##             -0.057492879             -0.055894247             -0.040312477 
##         DistanceFromHome           EducationField                   Gender 
##             -0.036724395             -0.036544074             -0.004765982 
##           EmployeeNumber                 OverTime  EnvironmentSatisfaction 
##              0.010060889              0.012210029              0.038777578 
##              MonthlyRate RelationshipSatisfaction           JobInvolvement 
##              0.041024308              0.050836624              0.056707297 
##               HourlyRate                DailyRate           BusinessTravel 
##              0.060063555              0.062760587              0.087842393 
##         StockOptionLevel                Education                 JobLevel 
##              0.098859230              0.132041432              0.172756610 
##        TotalWorkingYears            MonthlyIncome                      Age 
##              0.190960013              0.192154519              0.328434360 
##       NumCompaniesWorked 
##              0.409461497
# now union of features selected from dimensions 1 and 2 can be used to reduce overall number of features in dataset

part e) - new derived features

# The PCA loadings suggested that Education contributes little by itself, yet
# education level is an important criterion in people analytics. To keep that
# signal, derive monthly income per education level.
set.seed(1)
# Min-max scale the derived feature with normalize(), which is already defined
# in the normalization step earlier in this file (no need to redefine it), and
# build the one-column data frame directly under its final name.
IncomePerDegree <- data.frame(
  Incomeperdegree = normalize(employee1$MonthlyIncome / employee1$Education)
)
# Remove the redundant and insignificant predictors identified from the PCA
# results (selected by column position in employee.n), then append the
# derived feature as the last column.
employee.n <- employee.n[
  , -c(3, 4, 5, 6, 7, 8, 13, 16, 19, 21, 22, 23, 24, 25, 27, 28, 31)
]
employee.n <- cbind(employee.n, IncomePerDegree)

Model Construction & Evaluation

Part a) creation of training & validation datasets

# 75% / 25% train-test split on the response (column 1); the larger training
# share was chosen because the dataset is noisy
set.seed(1)
index <- createDataPartition(y = employee.n[[1]], p = 0.75, list = FALSE)
employee_train <- employee.n[index, ]
employee_test <- employee.n[-index, ]

Part b),c),d),e)

Model 1-KNN

# Holdout method: fit k-NN once on the training split with no resampling
# (no folds), so no tuning takes place
trctr <- trainControl(method = "none")
model_knn <- train(
  x = employee_train[, 2:16],
  y = employee_train[, 1],
  method = "knn",
  trControl = trctr
)
# Predicted classes for the held-out test rows
pred_knn <- predict(model_knn, newdata = employee_test[, 2:16])
table(pred_knn)
## pred_knn
##  No Yes 
## 329  21
# Accuracy (holdout method): predictions vs. true test labels
confusionMatrix(data = pred_knn, reference = employee_test[, 1]) # Accuracy=86.57%
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  295  34
##        Yes  13   8
##                                           
##                Accuracy : 0.8657          
##                  95% CI : (0.8255, 0.8996)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.818287        
##                                           
##                   Kappa : 0.1891          
##  Mcnemar's Test P-Value : 0.003531        
##                                           
##             Sensitivity : 0.9578          
##             Specificity : 0.1905          
##          Pos Pred Value : 0.8967          
##          Neg Pred Value : 0.3810          
##              Prevalence : 0.8800          
##          Detection Rate : 0.8429          
##    Detection Prevalence : 0.9400          
##       Balanced Accuracy : 0.5741          
##                                           
##        'Positive' Class : No              
## 
# The response (Attrition) is categorical, so the fit is judged by
# classification accuracy; RMSE and similar regression metrics are not needed.
# k-NN with 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knn_fit <- train(
  `employee1$Attrition` ~ .,
  data = employee_train,
  method = "knn",
  trControl = trctrl,
  tuneLength = 10
)
# Predictions on the test predictors (response column dropped)
pred_knn_cv <- predict(knn_fit, newdata = employee_test[, -1])
table(pred_knn_cv)
## pred_knn_cv
##  No Yes 
## 345   5
# Accuracy (CV method): predictions vs. true test labels
confusionMatrix(data = pred_knn_cv, reference = employee_test[, 1]) # Accuracy = 88.29%
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  306  39
##        Yes   2   3
##                                           
##                Accuracy : 0.8829          
##                  95% CI : (0.8445, 0.9146)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.4755          
##                                           
##                   Kappa : 0.1048          
##  Mcnemar's Test P-Value : 1.885e-08       
##                                           
##             Sensitivity : 0.99351         
##             Specificity : 0.07143         
##          Pos Pred Value : 0.88696         
##          Neg Pred Value : 0.60000         
##              Prevalence : 0.88000         
##          Detection Rate : 0.87429         
##    Detection Prevalence : 0.98571         
##       Balanced Accuracy : 0.53247         
##                                           
##        'Positive' Class : No              
## 
# repeated cv sampling instead of no sampling with k=15, number=10, repeats=3 increases accuracy from 86.57% to 88.29%.

Model 2-svm

# Holdout method: SVM with the "vanilladot" kernel, cross-validation off
set.seed(1)
model_svm <- ksvm(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  kernel = "vanilladot",
  cross = 0
)
##  Setting default kernel parameters
# Predicted classes for the test predictors
pred_svm <- predict(model_svm, newdata = employee_test[, 2:16])
table(pred_svm)
## pred_svm
##  No Yes 
## 350   0
# Accuracy (holdout method): predictions vs. true test labels
confusionMatrix(data = pred_svm, reference = employee_test[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  308  42
##        Yes   0   0
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.8413, 0.9121)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.541           
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 2.509e-10       
##                                           
##             Sensitivity : 1.00            
##             Specificity : 0.00            
##          Pos Pred Value : 0.88            
##          Neg Pred Value :  NaN            
##              Prevalence : 0.88            
##          Detection Rate : 0.88            
##    Detection Prevalence : 1.00            
##       Balanced Accuracy : 0.50            
##                                           
##        'Positive' Class : No              
## 
# I tried changing the inner product in feature space between the two vector
# arguments by using different kernels:
#   vanilladot, rbfdot, polydot, laplacedot, anovadot gave accuracies of 88%,
#   splinedot gave 80.86%,
#   tanhdot and besseldot gave 77.14%.
# Accuracy (CV): same model specification with 10-fold cross-validation
# NOTE(review): per the kernlab docs, `cross` appears to add only a CV error
# estimate to the fitted object, with predictions still coming from the model
# fitted on the whole training split - confirm.
model_svm_cv <- ksvm(
  `employee1$Attrition` ~ .,
  data = employee_train,
  kernel = "vanilladot",
  cross = 10
)
##  Setting default kernel parameters
# Test-set predictions from the cross-validated SVM, scored against the truth
pred_svm_cv <- predict(model_svm_cv, newdata = employee_test[, 2:16])
confusionMatrix(data = pred_svm_cv, reference = employee_test[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  308  42
##        Yes   0   0
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.8413, 0.9121)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.541           
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 2.509e-10       
##                                           
##             Sensitivity : 1.00            
##             Specificity : 0.00            
##          Pos Pred Value : 0.88            
##          Neg Pred Value :  NaN            
##              Prevalence : 0.88            
##          Detection Rate : 0.88            
##    Detection Prevalence : 1.00            
##       Balanced Accuracy : 0.50            
##                                           
##        'Positive' Class : No              
## 
# This gives accuracy same as holdout method; 88%

Model 3- Decision Tree

# rpart classification tree: response is column 1, predictors are columns 2-16
set.seed(1)
rtree_fit <- rpart(
  formula = employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  method = "class"
)
rpart.plot(rtree_fit)

# Class predictions on the test predictors, scored against the true labels
pred_rtree <- predict(rtree_fit, newdata = employee_test[, 2:16], type = "class")
confusionMatrix(data = pred_rtree, reference = employee_test[, 1]) # 88.57%
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  298  30
##        Yes  10  12
##                                           
##                Accuracy : 0.8857          
##                  95% CI : (0.8476, 0.9171)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.410232        
##                                           
##                   Kappa : 0.3188          
##  Mcnemar's Test P-Value : 0.002663        
##                                           
##             Sensitivity : 0.9675          
##             Specificity : 0.2857          
##          Pos Pred Value : 0.9085          
##          Neg Pred Value : 0.5455          
##              Prevalence : 0.8800          
##          Detection Rate : 0.8514          
##    Detection Prevalence : 0.9371          
##       Balanced Accuracy : 0.6266          
##                                           
##        'Positive' Class : No              
## 
# Print the complexity-parameter (CP) table of the fitted rpart tree: one row
# per candidate tree size, with its relative error and cross-validated error
printcp(rtree_fit)
## 
## Classification tree:
## rpart(formula = employee_train[, 1] ~ ., data = employee_train[, 
##     2:16], method = "class")
## 
## Variables actually used in tree construction:
## [1] Age                     EmployeeNumber          EnvironmentSatisfaction
## [4] HourlyRate              MaritalStatus           NumCompaniesWorked     
## [7] TotalWorkingYears       YearsInCurrentRole     
## 
## Root node error: 128/1053 = 0.12156
## 
## n= 1053 
## 
##         CP nsplit rel error xerror     xstd
## 1 0.031250      0   1.00000 1.0000 0.082842
## 2 0.023438      2   0.93750 1.0391 0.084216
## 3 0.015625      3   0.91406 1.0625 0.085022
## 4 0.010417      7   0.83594 1.1562 0.088111
## 5 0.010000     10   0.80469 1.1641 0.088359
# Plot the cross-validation results from the CP table
plotcp(rtree_fit)

# The plot shows that splitting the tree into more nodes increases the
# relative cross-validation error (xerror rises as nsplit grows in the CP table).
# For this reason, this model is not so good.
# Decision tree using C5.0.
# Author's rationale: no bias like rpart, and this method does not require
# pruning.
set.seed(1)
# Boosting is enabled with trials = 10; other values of trials were tried and
# 10 gave the best accuracy.
fit <- C5.0(x = employee_train[, 2:16], y = employee_train[, 1], trials = 10)
summary(fit)
## 
## Call:
## C5.0.default(x = employee_train[, 2:16], y = employee_train[, 1], trials
##  = 10)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Apr 22 16:48:15 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 1053 cases (16 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## JobLevel > 0:
## :...JobRole <= 0.75: No (431/9)
## :   JobRole > 0.75:
## :   :...MaritalStatus <= 0.5: No (162/10)
## :       MaritalStatus > 0.5:
## :       :...EnvironmentSatisfaction > 0: No (58/12)
## :           EnvironmentSatisfaction <= 0:
## :           :...YearsInCurrentRole > 0.1666667: No (7/1)
## :               YearsInCurrentRole <= 0.1666667:
## :               :...Age <= 0.9047619: Yes (8)
## :                   Age > 0.9047619: No (2)
## JobLevel <= 0:
## :...MaritalStatus <= 0.5:
##     :...TotalWorkingYears > 0.02631579: No (203/21)
##     :   TotalWorkingYears <= 0.02631579:
##     :   :...EnvironmentSatisfaction > 0.6666667: No (15/1)
##     :       EnvironmentSatisfaction <= 0.6666667:
##     :       :...YearsAtCompany <= 0: Yes (2)
##     :           YearsAtCompany > 0:
##     :           :...JobRole > 0.5: No (10/2)
##     :               JobRole <= 0.5:
##     :               :...Gender <= 0: Yes (4)
##     :                   Gender > 0:
##     :                   :...EnvironmentSatisfaction <= 0: Yes (2)
##     :                       EnvironmentSatisfaction > 0: No (6/1)
##     MaritalStatus > 0.5:
##     :...NumCompaniesWorked > 0.5555556: Yes (11/2)
##         NumCompaniesWorked <= 0.5555556:
##         :...Incomeperdegree <= 0.008630767: Yes (7)
##             Incomeperdegree > 0.008630767:
##             :...Age > 0.3571429: No (42/4)
##                 Age <= 0.3571429:
##                 :...EmployeeNumber > 0.9245283: No (8)
##                     EmployeeNumber <= 0.9245283:
##                     :...HourlyRate <= 0.2: No (12/1)
##                         HourlyRate > 0.2:
##                         :...YearsInCurrentRole > 0.2222222: No (8/1)
##                             YearsInCurrentRole <= 0.2222222:
##                             :...JobRole <= 0.5: Yes (19/5)
##                                 JobRole > 0.5:
##                                 :...MonthlyIncome <= 0.07704055: Yes (19/4)
##                                     MonthlyIncome > 0.07704055: No (17/4)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## YearsInCurrentRole > 0.3888889: No (160.2/11.6)
## YearsInCurrentRole <= 0.3888889:
## :...MaritalStatus <= 0: No (165/29.4)
##     MaritalStatus > 0:
##     :...JobRole > 0.75:
##         :...TotalWorkingYears <= 0.02631579: Yes (29.4/5.4)
##         :   TotalWorkingYears > 0.02631579:
##         :   :...HourlyRate <= 0.2142857: No (27/0.8)
##         :       HourlyRate > 0.2142857:
##         :       :...HourlyRate > 0.8: No (33.2/5.4)
##         :           HourlyRate <= 0.8:
##         :           :...MaritalStatus <= 0.5: No (74.1/34.9)
##         :               MaritalStatus > 0.5: Yes (82.7/23.9)
##         JobRole <= 0.75:
##         :...Age > 0.7142857: No (44.7)
##             Age <= 0.7142857:
##             :...JobRole <= 0: No (37/3.9)
##                 JobRole > 0:
##                 :...HourlyRate <= 0.1: No (26.2/0.8)
##                     HourlyRate > 0.1:
##                     :...EnvironmentSatisfaction <= 0:
##                         :...Incomeperdegree > 0.1344445: No (12.3)
##                         :   Incomeperdegree <= 0.1344445:
##                         :   :...NumCompaniesWorked <= 0: No (3.8)
##                         :       NumCompaniesWorked > 0: Yes (66.5/20.8)
##                         EnvironmentSatisfaction > 0:
##                         :...Gender <= 0: No (98.8/20.1)
##                             Gender > 0:
##                             :...EnvironmentSatisfaction > 0.6666667: No (72.5/17)
##                                 EnvironmentSatisfaction <= 0.6666667:
##                                 :...NumCompaniesWorked <= 0.4444444: No (95.7/34.8)
##                                     NumCompaniesWorked > 0.4444444: Yes (24/4.6)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## YearsWithCurrManager > 0.4117647: No (148.8/16.4)
## YearsWithCurrManager <= 0.4117647:
## :...JobRole <= 0.125: No (69.6/10.7)
##     JobRole > 0.125:
##     :...Incomeperdegree <= 0.009876952: Yes (32.3/6.3)
##         Incomeperdegree > 0.009876952:
##         :...TotalWorkingYears > 0.4736842: No (66.6/3.2)
##             TotalWorkingYears <= 0.4736842:
##             :...EmployeeNumber > 0.934688: No (37.8/3.8)
##                 EmployeeNumber <= 0.934688:
##                 :...EnvironmentSatisfaction > 0.6666667:
##                     :...MaritalStatus <= 0: No (28.8)
##                     :   MaritalStatus > 0:
##                     :   :...NumCompaniesWorked > 0.6666667: No (13.9)
##                     :       NumCompaniesWorked <= 0.6666667:
##                     :       :...YearsWithCurrManager <= 0: Yes (52.4/20.8)
##                     :           YearsWithCurrManager > 0: No (94.5/28.4)
##                     EnvironmentSatisfaction <= 0.6666667:
##                     :...NumCompaniesWorked <= 0.2222222:
##                         :...NumCompaniesWorked > 0.1111111: No (28.3/0.6)
##                         :   NumCompaniesWorked <= 0.1111111:
##                         :   :...JobLevel > 0:
##                         :       :...TotalWorkingYears <= 0.3947369: No (74.3/15.1)
##                         :       :   TotalWorkingYears > 0.3947369: Yes (5)
##                         :       JobLevel <= 0:
##                         :       :...MonthlyIncome > 0.1524487: No (9.5)
##                         :           MonthlyIncome <= 0.1524487:
##                         :           :...MaritalStatus <= 0.5: No (78.9/32.9)
##                         :               MaritalStatus > 0.5:
##                         :               :...EmployeeNumber <= 0.1185293: No (17.7/3.2)
##                         :                   EmployeeNumber > 0.1185293: Yes (64.4/18.9)
##                         NumCompaniesWorked > 0.2222222:
##                         :...EnvironmentSatisfaction > 0.3333333:
##                             :...TotalWorkingYears <= 0.2368421: Yes (47.9/20.2)
##                             :   TotalWorkingYears > 0.2368421: No (22/0.6)
##                             EnvironmentSatisfaction <= 0.3333333:
##                             :...JobRole > 0.75: Yes (73/16.5)
##                                 JobRole <= 0.75:
##                                 :...Age > 0.6190476: No (11.4)
##                                     Age <= 0.6190476:
##                                     :...NumCompaniesWorked > 0.7777778: No (5.7)
##                                         NumCompaniesWorked <= 0.7777778:
##                                         :...MaritalStatus <= 0.5: Yes (54.2/12)
##                                             MaritalStatus > 0.5: No (15.8/4.4)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## YearsInCurrentRole > 0.5: No (40.4)
## YearsInCurrentRole <= 0.5:
## :...MonthlyIncome > 0.6680358: No (49.2)
##     MonthlyIncome <= 0.6680358:
##     :...TotalWorkingYears <= 0.05263158:
##         :...NumCompaniesWorked <= 0: Yes (9.7/1)
##         :   NumCompaniesWorked > 0:
##         :   :...HourlyRate <= 0.3285714: Yes (49.6/14)
##         :       HourlyRate > 0.3285714:
##         :       :...Incomeperdegree <= 0.008630767: Yes (5.7)
##         :           Incomeperdegree > 0.008630767: No (85.3/29.1)
##         TotalWorkingYears > 0.05263158:
##         :...HourlyRate <= 0.2142857: No (118.5/16.9)
##             HourlyRate > 0.2142857:
##             :...MaritalStatus <= 0: No (102.9/24.5)
##                 MaritalStatus > 0:
##                 :...Gender <= 0:
##                     :...TotalWorkingYears > 0.2368421: No (63.8)
##                     :   TotalWorkingYears <= 0.2368421:
##                     :   :...Age > 0.3095238: No (63.6/17.9)
##                     :       Age <= 0.3095238:
##                     :       :...NumCompaniesWorked <= 0.6666667: Yes (64.2/20.7)
##                     :           NumCompaniesWorked > 0.6666667: No (9.7/2.1)
##                     Gender > 0:
##                     :...YearsWithCurrManager > 0.2941177:
##                         :...JobRole <= 0.125: No (7.6)
##                         :   JobRole > 0.125:
##                         :   :...Age <= 0.2142857: No (5.8)
##                         :       Age > 0.2142857:
##                         :       :...EmployeeNumber <= 0.06337687: No (6.7)
##                         :           EmployeeNumber > 0.06337687: Yes (98.4/30.3)
##                         YearsWithCurrManager <= 0.2941177:
##                         :...YearsInCurrentRole > 0.2222222: No (30.8)
##                             YearsInCurrentRole <= 0.2222222:
##                             :...EmployeeNumber <= 0.1601355: Yes (44.7/7.9)
##                                 EmployeeNumber > 0.1601355:
##                                 :...JobRole <= 0.125: No (10.2)
##                                     JobRole > 0.125:
##                                     :...YearsInCurrentRole <= 0.05555556: No (36/4.2)
##                                         YearsInCurrentRole > 0.05555556:
##                                         :...TotalWorkingYears <= 0.07894737: No (8.7)
##                                             TotalWorkingYears > 0.07894737: [S1]
## 
## SubTree [S1]
## 
## YearsInCurrentRole <= 0.1666667: No (115.9/44.7)
## YearsInCurrentRole > 0.1666667: Yes (25.7/8.8)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## YearsWithCurrManager > 0.4117647: No (128.9/16.7)
## YearsWithCurrManager <= 0.4117647:
## :...TotalWorkingYears <= 0.02631579:
##     :...Age > 0.3333333: No (19.6/3.8)
##     :   Age <= 0.3333333:
##     :   :...TotalWorkingYears <= 0: No (17/5.6)
##     :       TotalWorkingYears > 0: Yes (91.8/27.1)
##     TotalWorkingYears > 0.02631579:
##     :...NumCompaniesWorked > 0.4444444:
##         :...EnvironmentSatisfaction > 0.6666667: No (33.8/2.6)
##         :   EnvironmentSatisfaction <= 0.6666667:
##         :   :...Gender <= 0:
##         :       :...MaritalStatus <= 0.5: No (42.1/8.3)
##         :       :   MaritalStatus > 0.5: Yes (25/8.3)
##         :       Gender > 0:
##         :       :...YearsAtCompany <= 0.08108108: Yes (75.8/16.1)
##         :           YearsAtCompany > 0.08108108: No (38.6/14.1)
##         NumCompaniesWorked <= 0.4444444:
##         :...EmployeeNumber > 0.9351717: No (31.3)
##             EmployeeNumber <= 0.9351717:
##             :...JobLevel > 0.25: No (77.3/8.8)
##                 JobLevel <= 0.25:
##                 :...MaritalStatus <= 0: No (51/10)
##                     MaritalStatus > 0:
##                     :...HourlyRate <= 0.2142857: No (53.2/7.4)
##                         HourlyRate > 0.2142857:
##                         :...YearsInCurrentRole > 0.2222222: No (89.4/23.4)
##                             YearsInCurrentRole <= 0.2222222:
##                             :...Age > 0.7380952: Yes (15.5/2.1)
##                                 Age <= 0.7380952:
##                                 :...TotalWorkingYears > 0.2631579: No (18.5/1.3)
##                                     TotalWorkingYears <= 0.2631579:
##                                     :...TotalWorkingYears > 0.2368421: Yes (18.9/5.5)
##                                         TotalWorkingYears <= 0.2368421:
##                                         :...MaritalStatus <= 0.5:
##                                             :...Age > 0.5: No (11.9)
##                                             :   Age <= 0.5:
##                                             :   :...HourlyRate > 0.7857143: No (19.9)
##                                             :       HourlyRate <= 0.7857143: [S1]
##                                             MaritalStatus > 0.5:
##                                             :...Incomeperdegree > 0.1726026: Yes (5.8)
##                                                 Incomeperdegree <= 0.1726026: [S2]
## 
## SubTree [S1]
## 
## TotalWorkingYears > 0.2105263: No (4.4)
## TotalWorkingYears <= 0.2105263:
## :...Age <= 0.3571429: No (49.3/18.8)
##     Age > 0.3571429: Yes (27.4/3.8)
## 
## SubTree [S2]
## 
## EmployeeNumber > 0.6806967: No (20.1/2.1)
## EmployeeNumber <= 0.6806967:
## :...EmployeeNumber > 0.6303822: Yes (8.7)
##     EmployeeNumber <= 0.6303822:
##     :...HourlyRate <= 0.7: Yes (48.2/16.7)
##         HourlyRate > 0.7: No (29.6/7.2)
## 
## -----  Trial 5:  -----
## 
## Decision tree:
## 
## MonthlyIncome > 0.6680358: No (41.4)
## MonthlyIncome <= 0.6680358:
## :...JobRole <= 0: No (44.2/4)
##     JobRole > 0:
##     :...EnvironmentSatisfaction <= 0:
##         :...Incomeperdegree > 0.1570805: No (16.5)
##         :   Incomeperdegree <= 0.1570805:
##         :   :...MonthlyIncome > 0.4799895: Yes (18/2.1)
##         :       MonthlyIncome <= 0.4799895:
##         :       :...YearsInCurrentRole > 0.1666667: No (53.5/15.5)
##         :           YearsInCurrentRole <= 0.1666667:
##         :           :...TotalWorkingYears <= 0.1842105: Yes (85.5/26.1)
##         :               TotalWorkingYears > 0.1842105: No (51.9/18.4)
##         EnvironmentSatisfaction > 0:
##         :...YearsWithCurrManager > 0.4117647: No (62.6/6.2)
##             YearsWithCurrManager <= 0.4117647:
##             :...TotalWorkingYears > 0.4210526: No (32.2/4.1)
##                 TotalWorkingYears <= 0.4210526:
##                 :...Age > 0.6190476: No (49.3/9)
##                     Age <= 0.6190476:
##                     :...Incomeperdegree <= 0.009876952: Yes (26.6/5.9)
##                         Incomeperdegree > 0.009876952:
##                         :...YearsWithCurrManager > 0.3529412:
##                             :...Age > 0.4285714: No (11.8)
##                             :   Age <= 0.4285714:
##                             :   :...YearsInCurrentRole <= 0.05555556: No (4.7)
##                             :       YearsInCurrentRole > 0.05555556:
##                             :       :...EmployeeNumber <= 0.8703435: Yes (80.2/25.6)
##                             :           EmployeeNumber > 0.8703435: No (5.5)
##                             YearsWithCurrManager <= 0.3529412:
##                             :...YearsInCurrentRole > 0.2222222: No (44.1/6.2)
##                                 YearsInCurrentRole <= 0.2222222:
##                                 :...MaritalStatus > 0.5:
##                                     :...EmployeeNumber <= 0.01306241: Yes (11.2/0.3)
##                                     :   EmployeeNumber > 0.01306241:
##                                     :   :...TotalWorkingYears > 0.2368421: No (13.4)
##                                     :       TotalWorkingYears <= 0.2368421:
##                                     :       :...YearsAtCompany > 0.1351351: Yes (19.8/5.5)
##                                     :           YearsAtCompany <= 0.1351351: [S1]
##                                     MaritalStatus <= 0.5:
##                                     :...YearsInCurrentRole > 0.1111111: No (36.3/3.3)
##                                         YearsInCurrentRole <= 0.1111111: [S2]
## 
## SubTree [S1]
## 
## YearsInCurrentRole <= 0: Yes (59.4/25)
## YearsInCurrentRole > 0: No (96.1/34.6)
## 
## SubTree [S2]
## 
## YearsWithCurrManager > 0.1764706: Yes (14.1/3.9)
## YearsWithCurrManager <= 0.1764706:
## :...MonthlyIncome <= 0.06824645: No (34.9/2.8)
##     MonthlyIncome > 0.06824645:
##     :...YearsWithCurrManager > 0.1176471: No (8.4)
##         YearsWithCurrManager <= 0.1176471:
##         :...YearsInCurrentRole <= 0: No (55.9/16.1)
##             YearsInCurrentRole > 0:
##             :...Age <= 0.2142857: No (9.7)
##                 Age > 0.2142857:
##                 :...Incomeperdegree <= 0.1149708: Yes (57.2/18.6)
##                     Incomeperdegree > 0.1149708: No (8.7)
## 
## -----  Trial 6:  -----
## 
## Decision tree:
## 
## YearsWithCurrManager > 0.5294118: No (26.3)
## YearsWithCurrManager <= 0.5294118:
## :...MonthlyIncome > 0.6680358: No (26.2)
##     MonthlyIncome <= 0.6680358:
##     :...MaritalStatus > 0.5:
##         :...Gender <= 0:
##         :   :...JobRole <= 0.875: No (121.6/32)
##         :   :   JobRole > 0.875: Yes (27.3/6.3)
##         :   Gender > 0:
##         :   :...JobRole <= 0.25:
##         :       :...JobRole <= 0.125: No (5)
##         :       :   JobRole > 0.125: Yes (64/20.1)
##         :       JobRole > 0.25:
##         :       :...JobRole <= 0.625: No (10.9)
##         :           JobRole > 0.625:
##         :           :...YearsInCurrentRole > 0.4444444: Yes (14.3/0.6)
##         :               YearsInCurrentRole <= 0.4444444:
##         :               :...MonthlyIncome > 0.4136914: Yes (18.9/1.7)
##         :                   MonthlyIncome <= 0.4136914:
##         :                   :...Incomeperdegree > 0.1554888: No (12.3)
##         :                       Incomeperdegree <= 0.1554888:
##         :                       :...NumCompaniesWorked <= 0.5555556: No (110.1/38.3)
##         :                           NumCompaniesWorked > 0.5555556: Yes (28.4/9.4)
##         MaritalStatus <= 0.5:
##         :...YearsInCurrentRole > 0.4444444: No (26)
##             YearsInCurrentRole <= 0.4444444:
##             :...EnvironmentSatisfaction > 0.3333333:
##                 :...Age <= 0.1904762: Yes (47.8/22.1)
##                 :   Age > 0.1904762: No (247.7/42.6)
##                 EnvironmentSatisfaction <= 0.3333333:
##                 :...NumCompaniesWorked <= 0: No (20.9)
##                     NumCompaniesWorked > 0:
##                     :...YearsInCurrentRole > 0.2777778: Yes (57.5/19.1)
##                         YearsInCurrentRole <= 0.2777778:
##                         :...YearsInCurrentRole > 0.1666667: No (11.2)
##                             YearsInCurrentRole <= 0.1666667:
##                             :...JobRole > 0.875: Yes (19.5/4.6)
##                                 JobRole <= 0.875:
##                                 :...NumCompaniesWorked <= 0.2222222: No (60.8/12.9)
##                                     NumCompaniesWorked > 0.2222222:
##                                     :...NumCompaniesWorked > 0.7777778: No (7.5)
##                                         NumCompaniesWorked <= 0.7777778:
##                                         :...YearsAtCompany <= 0: No (6.2)
##                                             YearsAtCompany > 0:
##                                             :...Gender <= 0: No (23.5/6.5)
##                                                 Gender > 0: Yes (59.3/17.9)
## 
## -----  Trial 7:  -----
## 
## Decision tree:
## 
## YearsInCurrentRole > 0.3888889: No (102.7/15.5)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears > 0.4736842: No (46.4/6.1)
##     TotalWorkingYears <= 0.4736842:
##     :...MaritalStatus <= 0.5:
##         :...EnvironmentSatisfaction > 0.6666667:
##         :   :...NumCompaniesWorked <= 0: Yes (21.6/7.6)
##         :   :   NumCompaniesWorked > 0: No (95.3/11.3)
##         :   EnvironmentSatisfaction <= 0.6666667:
##         :   :...NumCompaniesWorked <= 0: No (36.9/2.3)
##         :       NumCompaniesWorked > 0:
##         :       :...HourlyRate > 0.4142857:
##         :           :...YearsWithCurrManager > 0.4117647: Yes (12.6/3.7)
##         :           :   YearsWithCurrManager <= 0.4117647:
##         :           :   :...JobLevel > 0.25: No (13.6)
##         :           :       JobLevel <= 0.25:
##         :           :       :...HourlyRate <= 0.5285714: No (32.7/1.3)
##         :           :           HourlyRate > 0.5285714:
##         :           :           :...HourlyRate <= 0.5857143: Yes (32.4/9.9)
##         :           :               HourlyRate > 0.5857143: No (147.6/40.4)
##         :           HourlyRate <= 0.4142857:
##         :           :...YearsAtCompany > 0.3243243: Yes (7.7)
##         :               YearsAtCompany <= 0.3243243:
##         :               :...YearsAtCompany > 0.1891892: No (14.6)
##         :                   YearsAtCompany <= 0.1891892:
##         :                   :...EmployeeNumber <= 0.1132075: No (8.3)
##         :                       EmployeeNumber > 0.1132075:
##         :                       :...MonthlyIncome > 0.1246445: No (33.7/11.9)
##         :                           MonthlyIncome <= 0.1246445:
##         :                           :...MonthlyIncome <= 0.07035282: No (17.8/5.5)
##         :                               MonthlyIncome > 0.07035282: Yes (51.3/4.3)
##         MaritalStatus > 0.5:
##         :...TotalWorkingYears > 0.2894737: No (20.1/3.1)
##             TotalWorkingYears <= 0.2894737:
##             :...JobRole > 0.75:
##                 :...MonthlyIncome <= 0.06966825: Yes (15.7)
##                 :   MonthlyIncome > 0.06966825:
##                 :   :...MonthlyIncome <= 0.08788836: No (10.5)
##                 :       MonthlyIncome > 0.08788836:
##                 :       :...YearsInCurrentRole <= 0.05555556: Yes (18.6/3.2)
##                 :           YearsInCurrentRole > 0.05555556:
##                 :           :...HourlyRate <= 0.2142857: No (11.1)
##                 :               HourlyRate > 0.2142857:
##                 :               :...Incomeperdegree <= 0.06218617: No (23.6/5.2)
##                 :                   Incomeperdegree > 0.06218617:
##                 :                   :...Incomeperdegree <= 0.1468215: Yes (58.4/12)
##                 :                       Incomeperdegree > 0.1468215: No (7.9)
##                 JobRole <= 0.75:
##                 :...JobLevel > 0: No (35/4.5)
##                     JobLevel <= 0:
##                     :...NumCompaniesWorked > 0.5555556: Yes (17.7/3.4)
##                         NumCompaniesWorked <= 0.5555556:
##                         :...YearsInCurrentRole > 0.2222222: No (11.1)
##                             YearsInCurrentRole <= 0.2222222:
##                             :...HourlyRate <= 0.2: No (15.5/1.7)
##                                 HourlyRate > 0.2:
##                                 :...MonthlyIncome > 0.1631385: No (6.6)
##                                     MonthlyIncome <= 0.1631385:
##                                     :...NumCompaniesWorked <= 0: Yes (25/3.1)
##                                         NumCompaniesWorked > 0:
##                                         :...Incomeperdegree > 0.09834931: Yes (12.3/1.7)
##                                             Incomeperdegree <= 0.09834931: [S1]
## 
## SubTree [S1]
## 
## EnvironmentSatisfaction <= 0: Yes (14.9/3.2)
## EnvironmentSatisfaction > 0: No (73.7/25.2)
## 
## -----  Trial 8:  -----
## 
## Decision tree:
## 
## YearsInCurrentRole > 0.3888889: No (70.8)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears <= 0.05263158:
##     :...MaritalStatus <= 0: No (25.1/8)
##     :   MaritalStatus > 0:
##     :   :...NumCompaniesWorked <= 0: Yes (8.7/0.6)
##     :       NumCompaniesWorked > 0:
##     :       :...Age > 0.3571429: No (9.1)
##     :           Age <= 0.3571429:
##     :           :...Incomeperdegree <= 0.008630767: Yes (15.4)
##     :               Incomeperdegree > 0.008630767:
##     :               :...MonthlyIncome <= 0.0249605: No (12.8/1.4)
##     :                   MonthlyIncome > 0.0249605:
##     :                   :...EmployeeNumber > 0.8234156: Yes (28.6/3.8)
##     :                       EmployeeNumber <= 0.8234156:
##     :                       :...JobRole <= 0.5: Yes (36.4/8.9)
##     :                           JobRole > 0.5: No (47/18.2)
##     TotalWorkingYears > 0.05263158:
##     :...HourlyRate <= 0.2142857: No (110.1/13.2)
##         HourlyRate > 0.2142857:
##         :...NumCompaniesWorked > 0.2222222:
##             :...EnvironmentSatisfaction <= 0.3333333:
##             :   :...JobRole > 0.75:
##             :   :   :...YearsInCurrentRole <= 0: No (9.8)
##             :   :   :   YearsInCurrentRole > 0:
##             :   :   :   :...Incomeperdegree <= 0.1714566: Yes (66.7/9)
##             :   :   :       Incomeperdegree > 0.1714566: No (5.5)
##             :   :   JobRole <= 0.75:
##             :   :   :...EmployeeNumber > 0.8064829: No (11)
##             :   :       EmployeeNumber <= 0.8064829:
##             :   :       :...Incomeperdegree <= 0.02843391: Yes (33/5.5)
##             :   :           Incomeperdegree > 0.02843391:
##             :   :           :...Gender <= 0: No (20.8)
##             :   :               Gender > 0: Yes (54.2/24.9)
##             :   EnvironmentSatisfaction > 0.3333333:
##             :   :...YearsAtCompany > 0.1621622: No (20.3)
##             :       YearsAtCompany <= 0.1621622:
##             :       :...JobLevel > 0.25: No (7.6)
##             :           JobLevel <= 0.25:
##             :           :...NumCompaniesWorked > 0.7777778: No (10.6)
##             :               NumCompaniesWorked <= 0.7777778:
##             :               :...MaritalStatus <= 0.5: No (58.2/17.1)
##             :                   MaritalStatus > 0.5: Yes (42.1/14.6)
##             NumCompaniesWorked <= 0.2222222:
##             :...MaritalStatus <= 0: No (42.7)
##                 MaritalStatus > 0:
##                 :...Incomeperdegree <= 0.01425728: Yes (18/3.2)
##                     Incomeperdegree > 0.01425728:
##                     :...NumCompaniesWorked > 0.1111111: No (27.9/3.9)
##                         NumCompaniesWorked <= 0.1111111:
##                         :...JobLevel <= 0:
##                             :...Age > 0.4761905: Yes (24.1/6.3)
##                             :   Age <= 0.4761905:
##                             :   :...Age > 0.3333333: No (26.5)
##                             :       Age <= 0.3333333:
##                             :       :...MonthlyIncome <= 0.07440758: Yes (34.9/12.1)
##                             :           MonthlyIncome > 0.07440758: No (57.3/11.2)
##                             JobLevel > 0:
##                             :...MonthlyIncome <= 0.1891522: No (23.7)
##                                 MonthlyIncome > 0.1891522:
##                                 :...HourlyRate > 0.8285714: No (14)
##                                     HourlyRate <= 0.8285714:
##                                     :...MonthlyIncome <= 0.1902054: Yes (4.4)
##                                         MonthlyIncome > 0.1902054: No (70.6/21.9)
## 
## -----  Trial 9:  -----
## 
## Decision tree:
## 
## YearsInCurrentRole > 0.3888889: No (58.6)
## YearsInCurrentRole <= 0.3888889:
## :...TotalWorkingYears <= 0.05263158:
##     :...Incomeperdegree > 0.119402: No (9.7)
##     :   Incomeperdegree <= 0.119402:
##     :   :...HourlyRate <= 0.7285714: Yes (137.5/49)
##     :       HourlyRate > 0.7285714: No (34.1/10.1)
##     TotalWorkingYears > 0.05263158:
##     :...MaritalStatus <= 0.5:
##         :...HourlyRate <= 0.2142857: No (53.2)
##         :   HourlyRate > 0.2142857:
##         :   :...HourlyRate <= 0.4142857:
##         :       :...EnvironmentSatisfaction > 0.6666667: No (11.3)
##         :       :   EnvironmentSatisfaction <= 0.6666667:
##         :       :   :...JobLevel > 0.5: No (7)
##         :       :       JobLevel <= 0.5:
##         :       :       :...EmployeeNumber <= 0.516207: Yes (61.1/15.8)
##         :       :           EmployeeNumber > 0.516207: No (23.2/3.7)
##         :       HourlyRate > 0.4142857:
##         :       :...EnvironmentSatisfaction > 0.3333333: No (114.9/6.5)
##         :           EnvironmentSatisfaction <= 0.3333333:
##         :           :...HourlyRate <= 0.5285714: No (31.1)
##         :               HourlyRate > 0.5285714:
##         :               :...Gender <= 0: No (42.7/3.1)
##         :                   Gender > 0:
##         :                   :...JobLevel > 0.5: No (8.5)
##         :                       JobLevel <= 0.5:
##         :                       :...EmployeeNumber <= 0.2597968: Yes (37.4/12.8)
##         :                           EmployeeNumber > 0.2597968: No (61.6/13.3)
##         MaritalStatus > 0.5:
##         :...EnvironmentSatisfaction > 0.6666667:
##             :...JobLevel > 0.25: Yes (13.1/4)
##             :   JobLevel <= 0.25:
##             :   :...Incomeperdegree <= 0.1678063: No (97.4/17)
##             :       Incomeperdegree > 0.1678063: Yes (5.8)
##             EnvironmentSatisfaction <= 0.6666667:
##             :...EmployeeNumber <= 0.09143686: Yes (26.1/5.1)
##                 EmployeeNumber > 0.09143686:
##                 :...Gender <= 0: No (66.7/18.6)
##                     Gender > 0:
##                     :...JobRole <= 0.125: No (7.5)
##                         JobRole > 0.125:
##                         :...HourlyRate <= 0.5: No (56.6/14.4)
##                             HourlyRate > 0.5:
##                             :...Age > 0.9285714: No (5)
##                                 Age <= 0.9285714:
##                                 :...EmployeeNumber <= 0.1306241: No (4.9)
##                                     EmployeeNumber > 0.1306241: Yes (62.2/16.2)
## 
## 
## Evaluation on training data (1053 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0     22   78( 7.4%)
##    1     17  136(12.9%)
##    2     23  154(14.6%)
##    3     23  169(16.0%)
##    4     27  127(12.1%)
##    5     29  177(16.8%)
##    6     24  174(16.5%)
##    7     33  114(10.8%)
##    8     33  117(11.1%)
##    9     25  146(13.9%)
## boost             30( 2.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     923     2    (a): class No
##      28   100    (b): class Yes
## 
## 
##  Attribute usage:
## 
##  100.00% JobLevel
##  100.00% JobRole
##  100.00% MonthlyIncome
##  100.00% YearsInCurrentRole
##  100.00% YearsWithCurrManager
##   94.40% MaritalStatus
##   93.16% EnvironmentSatisfaction
##   93.16% TotalWorkingYears
##   90.69% HourlyRate
##   88.22% NumCompaniesWorked
##   82.72% EmployeeNumber
##   82.05% Incomeperdegree
##   81.96% Age
##   69.14% Gender
##   40.74% YearsAtCompany
## 
## 
## Time: 0.1 secs
# Optional model inspection, left disabled:
# print(fit)
# plot(fit)
# Score the held-out predictors (columns 2-16 of employee_test) with the
# fitted tree model `fit`.
pred_c.50tree <- predict(object = fit, newdata = employee_test[, 2:16])
# Compare the predicted labels with the true labels stored in column 1
# (88.57% accuracy in the run shown below).
confusionMatrix(data = pred_c.50tree, reference = employee_test[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  300  32
##        Yes   8  10
##                                           
##                Accuracy : 0.8857          
##                  95% CI : (0.8476, 0.9171)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.4102318       
##                                           
##                   Kappa : 0.2816          
##  Mcnemar's Test P-Value : 0.0002762       
##                                           
##             Sensitivity : 0.9740          
##             Specificity : 0.2381          
##          Pos Pred Value : 0.9036          
##          Neg Pred Value : 0.5556          
##              Prevalence : 0.8800          
##          Detection Rate : 0.8571          
##    Detection Prevalence : 0.9486          
##       Balanced Accuracy : 0.6061          
##                                           
##        'Positive' Class : No              
## 

Model 4-Random Forest

set.seed(1)
# Baseline forest with the package defaults; column 1 is the response and
# columns 2-16 are the predictors.
model_rf <- randomForest(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  importance = TRUE
)
# Tuned forest: mtry = 4 with ntree = 500 was found to give the best fit.
model_rf2 <- randomForest(
  employee_train[, 1] ~ .,
  data = employee_train[, 2:16],
  mtry = 4,
  ntree = 500,
  importance = TRUE
)
# Class predictions on the held-out set and their accuracy (88.29% in the
# run shown below).
pred_rf2 <- predict(model_rf2, newdata = employee_test[, 2:16], type = "class")
confusionMatrix(data = pred_rf2, reference = employee_test[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  302  35
##        Yes   6   7
##                                           
##                Accuracy : 0.8829          
##                  95% CI : (0.8445, 0.9146)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.4755          
##                                           
##                   Kappa : 0.2097          
##  Mcnemar's Test P-Value : 1.226e-05       
##                                           
##             Sensitivity : 0.9805          
##             Specificity : 0.1667          
##          Pos Pred Value : 0.8961          
##          Neg Pred Value : 0.5385          
##              Prevalence : 0.8800          
##          Detection Rate : 0.8629          
##    Detection Prevalence : 0.9629          
##       Balanced Accuracy : 0.5736          
##                                           
##        'Positive' Class : No              
## 
# Cross validation: 10-fold random-forest CV on the training predictors
# (columns 2-16) against the training labels (column 1).
model_rf_cv <- rfcv(
  trainx = employee_train[, 2:16],
  trainy = employee_train[, 1],
  cv.fold = 10
)
# Cross-validated error rate for each number of predictors tried.
model_rf_cv$error.cv
##        15         8         4         1 
## 0.1168091 0.1206078 0.1405508 0.1272555
# accuracy using 15 features = 88.32%

part f) comparison of models

# Tabulating accuracies: one row per model, accuracy given in percent.
Model <- c("Decision Tree-C5.0", "Random Forest", "kNN", "SVM-vanilladot")
Accuracy_percent <- c(88.57, 88.32, 88.29, 88.00)
mytable <- data.frame(Model, Accuracy_percent)
# Draw the table as a graphic: an empty plot with every line and text element
# suppressed serves as the canvas, and the table grob is placed on top of it.
blank_canvas <- qplot(1:10, 1:10, geom = "blank") +
  theme(line = element_blank(), text = element_blank())
blank_canvas + annotation_custom(grob = tableGrob(mytable))

# The decision tree, followed by the random forest, gave the best accuracies.
set.seed(1)
# Plotting the ROC curves for the four models on a single set of axes.
# The predicted class labels are converted to their numeric codes so they can
# serve as the predictor for roc(); the true labels are column 1 of
# employee_test.
# The first call creates the plot; the remaining curves are drawn onto it with
# add = TRUE. (Overlaying with par(new = TRUE) starts a fresh plot each time,
# so axes and labels are redrawn on top of each other and the curves are not
# guaranteed to share one coordinate system.)
# NOTE(review): assumes roc() is pROC::roc, whose plot method accepts `add` -
# confirm against the packages loaded at the top of the file.
plot(roc(employee_test[, 1], as.numeric(pred_knn_cv)), col = "red")
plot(roc(employee_test[, 1], as.numeric(pred_svm_cv)), col = "green", add = TRUE)
plot(roc(employee_test[, 1], as.numeric(pred_c.50tree)), col = "blue", add = TRUE)
plot(roc(employee_test[, 1], as.numeric(pred_rf2)), col = "pink", add = TRUE)
# Line keys (rather than filled boxes) so the legend matches the drawn curves.
legend(
  "bottomright",
  legend = c("knn", "svm", "decision tree", "random forest"),
  col = c("red", "green", "blue", "pink"),
  lwd = 2,
  title = "Model"
)

# As expected, the decision tree, followed by the random forest, gave the best ROC curves.

part g) Interpretation of results/prediction with interval

# Using the kNN model to predict Employee Attrition for a new test case.
#
# NOTE(review): t1 holds raw feature values, while employee.n appears to be
# the already min-max-normalized dataset. Appending the raw row and then
# re-normalizing rescales every existing row relative to the raw values, which
# would distort the neighbour distances. If that is confirmed, t1 should be
# scaled with the min/max of the original un-normalized features instead.
t1 <- c(25, 2063, 2, 1, 72, 2, 5, 4, 9992, 4, 8, 15, 3, 2, 5000) # test case
data_new <- employee.n[, -1] # removing the response variable from dataset
# The test case must supply exactly one value per predictor column.
stopifnot(length(t1) == ncol(data_new))
# Row positions are derived from the data instead of being hard-coded, so the
# block keeps working if the number of rows in employee.n changes.
n_known <- nrow(data_new)
new_row <- n_known + 1
data_new[new_row, ] <- t1 # adding the test case as the last row
# Normalizing the combined data (existing rows plus the new test row)
set.seed(1) # knn() breaks distance ties at random
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x))) }
data_new.n <- as.data.frame(lapply(data_new, normalize))
# saving normalized test case
t1 <- data_new.n[new_row, ]
# train set (all previously known rows), test set (the new row) and labels
data_new.train <- data.frame(data_new.n[seq_len(n_known), ])
data_new.test1 <- data.frame(data_new.n[new_row, ])
data_labels <- employee.n[, 1]
# model: 15-nearest-neighbour vote among the known rows
test_pred_1 <- knn(train = data_new.train, test = data_new.test1, cl = data_labels, k = 15)
# prediction
test_pred_1 # 86.57 accurate in 95% CI
## [1] No
## Levels: No Yes

part h) construction of stacked ensemble model

set.seed(1)
# Combining the test-set predictions of the four base models with the true
# test labels (y); each base prediction becomes one predictor of the stack.
# NOTE(review): this uses pred_svm, while the ROC section uses pred_svm_cv -
# confirm which SVM predictions are intended here.
predCom <- data.frame(pred_knn_cv, pred_rf2, pred_svm, pred_c.50tree,
                      y = employee_test[, 1], stringsAsFactors = FALSE)
# Training the ensemble (meta) model using random forest
modelStack <- train(y ~ ., data = predCom, method = "rf")
# Predict from the stacked base predictions. The meta model's predictors are
# the columns of predCom, not the raw features: passing employee_test[,2:16]
# contains none of them, and the call then only works if the prediction
# vectors happen to be visible in the calling environment.
stack_inputs <- predCom[, setdiff(names(predCom), "y"), drop = FALSE]
predStack <- predict(modelStack, stack_inputs) # predictions
# NOTE(review): the stack is trained on and evaluated against the same
# employee_test labels, so this accuracy is optimistic; a separate hold-out
# set would be needed for an unbiased estimate.
confusionMatrix(predStack, employee_test[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  304  36
##        Yes   4   6
##                                           
##                Accuracy : 0.8857          
##                  95% CI : (0.8476, 0.9171)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.4102          
##                                           
##                   Kappa : 0.1935          
##  Mcnemar's Test P-Value : 9.509e-07       
##                                           
##             Sensitivity : 0.9870          
##             Specificity : 0.1429          
##          Pos Pred Value : 0.8941          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.8800          
##          Detection Rate : 0.8686          
##    Detection Prevalence : 0.9714          
##       Balanced Accuracy : 0.5649          
##                                           
##        'Positive' Class : No              
## 
# The accuracy of the stacked ensemble model is 88.57%, which is the same as the accuracy of the decision tree.

Other

set.seed(1)
# Repeat the kNN pipeline on employee2 (the dataset with imputed missing
# values) to compare accuracies.
# Every factor column other than column 2 is replaced by its numeric codes.
factor_cols <- sapply(employee2[, -2], is.factor)
employee2[, -2][factor_cols] <- data.matrix(employee2[, -2][factor_cols])
# Min-max scaling: maps the smallest value of x to 0 and the largest to 1.
normalize <- function(x) {
  bounds <- range(x)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
# Normalize all columns except column 2, then put the Attrition label in front.
employee.n.m <- as.data.frame(lapply(employee2[, -2], normalize))
employee.n.m <- cbind(employee2$Attrition, employee.n.m)
# Derived feature: monthly income divided by education level, normalized the
# same way and stored under the column name "Incomeperdegree".
IncomePerDegree <- data.frame(employee2$MonthlyIncome / employee2$Education)
IncomePerDegree <- as.data.frame(lapply(IncomePerDegree, normalize))
IncomePerDegree <- setNames(IncomePerDegree, "Incomeperdegree")
# Drop the redundant and insignificant variables (chosen from the PCA results)
# by position, then append the derived feature.
dropped_cols <- c(3, 4, 5, 6, 7, 8, 13, 16, 19, 21, 22, 23, 24, 25, 27, 28, 31)
employee.n.m <- employee.n.m[, -dropped_cols]
employee.n.m <- cbind(employee.n.m, IncomePerDegree)
# 75% / 25% train-test split, partitioned on the label in column 1.
index <- createDataPartition(employee.n.m[, 1], p = 0.75, list = FALSE)
employee_train_m <- employee.n.m[index, ]
employee_test_m <- employee.n.m[-index, ]
# Verifying model 1 (kNN): 10-fold cross-validation repeated 3 times,
# with 10 candidate tuning values.
trctr.m <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knn_fit_m <- train(
  `employee2$Attrition` ~ .,
  data = employee_train_m,
  method = "knn",
  trControl = trctr.m,
  tuneLength = 10
)
# Predictions on the held-out rows (label column removed) and their accuracy.
pred_knn.m <- predict(object = knn_fit_m, employee_test_m[, -1])
confusionMatrix(pred_knn.m, employee_test_m[, 1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  307  41
##        Yes   1   1
##                                           
##                Accuracy : 0.88            
##                  95% CI : (0.8413, 0.9121)
##     No Information Rate : 0.88            
##     P-Value [Acc > NIR] : 0.541           
##                                           
##                   Kappa : 0.0349          
##  Mcnemar's Test P-Value : 1.768e-09       
##                                           
##             Sensitivity : 0.99675         
##             Specificity : 0.02381         
##          Pos Pred Value : 0.88218         
##          Neg Pred Value : 0.50000         
##              Prevalence : 0.88000         
##          Detection Rate : 0.87714         
##    Detection Prevalence : 0.99429         
##       Balanced Accuracy : 0.51028         
##                                           
##        'Positive' Class : No              
## 
# Accuracy remains almost the same with the imputed dataset.
# For comparison, the kNN confusion matrix on the original dataset was:
#Prediction 
            #No Yes
       #No  306  39
       #Yes   2   3