Our example concerns a big company that wants to understand why some of their best and most experienced employees are leaving prematurely. The company also wishes to predict which valuable employees will leave next.

Reading the data into a dataframe

hresource <- read.csv(paste("human.csv", sep=""))
attach(hresource)
dim(hresource)
## [1] 14999    10
library(psych)
describe(hresource)
##                       vars     n   mean    sd median trimmed   mad   min
## satisfaction_level       1 14999   0.61  0.25   0.64    0.63  0.28  0.09
## last_evaluation          2 14999   0.72  0.17   0.72    0.72  0.22  0.36
## number_project           3 14999   3.80  1.23   4.00    3.74  1.48  2.00
## average_montly_hours     4 14999 201.05 49.94 200.00  200.64 65.23 96.00
## time_spend_company       5 14999   3.50  1.46   3.00    3.28  1.48  2.00
## Work_accident            6 14999   0.14  0.35   0.00    0.06  0.00  0.00
## left                     7 14999   0.24  0.43   0.00    0.17  0.00  0.00
## promotion_last_5years    8 14999   0.02  0.14   0.00    0.00  0.00  0.00
## sales*                   9 14999   6.94  2.75   8.00    7.23  2.97  1.00
## salary*                 10 14999   2.35  0.63   2.00    2.41  1.48  1.00
##                       max  range  skew kurtosis   se
## satisfaction_level      1   0.91 -0.48    -0.67 0.00
## last_evaluation         1   0.64 -0.03    -1.24 0.00
## number_project          7   5.00  0.34    -0.50 0.01
## average_montly_hours  310 214.00  0.05    -1.14 0.41
## time_spend_company     10   8.00  1.85     4.77 0.01
## Work_accident           1   1.00  2.02     2.08 0.00
## left                    1   1.00  1.23    -0.49 0.00
## promotion_last_5years   1   1.00  6.64    42.03 0.00
## sales*                 10   9.00 -0.79    -0.62 0.02
## salary*                 3   2.00 -0.42    -0.67 0.01
View(hresource)

Visualization of catagorical values

one way contigency table

mytable <- with(hresource,table(left))
mytable
## left
##     0     1 
## 11428  3571
prop.table(mytable)*100
## left
##        0        1 
## 76.19175 23.80825
mytable1 <- with(hresource,table(promotion_last_5years))
mytable1
## promotion_last_5years
##     0     1 
## 14680   319
prop.table(mytable1)*100
## promotion_last_5years
##         0         1 
## 97.873192  2.126808
mytable2 <- with(hresource,table(salary))
mytable2
## salary
##   high    low medium 
##   1237   7316   6446
prop.table(mytable2)*100
## salary
##      high       low    medium 
##  8.247216 48.776585 42.976198
mytable3 <- with(hresource,table(Work_accident))
mytable3
## Work_accident
##     0     1 
## 12830  2169
prop.table(mytable3)*100
## Work_accident
##        0        1 
## 85.53904 14.46096

Two way contingency table

mytable4 <- xtabs(~ left+promotion_last_5years, data=hresource)
mytable4
##     promotion_last_5years
## left     0     1
##    0 11128   300
##    1  3552    19
margin.table(mytable4,1)
## left
##     0     1 
## 11428  3571
prop.table(mytable4, 1)*100
##     promotion_last_5years
## left          0          1
##    0 97.3748687  2.6251313
##    1 99.4679362  0.5320638
mytable5 <-xtabs(~left+salary,data=hresource)
mytable5
##     salary
## left high  low medium
##    0 1155 5144   5129
##    1   82 2172   1317
margin.table(mytable5,2) 
## salary
##   high    low medium 
##   1237   7316   6446
prop.table(mytable5, 2)
##     salary
## left       high        low     medium
##    0 0.93371059 0.70311646 0.79568725
##    1 0.06628941 0.29688354 0.20431275
mytable6 <- xtabs(~left+Work_accident,data=hresource)
mytable6
##     Work_accident
## left    0    1
##    0 9428 2000
##    1 3402  169
margin.table(mytable6,1)
## left
##     0     1 
## 11428  3571
prop.table(mytable6, 1)
##     Work_accident
## left          0          1
##    0 0.82499125 0.17500875
##    1 0.95267432 0.04732568
library(gmodels)
CrossTable(hresource$left,hresource$salary)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                | hresource$salary 
## hresource$left |      high |       low |    medium | Row Total | 
## ---------------|-----------|-----------|-----------|-----------|
##              0 |      1155 |      5144 |      5129 |     11428 | 
##                |    47.915 |    33.200 |     9.648 |           | 
##                |     0.101 |     0.450 |     0.449 |     0.762 | 
##                |     0.934 |     0.703 |     0.796 |           | 
##                |     0.077 |     0.343 |     0.342 |           | 
## ---------------|-----------|-----------|-----------|-----------|
##              1 |        82 |      2172 |      1317 |      3571 | 
##                |   153.339 |   106.247 |    30.876 |           | 
##                |     0.023 |     0.608 |     0.369 |     0.238 | 
##                |     0.066 |     0.297 |     0.204 |           | 
##                |     0.005 |     0.145 |     0.088 |           | 
## ---------------|-----------|-----------|-----------|-----------|
##   Column Total |      1237 |      7316 |      6446 |     14999 | 
##                |     0.082 |     0.488 |     0.430 |           | 
## ---------------|-----------|-----------|-----------|-----------|
## 
## 
CrossTable(hresource$left,hresource$promotion_last_5years)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                | hresource$promotion_last_5years 
## hresource$left |         0 |         1 | Row Total | 
## ---------------|-----------|-----------|-----------|
##              0 |     11128 |       300 |     11428 | 
##                |     0.290 |    13.343 |           | 
##                |     0.974 |     0.026 |     0.762 | 
##                |     0.758 |     0.940 |           | 
##                |     0.742 |     0.020 |           | 
## ---------------|-----------|-----------|-----------|
##              1 |      3552 |        19 |      3571 | 
##                |     0.928 |    42.702 |           | 
##                |     0.995 |     0.005 |     0.238 | 
##                |     0.242 |     0.060 |           | 
##                |     0.237 |     0.001 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |     14680 |       319 |     14999 | 
##                |     0.979 |     0.021 |           | 
## ---------------|-----------|-----------|-----------|
## 
## 

Boxplot

par(mfrow=c(1,2))
boxplot(hresource$satisfaction_level,col="yellow",main="Satisfaction level")
boxplot(hresource$last_evaluation,col="blue",main="Last Evaluation")

par(mfrow=c(1,2))
boxplot(hresource$number_project,col="green",main="Number of project")
boxplot(hresource$average_montly_hours,col="red",main="Average monthly hours")

par(mfrow=c(1,2))
boxplot(satisfaction_level ~left  ,data=hresource, main="satisfaction of left", ylab="satisfaction level", xlab="left",col= "lightblue",vertical=TRUE)

boxplot(satisfaction_level ~promotion_last_5years  ,data=hresource, main= "satisfaction with promotion", ylab="satisfaction level", xlab="promotion in last 5 years",col= "peachpuff",vertical=TRUE)

par(mfrow=c(1,2))
boxplot(number_project ~left  ,data=hresource, main="Distribution of number of projects", ylab="number of projects", xlab="left",col= "lightblue",vertical=TRUE)

boxplot(satisfaction_level ~salary  ,data=hresource, main="Distribution of satisfaction with salary", ylab="satisfaction level", xlab=" salary",col= "blue",vertical=TRUE)

Histogram

par(mfrow=c(1,2))
library(lattice)
histogram(~left, data = hresource,
 main = "HR leaving the company", xlab="left", col='lightgreen' ) 

histogram(~satisfaction_level,data=hresource,main="satisfaction level",col="lightblue")

histogram(~last_evaluation,data=hresource,main="frequency of last evalution",col="yellow")

histogram(~salary,data=hresource,main="frequency of salary",col="green")

Scatterplot

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(satisfaction_level~number_project,     data=hresource,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of satisfaction level vs number of project",
            xlab="number of project",
            ylab="satisfaction level")

scatterplot(satisfaction_level~average_montly_hours,data=hresource,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of satisfaction level vs average working hours",
            xlab="average working hours",
            ylab="satisfaction level")

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
ggplot(hresource, aes(satisfaction_level, average_montly_hours)) + geom_point(aes(color = left)) +  scale_x_continuous("satisfaction level ") + scale_y_continuous("average monthly hours")+ labs(title="satisfaction of people leaving company")

ggplot(hresource, aes(satisfaction_level, average_montly_hours)) + geom_point(aes(color = left)) +  scale_x_continuous("satisfaction level ") + scale_y_continuous("average monthly hours")+ labs(title="satisfaction of people leaving company with salary")+facet_wrap( ~ salary)

Corrgram

library(corrgram)
    corrgram(hresource, order=TRUE, lower.panel=panel.shade,
    upper.panel=panel.pie, text.panel=panel.txt,
    main="human_resource analysis ")

Correlation test

cor.test(hresource$satisfaction_level,hresource$average_montly_hours)
## 
##  Pearson's product-moment correlation
## 
## data:  hresource$satisfaction_level and hresource$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.036040356 -0.004045605
## sample estimates:
##         cor 
## -0.02004811
cor.test(hresource$satisfaction_level,hresource$number_project)
## 
##  Pearson's product-moment correlation
## 
## data:  hresource$satisfaction_level and hresource$number_project
## t = -17.69, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1586105 -0.1272570
## sample estimates:
##        cor 
## -0.1429696

T-test

t.test(hresource$satisfaction_level,hresource$average_montly_hours)
## 
##  Welch Two Sample t-test
## 
## data:  hresource$satisfaction_level and hresource$average_montly_hours
## t = -491.51, df = 14999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -201.2368 -199.6382
## sample estimates:
##   mean of x   mean of y 
##   0.6128335 201.0503367
t.test(hresource$satisfaction_level,hresource$number_project)
## 
##  Welch Two Sample t-test
## 
## data:  hresource$satisfaction_level and hresource$number_project
## t = -310.72, df = 16216, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.210345 -3.170095
## sample estimates:
## mean of x mean of y 
## 0.6128335 3.8030535

Visualization

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
first <- hresource %>% select(satisfaction_level:promotion_last_5years)
M <- cor(first)
library(corrplot)
## corrplot 0.84 loaded
corrplot(M, method="circle")

On average people who leave have a low satisfaction level, they work more and didn’t get promoted within the past five years.

cor(first)
##                       satisfaction_level last_evaluation number_project
## satisfaction_level            1.00000000     0.105021214   -0.142969586
## last_evaluation               0.10502121     1.000000000    0.349332589
## number_project               -0.14296959     0.349332589    1.000000000
## average_montly_hours         -0.02004811     0.339741800    0.417210634
## time_spend_company           -0.10086607     0.131590722    0.196785891
## Work_accident                 0.05869724    -0.007104289   -0.004740548
## left                         -0.38837498     0.006567120    0.023787185
## promotion_last_5years         0.02560519    -0.008683768   -0.006063958
##                       average_montly_hours time_spend_company
## satisfaction_level            -0.020048113       -0.100866073
## last_evaluation                0.339741800        0.131590722
## number_project                 0.417210634        0.196785891
## average_montly_hours           1.000000000        0.127754910
## time_spend_company             0.127754910        1.000000000
## Work_accident                 -0.010142888        0.002120418
## left                           0.071287179        0.144822175
## promotion_last_5years         -0.003544414        0.067432925
##                       Work_accident        left promotion_last_5years
## satisfaction_level      0.058697241 -0.38837498           0.025605186
## last_evaluation        -0.007104289  0.00656712          -0.008683768
## number_project         -0.004740548  0.02378719          -0.006063958
## average_montly_hours   -0.010142888  0.07128718          -0.003544414
## time_spend_company      0.002120418  0.14482217           0.067432925
## Work_accident           1.000000000 -0.15462163           0.039245435
## left                   -0.154621634  1.00000000          -0.061788107
## promotion_last_5years   0.039245435 -0.06178811           1.000000000

Who is leaving ?

library(lattice)
leaving <- hresource[ which(hresource$left=='1'), ]
par(mfrow=c(1,3))
histogram(leaving$satisfaction_level,col="green",xlab="Degree of Satisfaction", main="Satisfaction  distribution of left")

histogram(leaving$promotion_last_5years,col="blue",xlab="promotion of last 5 years", main="promotion over 5 years of left")

histogram(leaving$last_evaluation,col="yellow",xlab="last evaluation", main="evaluation of left ")

par( mfrow= c(1,2) )

histogram(leaving$Work_accident,col="lightblue", main = "Work accident",xlab ="workaccident")

histogram(leaving$salary,col="lightblue", main = "Salary",xlab="salary")

In the total of 15 000 employees that compose our database, here are the people that have left:

nrow(leaving)
## [1] 3571

Why good people leave?

goodleaving <- leaving %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
nrow(goodleaving)
## [1] 2014
goodleaving2<- hresource %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
goodleavingselected <- goodleaving2 %>% select(satisfaction_level:promotion_last_5years)
M <- cor(goodleavingselected)
corrplot(M, method="circle")

Here it’s much clearer. On average valuable employees that leave are not satisfied, work on many projects, spend many hours in the company each month and aren’t promoted.

cor(goodleavingselected)
##                       satisfaction_level last_evaluation number_project
## satisfaction_level            1.00000000      0.07462604    -0.40620511
## last_evaluation               0.07462604      1.00000000     0.07066455
## number_project               -0.40620511      0.07066455     1.00000000
## average_montly_hours         -0.19065675      0.19647605     0.29820758
## time_spend_company           -0.11169444     -0.20566461     0.09466946
## Work_accident                 0.04563333     -0.06070303    -0.06768055
## left                         -0.29268571      0.26396556     0.47224280
## promotion_last_5years         0.02629127     -0.05025390    -0.03627774
##                       average_montly_hours time_spend_company
## satisfaction_level             -0.19065675      -0.1116944420
## last_evaluation                 0.19647605      -0.2056646119
## number_project                  0.29820758       0.0946694589
## average_montly_hours            1.00000000       0.0469311258
## time_spend_company              0.04693113       1.0000000000
## Work_accident                  -0.06425132       0.0007404257
## left                            0.44804323       0.2014875881
## promotion_last_5years          -0.01664902       0.0770800344
##                       Work_accident        left promotion_last_5years
## satisfaction_level     0.0456333312 -0.29268571            0.02629127
## last_evaluation       -0.0607030310  0.26396556           -0.05025390
## number_project        -0.0676805487  0.47224280           -0.03627774
## average_montly_hours  -0.0642513222  0.44804323           -0.01664902
## time_spend_company     0.0007404257  0.20148759            0.07708003
## Work_accident          1.0000000000 -0.14813587            0.04962916
## left                  -0.1481358726  1.00000000           -0.07299786
## promotion_last_5years  0.0496291604 -0.07299786            1.00000000

Relation between left people and satisfaction level

relation1 <- lm(hresource$satisfaction_level ~ hresource$left)
plot(hresource$left, hresource$satisfaction_level,
     ylab = "Satisfaction Level",
     xlab = "left",
     main = "relation between left(0 or 1) and satisfaction level of people",
     col = "lightblue")
abline (relation1, col = "black")

summary(relation1)
## 
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$left)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5468 -0.1368 -0.0001  0.1732  0.4799 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.666810   0.002143  311.12   <2e-16 ***
## hresource$left -0.226712   0.004393  -51.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2291 on 14997 degrees of freedom
## Multiple R-squared:  0.1508, Adjusted R-squared:  0.1508 
## F-statistic:  2664 on 1 and 14997 DF,  p-value: < 2.2e-16

Relation between promotion in last5 years and satisfaction level

relation2 <- lm(hresource$satisfaction_level ~ hresource$promotion_last_5years)
plot(hresource$promotion_last_5years, hresource$satisfaction_level,
     ylab = "Satisfaction Level",
     xlab = "promotion",
     main = "relation between promotion(0 or 1) and satisfaction level ",
     col = "lightblue")
abline (relation2, col = "black")

summary(relation2)
## 
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$promotion_last_5years)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5460 -0.1719  0.0281  0.2081  0.3881 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.611895   0.002051 298.273  < 2e-16 ***
## hresource$promotion_last_5years 0.044124   0.014067   3.137  0.00171 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2486 on 14997 degrees of freedom
## Multiple R-squared:  0.0006556,  Adjusted R-squared:  0.000589 
## F-statistic: 9.839 on 1 and 14997 DF,  p-value: 0.001712

Relation between number of project and satisfcation level

relation3 <- lm(hresource$satisfaction_level ~ hresource$number_project)
plot(hresource$number_project, hresource$satisfaction_level,
     ylab = "Satisfaction Level",
     xlab = "number of projects",
     main = "relation between number of project and satisfaction level ",
     col = "lightblue")
abline (relation3, col = "black")

summary(relation3)
## 
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$number_project)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56483 -0.21483  0.03169  0.20401  0.45052 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.722509   0.006517  110.86   <2e-16 ***
## hresource$number_project -0.028839   0.001630  -17.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2461 on 14997 degrees of freedom
## Multiple R-squared:  0.02044,    Adjusted R-squared:  0.02037 
## F-statistic: 312.9 on 1 and 14997 DF,  p-value: < 2.2e-16

Relation between satisfaction level and average monthly hours

relation4 <- lm(hresource$average_montly_hours ~ hresource$satisfaction_level)
plot(hresource$satisfaction_level, hresource$average_montly_hours,
     ylab = "average monthly hours",
     xlab = "Satisfaction Level",
     main = "relation between Satisfaction Level and average monthly hours",
     col = "lightblue")
abline (relation4, col = "black")

summary(relation4)
## 
## Call:
## lm(formula = hresource$average_montly_hours ~ hresource$satisfaction_level)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -106.914  -45.176   -0.619   43.985  109.301 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   203.518      1.085 187.648   <2e-16 ***
## hresource$satisfaction_level   -4.027      1.640  -2.456   0.0141 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.93 on 14997 degrees of freedom
## Multiple R-squared:  0.0004019,  Adjusted R-squared:  0.0003353 
## F-statistic:  6.03 on 1 and 14997 DF,  p-value: 0.01408

Result

Most of the employees leaving the company has higher average monthly hours with lower salary along with No promotion in last 5 years due to low satisfaction level from this analysis we can predict the next employee who may leave the company.