Our example concerns a big company that wants to understand why some of its best and most experienced employees are leaving prematurely. The company also wishes to predict which valuable employees will leave next.

Reading the data into a dataframe

# Load the HR data set. `stringsAsFactors = TRUE` is spelled out because
# R >= 4.0 no longer converts text columns to factors by default, while the
# recorded str() output further down lists `sales` and `salary` as factors.
hresource <- read.csv("human.csv", stringsAsFactors = TRUE)
# No attach(): every later call reaches the columns through `hresource$...`,
# `data = hresource` or with(), and a copy attached here would go stale as
# soon as the indicator columns are recoded below.
dim(hresource)
## [1] 14999    10
library(psych)
# Per-column descriptive statistics (n, mean, sd, median, skew, ...).
describe(hresource)
##                       vars     n   mean    sd median trimmed   mad   min
## satisfaction_level       1 14999   0.61  0.25   0.64    0.63  0.28  0.09
## last_evaluation          2 14999   0.72  0.17   0.72    0.72  0.22  0.36
## number_project           3 14999   3.80  1.23   4.00    3.74  1.48  2.00
## average_montly_hours     4 14999 201.05 49.94 200.00  200.64 65.23 96.00
## time_spend_company       5 14999   3.50  1.46   3.00    3.28  1.48  2.00
## Work_accident            6 14999   0.14  0.35   0.00    0.06  0.00  0.00
## left                     7 14999   0.24  0.43   0.00    0.17  0.00  0.00
## promotion_last_5years    8 14999   0.02  0.14   0.00    0.00  0.00  0.00
## sales*                   9 14999   6.94  2.75   8.00    7.23  2.97  1.00
## salary*                 10 14999   2.35  0.63   2.00    2.41  1.48  1.00
##                       max  range  skew kurtosis   se
## satisfaction_level      1   0.91 -0.48    -0.67 0.00
## last_evaluation         1   0.64 -0.03    -1.24 0.00
## number_project          7   5.00  0.34    -0.50 0.01
## average_montly_hours  310 214.00  0.05    -1.14 0.41
## time_spend_company     10   8.00  1.85     4.77 0.01
## Work_accident           1   1.00  2.02     2.08 0.00
## left                    1   1.00  1.23    -0.49 0.00
## promotion_last_5years   1   1.00  6.64    42.03 0.00
## sales*                 10   9.00 -0.79    -0.62 0.02
## salary*                 3   2.00 -0.42    -0.67 0.01
# View() opens an interactive data viewer; only call it from an interactive
# session so that running the script with Rscript or knitting it does not
# depend on a viewer being available.
if (interactive()) {
  View(hresource)
}
# Recode the three 0/1 indicator columns as "no"/"yes" factors.
#
# factor(levels =, labels =) maps the codes in a single step. The previous
# two-step `x[x == 1] <- "yes"; x[x == 0] <- "no"` form turned the column into
# character after the first assignment, so the second comparison only worked
# through implicit number-to-string coercion. Both levels are now always
# present, in the order "no", "yes".
binary_cols <- c("Work_accident", "promotion_last_5years", "left")
hresource[binary_cols] <- lapply(hresource[binary_cols], function(flag) {
  # Fail loudly on unexpected codes (or on an accidental re-run over columns
  # that are already factors) instead of silently producing NA.
  stopifnot(all(flag %in% c(0, 1, NA)))
  factor(flag, levels = c(0, 1), labels = c("no", "yes"))
})

str(hresource)
## 'data.frame':    14999 obs. of  10 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ left                 : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ promotion_last_5years: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sales                : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ salary               : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...

One-way contingency tables

# One-way frequency tables. Each variable is tabulated as raw counts and then
# re-expressed as a percentage of all employees (count / total * 100).

# Attrition: did the employee leave?
mytable <- table(left = hresource$left)
mytable
## left
##    no   yes 
## 11428  3571
mytable / sum(mytable) * 100
## left
##       no      yes 
## 76.19175 23.80825

# Promotion within the last five years.
mytable1 <- table(promotion_last_5years = hresource$promotion_last_5years)
mytable1
## promotion_last_5years
##    no   yes 
## 14680   319
mytable1 / sum(mytable1) * 100
## promotion_last_5years
##        no       yes 
## 97.873192  2.126808

# Salary band.
mytable2 <- table(salary = hresource$salary)
mytable2
## salary
##   high    low medium 
##   1237   7316   6446
mytable2 / sum(mytable2) * 100
## salary
##      high       low    medium 
##  8.247216 48.776585 42.976198

# Work accident recorded.
mytable3 <- table(Work_accident = hresource$Work_accident)
mytable3
## Work_accident
##    no   yes 
## 12830  2169
mytable3 / sum(mytable3) * 100
## Work_accident
##       no      yes 
## 85.53904 14.46096

two way contingency table

# Two-way contingency tables of attrition (`left`, rows) against one other
# factor (columns), each followed by a marginal total and a conditional
# distribution.

# Attrition x promotion: row totals, then row percentages (scaled to 0-100).
mytable4 <- xtabs(~ left + promotion_last_5years, data = hresource)
mytable4
##      promotion_last_5years
## left     no   yes
##   no  11128   300
##   yes  3552    19
margin.table(mytable4, margin = 1)
## left
##    no   yes 
## 11428  3571
100 * prop.table(mytable4, margin = 1)
##      promotion_last_5years
## left          no        yes
##   no  97.3748687  2.6251313
##   yes 99.4679362  0.5320638

# Attrition x salary: column totals, then column proportions (0-1), i.e. the
# share of stayers/leavers within each salary band.
mytable5 <- xtabs(~ left + salary, data = hresource)
mytable5
##      salary
## left  high  low medium
##   no  1155 5144   5129
##   yes   82 2172   1317
margin.table(mytable5, margin = 2)
## salary
##   high    low medium 
##   1237   7316   6446
prop.table(mytable5, margin = 2)
##      salary
## left        high        low     medium
##   no  0.93371059 0.70311646 0.79568725
##   yes 0.06628941 0.29688354 0.20431275

# Attrition x work accident: row totals, then row proportions (0-1).
mytable6 <- xtabs(~ left + Work_accident, data = hresource)
mytable6
##      Work_accident
## left    no  yes
##   no  9428 2000
##   yes 3402  169
margin.table(mytable6, margin = 1)
## left
##    no   yes 
## 11428  3571
prop.table(mytable6, margin = 1)
##      Work_accident
## left          no        yes
##   no  0.82499125 0.17500875
##   yes 0.95267432 0.04732568
library(gmodels)
# Full cross-tabulation of attrition (rows) against salary band (columns):
# counts, chi-square contributions and row/column/table proportions per cell.
CrossTable(
  x = hresource$left,
  y = hresource$salary
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                | hresource$salary 
## hresource$left |      high |       low |    medium | Row Total | 
## ---------------|-----------|-----------|-----------|-----------|
##             no |      1155 |      5144 |      5129 |     11428 | 
##                |    47.915 |    33.200 |     9.648 |           | 
##                |     0.101 |     0.450 |     0.449 |     0.762 | 
##                |     0.934 |     0.703 |     0.796 |           | 
##                |     0.077 |     0.343 |     0.342 |           | 
## ---------------|-----------|-----------|-----------|-----------|
##            yes |        82 |      2172 |      1317 |      3571 | 
##                |   153.339 |   106.247 |    30.876 |           | 
##                |     0.023 |     0.608 |     0.369 |     0.238 | 
##                |     0.066 |     0.297 |     0.204 |           | 
##                |     0.005 |     0.145 |     0.088 |           | 
## ---------------|-----------|-----------|-----------|-----------|
##   Column Total |      1237 |      7316 |      6446 |     14999 | 
##                |     0.082 |     0.488 |     0.430 |           | 
## ---------------|-----------|-----------|-----------|-----------|
## 
## 
# Full cross-tabulation of attrition (rows) against promotion in the last
# five years (columns), with the same per-cell statistics.
CrossTable(
  x = hresource$left,
  y = hresource$promotion_last_5years
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  14999 
## 
##  
##                | hresource$promotion_last_5years 
## hresource$left |        no |       yes | Row Total | 
## ---------------|-----------|-----------|-----------|
##             no |     11128 |       300 |     11428 | 
##                |     0.290 |    13.343 |           | 
##                |     0.974 |     0.026 |     0.762 | 
##                |     0.758 |     0.940 |           | 
##                |     0.742 |     0.020 |           | 
## ---------------|-----------|-----------|-----------|
##            yes |      3552 |        19 |      3571 | 
##                |     0.928 |    42.702 |           | 
##                |     0.995 |     0.005 |     0.238 | 
##                |     0.242 |     0.060 |           | 
##                |     0.237 |     0.001 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |     14680 |       319 |     14999 | 
##                |     0.979 |     0.021 |           | 
## ---------------|-----------|-----------|-----------|
## 
## 

boxplot

# Univariate boxplots, drawn two per page.
# The data frame is not attached here: every call below already names its
# column through `hresource$...`, and attaching it again only produced
# "objects are masked" messages.
old_par <- par(mfrow = c(1, 2))
boxplot(hresource$satisfaction_level, col = "yellow", main = "Satisfaction level")
boxplot(hresource$last_evaluation, col = "blue", main = "Last Evaluation")

# Setting mfrow again starts a fresh 1 x 2 page for the second pair.
par(mfrow = c(1, 2))
boxplot(hresource$number_project, col = "green", main = "Number of project")
boxplot(hresource$average_montly_hours, col = "red", main = "Average monthly hours")

# Restore the previous layout so the two-column setting does not leak into
# the plots that follow.
par(old_par)

# Boxplots split by group (formula interface: response ~ grouping factor).
# `vertical` is not an argument of boxplot() -- orientation is controlled by
# `horizontal`, which defaults to FALSE -- so the stray `vertical = TRUE` has
# been dropped; the boxes are still drawn vertically.

# Satisfaction of stayers vs. leavers. The title previously said "working
# hours", although the grouping variable is `left`.
boxplot(satisfaction_level ~ left, data = hresource,
        main = "Distribution of satisfaction by attrition (left)",
        ylab = "satisfaction level", xlab = "left", col = "lightblue")

# Satisfaction by promotion status.
boxplot(satisfaction_level ~ promotion_last_5years, data = hresource,
        main = "Distribution of satisfaction with promotion",
        ylab = "satisfaction level", xlab = "promotion in last 5 years",
        col = "peachpuff")

# Number of projects of stayers vs. leavers.
boxplot(number_project ~ left, data = hresource,
        main = "Distribution of number of projects",
        ylab = "number of projects", xlab = "left", col = "lightblue")

# Satisfaction by salary band.
boxplot(satisfaction_level ~ salary, data = hresource,
        main = "Distribution of satisfaction with salary",
        ylab = "satisfaction level", xlab = " salary", col = "blue")

Histogram

library(lattice)
# Lattice histograms of single variables, one plot per call.

# Attrition indicator.
histogram(
  ~left,
  data = hresource,
  main = "Frequency of human resource leaving the company",
  xlab = "left",
  col = "lightgreen"
)

# Satisfaction level.
histogram(
  ~satisfaction_level,
  data = hresource,
  main = "Frequency of satisfaction level",
  col = "lightblue"
)

# Last evaluation score.
histogram(
  ~last_evaluation,
  data = hresource,
  main = "frequency of last evalution",
  col = "yellow"
)

# Salary band.
histogram(
  ~salary,
  data = hresource,
  main = "frequency of salary",
  col = "green"
)

scatterplot

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# car::scatterplot() of satisfaction against workload measures.
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# car >= 3.0 is believed to take smoother options through `smooth = list(...)`
# instead -- confirm against the installed car version.

# Satisfaction vs. number of projects.
scatterplot(
  satisfaction_level ~ number_project,
  data = hresource,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of satisfaction level vs number of project",
  xlab = "number of project",
  ylab = "satisfaction level"
)

# Satisfaction vs. average monthly working hours.
scatterplot(
  satisfaction_level ~ average_montly_hours,
  data = hresource,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of satisfaction level vs average working hours",
  xlab = "average working hours",
  ylab = "satisfaction level"
)

Analyzing the employees who left

# Subset: only the employees who left the company.
newdata <- hresource[which(hresource$left == "yes"), ]

# histogram() is a lattice (grid-based) function, so par(mfrow = ...) has no
# effect on it: every plot was drawn on a page of its own. The panels are now
# placed explicitly through print()'s `split = c(col, row, ncol, nrow)`
# argument, with `more = TRUE` keeping the page open for the next panel.

# Page 1 (1 x 3): satisfaction, promotion and last evaluation of leavers.
print(
  histogram(newdata$satisfaction_level, col = "green",
            xlab = "Degree of Satisfaction",
            main = "Satisfaction  distribution of left"),
  split = c(1, 1, 3, 1), more = TRUE
)
print(
  histogram(newdata$promotion_last_5years, col = "blue",
            xlab = "promotion of last 5 years",
            main = "promotion over 5 years of left"),
  split = c(2, 1, 3, 1), more = TRUE
)
print(
  histogram(newdata$last_evaluation, col = "yellow",
            xlab = "last evaluation",
            main = "evaluation of left "),
  split = c(3, 1, 3, 1)
)

# Page 2 (1 x 2): work accidents and salary band of leavers.
print(
  histogram(newdata$Work_accident, col = "lightblue",
            main = "Work accident", xlab = "workaccident"),
  split = c(1, 1, 2, 1), more = TRUE
)
print(
  histogram(newdata$salary, col = "lightblue",
            main = "Salary", xlab = "salary"),
  split = c(2, 1, 2, 1)
)

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Satisfaction level against average monthly hours, one point per employee,
# coloured by whether the employee left. The shared layers are built once and
# reused for both figures.
satisfaction_hours <- ggplot(
  hresource,
  aes(satisfaction_level, average_montly_hours)
) +
  geom_point(aes(color = left)) +
  scale_x_continuous("satisfaction level ") +
  scale_y_continuous("average monthly hours")

# All employees in a single panel.
satisfaction_hours +
  labs(title = "satisfaction of people leaving company")

# Same plot, one panel per salary band.
satisfaction_hours +
  labs(title = "satisfaction of people leaving company with salary") +
  facet_wrap(~ salary)

corrgram

library(corrgram)
# Correlogram of the data set: shaded cells in the lower triangle, pie glyphs
# in the upper triangle and text labels in the text panels. `order = TRUE`
# asks corrgram to reorder the variables.
corrgram(
  hresource,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "human_resource analysis "
)

correlation test

# Pearson correlation test: satisfaction level vs. average monthly hours.
cor.test(
  x = hresource$satisfaction_level,
  y = hresource$average_montly_hours
)
## 
##  Pearson's product-moment correlation
## 
## data:  hresource$satisfaction_level and hresource$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.036040356 -0.004045605
## sample estimates:
##         cor 
## -0.02004811
# Pearson correlation test: satisfaction level vs. number of projects.
cor.test(
  x = hresource$satisfaction_level,
  y = hresource$number_project
)
## 
##  Pearson's product-moment correlation
## 
## data:  hresource$satisfaction_level and hresource$number_project
## t = -17.69, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1586105 -0.1272570
## sample estimates:
##        cor 
## -0.1429696

t-test

# Welch two-sample t-test on the means of satisfaction_level and
# average_montly_hours.
# NOTE(review): these are two different variables measured on different
# scales, so a difference in their means is expected and says nothing about
# attrition. If the intent was to compare satisfaction between leavers and
# stayers, t.test(satisfaction_level ~ left, data = hresource) may be what
# was meant -- confirm with the author.
t.test(hresource$satisfaction_level,hresource$average_montly_hours)
## 
##  Welch Two Sample t-test
## 
## data:  hresource$satisfaction_level and hresource$average_montly_hours
## t = -491.51, df = 14999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -201.2368 -199.6382
## sample estimates:
##   mean of x   mean of y 
##   0.6128335 201.0503367
# Welch two-sample t-test on the means of satisfaction_level and
# number_project.
# NOTE(review): these are two different variables measured on different
# scales, so a difference in their means is expected by construction. If the
# intent was to compare project load between leavers and stayers,
# t.test(number_project ~ left, data = hresource) may be what was meant --
# confirm with the author.
t.test(hresource$satisfaction_level,hresource$number_project)
## 
##  Welch Two Sample t-test
## 
## data:  hresource$satisfaction_level and hresource$number_project
## t = -310.72, df = 16216, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.210345 -3.170095
## sample estimates:
## mean of x mean of y 
## 0.6128335 3.8030535