Our example concerns a big company that wants to understand why some of its best and most experienced employees are leaving prematurely. The company also wishes to predict which valuable employees will leave next.
# Load the HR data set from the working directory.
# `stringsAsFactors = TRUE` is spelled out because R >= 4.0 no longer converts
# string columns to factors by default, and the summaries recorded below show
# `sales` and `salary` as factors.
hresource <- read.csv("human.csv", stringsAsFactors = TRUE)
# NOTE(review): attach() puts a copy of the columns, as they are right now, on
# the search path; later changes to `hresource` are not reflected in that copy.
attach(hresource)
# Number of rows and columns in the raw data.
dim(hresource)
## [1] 14999 10
library(psych)
# Descriptive statistics for every column (the rows marked "*" in the recorded
# output are the factor columns).
describe(hresource)
## vars n mean sd median trimmed mad min
## satisfaction_level 1 14999 0.61 0.25 0.64 0.63 0.28 0.09
## last_evaluation 2 14999 0.72 0.17 0.72 0.72 0.22 0.36
## number_project 3 14999 3.80 1.23 4.00 3.74 1.48 2.00
## average_montly_hours 4 14999 201.05 49.94 200.00 200.64 65.23 96.00
## time_spend_company 5 14999 3.50 1.46 3.00 3.28 1.48 2.00
## Work_accident 6 14999 0.14 0.35 0.00 0.06 0.00 0.00
## left 7 14999 0.24 0.43 0.00 0.17 0.00 0.00
## promotion_last_5years 8 14999 0.02 0.14 0.00 0.00 0.00 0.00
## sales* 9 14999 6.94 2.75 8.00 7.23 2.97 1.00
## salary* 10 14999 2.35 0.63 2.00 2.41 1.48 1.00
## max range skew kurtosis se
## satisfaction_level 1 0.91 -0.48 -0.67 0.00
## last_evaluation 1 0.64 -0.03 -1.24 0.00
## number_project 7 5.00 0.34 -0.50 0.01
## average_montly_hours 310 214.00 0.05 -1.14 0.41
## time_spend_company 10 8.00 1.85 4.77 0.01
## Work_accident 1 1.00 2.02 2.08 0.00
## left 1 1.00 1.23 -0.49 0.00
## promotion_last_5years 1 1.00 6.64 42.03 0.00
## sales* 10 9.00 -0.79 -0.62 0.02
## salary* 3 2.00 -0.42 -0.67 0.01
# Open the spreadsheet-style viewer only in interactive sessions; when the
# script is run non-interactively there is no one to look at it and the call
# may fail.
if (interactive()) {
  View(hresource)
}

# Recode the 0/1 indicator columns as factors with levels "no" (0) and
# "yes" (1). Mapping through `levels`/`labels` avoids assigning strings into a
# numeric column (which silently converts it to character) and then comparing
# that character column against numbers.
hresource$Work_accident <- factor(
  hresource$Work_accident,
  levels = c(0, 1),
  labels = c("no", "yes")
)
hresource$promotion_last_5years <- factor(
  hresource$promotion_last_5years,
  levels = c(0, 1),
  labels = c("no", "yes")
)
hresource$left <- factor(
  hresource$left,
  levels = c(0, 1),
  labels = c("no", "yes")
)

# Confirm the resulting column types.
str(hresource)
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ left : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ promotion_last_5years: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
# One-way frequency tables: counts first, then percentages of all employees.

# Employees who stayed ("no") versus left ("yes").
mytable <- table(left = hresource$left)
print(mytable)
## left
## no yes
## 11428 3571
100 * prop.table(mytable)
## left
## no yes
## 76.19175 23.80825

# Promotion within the last five years.
mytable1 <- table(promotion_last_5years = hresource$promotion_last_5years)
print(mytable1)
## promotion_last_5years
## no yes
## 14680 319
100 * prop.table(mytable1)
## promotion_last_5years
## no yes
## 97.873192 2.126808

# Salary band.
mytable2 <- table(salary = hresource$salary)
print(mytable2)
## salary
## high low medium
## 1237 7316 6446
100 * prop.table(mytable2)
## salary
## high low medium
## 8.247216 48.776585 42.976198

# Work accident recorded.
mytable3 <- table(Work_accident = hresource$Work_accident)
print(mytable3)
## Work_accident
## no yes
## 12830 2169
100 * prop.table(mytable3)
## Work_accident
## no yes
## 85.53904 14.46096
# Two-way contingency tables of `left` against three categorical predictors.

# left x promotion: counts, row totals, and row percentages.
mytable4 <- xtabs(
  ~ left + promotion_last_5years,
  data = hresource
)
print(mytable4)
## promotion_last_5years
## left no yes
## no 11128 300
## yes 3552 19
margin.table(mytable4, margin = 1)
## left
## no yes
## 11428 3571
100 * prop.table(mytable4, margin = 1)
## promotion_last_5years
## left no yes
## no 97.3748687 2.6251313
## yes 99.4679362 0.5320638

# left x salary: counts, column totals, and column proportions (not scaled
# to percent).
mytable5 <- xtabs(
  ~ left + salary,
  data = hresource
)
print(mytable5)
## salary
## left high low medium
## no 1155 5144 5129
## yes 82 2172 1317
margin.table(mytable5, margin = 2)
## salary
## high low medium
## 1237 7316 6446
prop.table(mytable5, margin = 2)
## salary
## left high low medium
## no 0.93371059 0.70311646 0.79568725
## yes 0.06628941 0.29688354 0.20431275

# left x work accident: counts, row totals, and row proportions (not scaled
# to percent).
mytable6 <- xtabs(
  ~ left + Work_accident,
  data = hresource
)
print(mytable6)
## Work_accident
## left no yes
## no 9428 2000
## yes 3402 169
margin.table(mytable6, margin = 1)
## left
## no yes
## 11428 3571
prop.table(mytable6, margin = 1)
## Work_accident
## left no yes
## no 0.82499125 0.17500875
## yes 0.95267432 0.04732568
library(gmodels)
# Detailed cross-tabulation of attrition (rows) by salary band (columns); the
# recorded output reports counts, chi-square contributions and row/column/table
# proportions per cell.
CrossTable(
  x = hresource$left,
  y = hresource$salary
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hresource$salary
## hresource$left | high | low | medium | Row Total |
## ---------------|-----------|-----------|-----------|-----------|
## no | 1155 | 5144 | 5129 | 11428 |
## | 47.915 | 33.200 | 9.648 | |
## | 0.101 | 0.450 | 0.449 | 0.762 |
## | 0.934 | 0.703 | 0.796 | |
## | 0.077 | 0.343 | 0.342 | |
## ---------------|-----------|-----------|-----------|-----------|
## yes | 82 | 2172 | 1317 | 3571 |
## | 153.339 | 106.247 | 30.876 | |
## | 0.023 | 0.608 | 0.369 | 0.238 |
## | 0.066 | 0.297 | 0.204 | |
## | 0.005 | 0.145 | 0.088 | |
## ---------------|-----------|-----------|-----------|-----------|
## Column Total | 1237 | 7316 | 6446 | 14999 |
## | 0.082 | 0.488 | 0.430 | |
## ---------------|-----------|-----------|-----------|-----------|
##
##
# Same cross-tabulation layout, attrition (rows) by promotion in the last five
# years (columns).
CrossTable(
  x = hresource$left,
  y = hresource$promotion_last_5years
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hresource$promotion_last_5years
## hresource$left | no | yes | Row Total |
## ---------------|-----------|-----------|-----------|
## no | 11128 | 300 | 11428 |
## | 0.290 | 13.343 | |
## | 0.974 | 0.026 | 0.762 |
## | 0.758 | 0.940 | |
## | 0.742 | 0.020 | |
## ---------------|-----------|-----------|-----------|
## yes | 3552 | 19 | 3571 |
## | 0.928 | 42.702 | |
## | 0.995 | 0.005 | 0.238 |
## | 0.242 | 0.060 | |
## | 0.237 | 0.001 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 14680 | 319 | 14999 |
## | 0.979 | 0.021 | |
## ---------------|-----------|-----------|-----------|
##
##
# Refresh the attached copy of `hresource` so that bare column names resolve to
# the current columns. Any copy that is already attached is detached first:
# attaching on top of it would leave the stale copy on the search path and
# produce an "objects are masked" message.
while ("hresource" %in% search()) {
  detach("hresource", character.only = TRUE)
}
attach(hresource)
# Base-graphics box plots, drawn two per page.
# `vertical` is not an argument of boxplot() (orientation is controlled by
# `horizontal`, which already defaults to FALSE), so it is no longer passed.

# Single-variable distributions.
par(mfrow = c(1, 2))
boxplot(hresource$satisfaction_level, col = "yellow", main = "Satisfaction level")
boxplot(hresource$last_evaluation, col = "blue", main = "Last Evaluation")
par(mfrow = c(1, 2))
boxplot(hresource$number_project, col = "green", main = "Number of project")
boxplot(hresource$average_montly_hours, col = "red", main = "Average monthly hours")

# Distributions split by a grouping factor.
# NOTE(review): the first title mentions working hours, but the plot groups
# satisfaction by `left` -- confirm the intended wording.
boxplot(
  satisfaction_level ~ left,
  data = hresource,
  main = "Distribution of satisfaction with working hours",
  ylab = "satisfaction level",
  xlab = "left",
  col = "lightblue"
)
boxplot(
  satisfaction_level ~ promotion_last_5years,
  data = hresource,
  main = "Distribution of satisfaction with promotion",
  ylab = "satisfaction level",
  xlab = "promotion in last 5 years",
  col = "peachpuff"
)
boxplot(
  number_project ~ left,
  data = hresource,
  main = "Distribution of number of projects",
  ylab = "number of projects",
  xlab = "left",
  col = "lightblue"
)
boxplot(
  satisfaction_level ~ salary,
  data = hresource,
  main = "Distribution of satisfaction with salary",
  ylab = "satisfaction level",
  xlab = " salary",
  col = "blue"
)
library(lattice)
# Lattice histograms over the full data set: attrition flag, satisfaction,
# last evaluation score, and salary band.
histogram(
  ~ left,
  data = hresource,
  main = "Frequency of human resource leaving the company",
  xlab = "left",
  col = "lightgreen"
)
histogram(
  ~ satisfaction_level,
  data = hresource,
  main = "Frequency of satisfaction level",
  col = "lightblue"
)
histogram(
  ~ last_evaluation,
  data = hresource,
  main = "frequency of last evalution",
  col = "yellow"
)
histogram(
  ~ salary,
  data = hresource,
  main = "frequency of salary",
  col = "green"
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit

# Scatter plots of satisfaction against workload measures, with the spread
# bands switched off and a dashed smoother.
# NOTE(review): `spread` and `smoother.args` look like the older car
# scatterplot() interface; newer car releases appear to take these settings
# through `smooth = list(...)` -- confirm against the installed car version.
scatterplot(
  satisfaction_level ~ number_project,
  data = hresource,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of satisfaction level vs number of project",
  xlab = "number of project",
  ylab = "satisfaction level"
)
scatterplot(
  satisfaction_level ~ average_montly_hours,
  data = hresource,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of satisfaction level vs average working hours",
  xlab = "average working hours",
  ylab = "satisfaction level"
)
# Employees who left the company.
newdata <- hresource[which(hresource$left == "yes"), ]

# lattice draws with grid graphics and does not honour par(mfrow = ...), so the
# plots are placed on a shared page with print()'s `split = c(col, row, ncol,
# nrow)` argument; `more = TRUE` keeps the page open for the next plot.

# Page 1: three plots side by side.
print(
  histogram(
    newdata$satisfaction_level,
    col = "green",
    xlab = "Degree of Satisfaction",
    main = "Satisfaction distribution of left"
  ),
  split = c(1, 1, 3, 1),
  more = TRUE
)
print(
  histogram(
    newdata$promotion_last_5years,
    col = "blue",
    xlab = "promotion of last 5 years",
    main = "promotion over 5 years of left"
  ),
  split = c(2, 1, 3, 1),
  more = TRUE
)
print(
  histogram(
    newdata$last_evaluation,
    col = "yellow",
    xlab = "last evaluation",
    main = "evaluation of left "
  ),
  split = c(3, 1, 3, 1),
  more = FALSE
)

# Page 2: two plots side by side.
print(
  histogram(
    newdata$Work_accident,
    col = "lightblue",
    main = "Work accident",
    xlab = "workaccident"
  ),
  split = c(1, 1, 2, 1),
  more = TRUE
)
print(
  histogram(
    newdata$salary,
    col = "lightblue",
    main = "Salary",
    xlab = "salary"
  ),
  split = c(2, 1, 2, 1),
  more = FALSE
)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha

# Satisfaction versus average monthly hours, points coloured by whether the
# employee left.
ggplot(hresource, aes(x = satisfaction_level, y = average_montly_hours)) +
  geom_point(aes(color = left)) +
  scale_x_continuous(name = "satisfaction level ") +
  scale_y_continuous(name = "average monthly hours") +
  labs(title = "satisfaction of people leaving company")

# The same scatter plot, with one panel per salary band.
ggplot(hresource, aes(x = satisfaction_level, y = average_montly_hours)) +
  geom_point(aes(color = left)) +
  scale_x_continuous(name = "satisfaction level ") +
  scale_y_continuous(name = "average monthly hours") +
  labs(title = "satisfaction of people leaving company with salary") +
  facet_wrap(~ salary)
library(corrgram)
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal, with variables reordered
# (`order = TRUE`).
corrgram(
  x = hresource,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "human_resource analysis "
)
# Pearson correlation tests between satisfaction and two workload measures.

# Satisfaction vs average monthly hours.
cor.test(
  x = hresource$satisfaction_level,
  y = hresource$average_montly_hours,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: hresource$satisfaction_level and hresource$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.036040356 -0.004045605
## sample estimates:
## cor
## -0.02004811

# Satisfaction vs number of projects.
cor.test(
  x = hresource$satisfaction_level,
  y = hresource$number_project,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: hresource$satisfaction_level and hresource$number_project
## t = -17.69, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1586105 -0.1272570
## sample estimates:
## cor
## -0.1429696
# Welch two-sample t-tests on pairs of columns.
# NOTE(review): each call compares the mean of satisfaction_level with the mean
# of a variable measured on a different scale (monthly hours, project count),
# so the reported difference in means is hard to interpret. If the goal was to
# compare satisfaction between employees who left and those who stayed, a
# grouped test on `left` may have been intended -- confirm with the author.

# Satisfaction vs average monthly hours.
t.test(
  x = hresource$satisfaction_level,
  y = hresource$average_montly_hours
)
##
## Welch Two Sample t-test
##
## data: hresource$satisfaction_level and hresource$average_montly_hours
## t = -491.51, df = 14999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -201.2368 -199.6382
## sample estimates:
## mean of x mean of y
## 0.6128335 201.0503367

# Satisfaction vs number of projects.
t.test(
  x = hresource$satisfaction_level,
  y = hresource$number_project
)
##
## Welch Two Sample t-test
##
## data: hresource$satisfaction_level and hresource$number_project
## t = -310.72, df = 16216, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.210345 -3.170095
## sample estimates:
## mean of x mean of y
## 0.6128335 3.8030535