hresource <- read.csv(paste("human.csv", sep=""))
attach(hresource)
dim(hresource)
## [1] 14999 10
library(psych)
describe(hresource)
## vars n mean sd median trimmed mad min
## satisfaction_level 1 14999 0.61 0.25 0.64 0.63 0.28 0.09
## last_evaluation 2 14999 0.72 0.17 0.72 0.72 0.22 0.36
## number_project 3 14999 3.80 1.23 4.00 3.74 1.48 2.00
## average_montly_hours 4 14999 201.05 49.94 200.00 200.64 65.23 96.00
## time_spend_company 5 14999 3.50 1.46 3.00 3.28 1.48 2.00
## Work_accident 6 14999 0.14 0.35 0.00 0.06 0.00 0.00
## left 7 14999 0.24 0.43 0.00 0.17 0.00 0.00
## promotion_last_5years 8 14999 0.02 0.14 0.00 0.00 0.00 0.00
## sales* 9 14999 6.94 2.75 8.00 7.23 2.97 1.00
## salary* 10 14999 2.35 0.63 2.00 2.41 1.48 1.00
## max range skew kurtosis se
## satisfaction_level 1 0.91 -0.48 -0.67 0.00
## last_evaluation 1 0.64 -0.03 -1.24 0.00
## number_project 7 5.00 0.34 -0.50 0.01
## average_montly_hours 310 214.00 0.05 -1.14 0.41
## time_spend_company 10 8.00 1.85 4.77 0.01
## Work_accident 1 1.00 2.02 2.08 0.00
## left 1 1.00 1.23 -0.49 0.00
## promotion_last_5years 1 1.00 6.64 42.03 0.00
## sales* 10 9.00 -0.79 -0.62 0.02
## salary* 3 2.00 -0.42 -0.67 0.01
View(hresource)
mytable <- with(hresource,table(left))
mytable
## left
## 0 1
## 11428 3571
prop.table(mytable)*100
## left
## 0 1
## 76.19175 23.80825
mytable1 <- with(hresource,table(promotion_last_5years))
mytable1
## promotion_last_5years
## 0 1
## 14680 319
prop.table(mytable1)*100
## promotion_last_5years
## 0 1
## 97.873192 2.126808
mytable2 <- with(hresource,table(salary))
mytable2
## salary
## high low medium
## 1237 7316 6446
prop.table(mytable2)*100
## salary
## high low medium
## 8.247216 48.776585 42.976198
mytable3 <- with(hresource,table(Work_accident))
mytable3
## Work_accident
## 0 1
## 12830 2169
prop.table(mytable3)*100
## Work_accident
## 0 1
## 85.53904 14.46096
mytable4 <- xtabs(~ left+promotion_last_5years, data=hresource)
mytable4
## promotion_last_5years
## left 0 1
## 0 11128 300
## 1 3552 19
margin.table(mytable4,1)
## left
## 0 1
## 11428 3571
prop.table(mytable4, 1)*100
## promotion_last_5years
## left 0 1
## 0 97.3748687 2.6251313
## 1 99.4679362 0.5320638
mytable5 <-xtabs(~left+salary,data=hresource)
mytable5
## salary
## left high low medium
## 0 1155 5144 5129
## 1 82 2172 1317
margin.table(mytable5,2)
## salary
## high low medium
## 1237 7316 6446
prop.table(mytable5, 2)
## salary
## left high low medium
## 0 0.93371059 0.70311646 0.79568725
## 1 0.06628941 0.29688354 0.20431275
mytable6 <- xtabs(~left+Work_accident,data=hresource)
mytable6
## Work_accident
## left 0 1
## 0 9428 2000
## 1 3402 169
margin.table(mytable6,1)
## left
## 0 1
## 11428 3571
prop.table(mytable6, 1)
## Work_accident
## left 0 1
## 0 0.82499125 0.17500875
## 1 0.95267432 0.04732568
library(gmodels)
CrossTable(hresource$left,hresource$salary)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hresource$salary
## hresource$left | high | low | medium | Row Total |
## ---------------|-----------|-----------|-----------|-----------|
## 0 | 1155 | 5144 | 5129 | 11428 |
## | 47.915 | 33.200 | 9.648 | |
## | 0.101 | 0.450 | 0.449 | 0.762 |
## | 0.934 | 0.703 | 0.796 | |
## | 0.077 | 0.343 | 0.342 | |
## ---------------|-----------|-----------|-----------|-----------|
## 1 | 82 | 2172 | 1317 | 3571 |
## | 153.339 | 106.247 | 30.876 | |
## | 0.023 | 0.608 | 0.369 | 0.238 |
## | 0.066 | 0.297 | 0.204 | |
## | 0.005 | 0.145 | 0.088 | |
## ---------------|-----------|-----------|-----------|-----------|
## Column Total | 1237 | 7316 | 6446 | 14999 |
## | 0.082 | 0.488 | 0.430 | |
## ---------------|-----------|-----------|-----------|-----------|
##
##
CrossTable(hresource$left,hresource$promotion_last_5years)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 14999
##
##
## | hresource$promotion_last_5years
## hresource$left | 0 | 1 | Row Total |
## ---------------|-----------|-----------|-----------|
## 0 | 11128 | 300 | 11428 |
## | 0.290 | 13.343 | |
## | 0.974 | 0.026 | 0.762 |
## | 0.758 | 0.940 | |
## | 0.742 | 0.020 | |
## ---------------|-----------|-----------|-----------|
## 1 | 3552 | 19 | 3571 |
## | 0.928 | 42.702 | |
## | 0.995 | 0.005 | 0.238 |
## | 0.242 | 0.060 | |
## | 0.237 | 0.001 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 14680 | 319 | 14999 |
## | 0.979 | 0.021 | |
## ---------------|-----------|-----------|-----------|
##
##
par(mfrow=c(1,2))
boxplot(hresource$satisfaction_level,col="yellow",main="Satisfaction level")
boxplot(hresource$last_evaluation,col="blue",main="Last Evaluation")
par(mfrow=c(1,2))
boxplot(hresource$number_project,col="green",main="Number of project")
boxplot(hresource$average_montly_hours,col="red",main="Average monthly hours")
par(mfrow=c(1,2))
boxplot(satisfaction_level ~left ,data=hresource, main="satisfaction of left", ylab="satisfaction level", xlab="left",col= "lightblue",vertical=TRUE)
boxplot(satisfaction_level ~promotion_last_5years ,data=hresource, main= "satisfaction with promotion", ylab="satisfaction level", xlab="promotion in last 5 years",col= "peachpuff",vertical=TRUE)
par(mfrow=c(1,2))
boxplot(number_project ~left ,data=hresource, main="Distribution of number of projects", ylab="number of projects", xlab="left",col= "lightblue",vertical=TRUE)
boxplot(satisfaction_level ~salary ,data=hresource, main="Distribution of satisfaction with salary", ylab="satisfaction level", xlab=" salary",col= "blue",vertical=TRUE)
par(mfrow=c(1,2))
library(lattice)
histogram(~left, data = hresource,
main = "HR leaving the company", xlab="left", col='lightgreen' )
histogram(~satisfaction_level,data=hresource,main="satisfaction level",col="lightblue")
histogram(~last_evaluation,data=hresource,main="frequency of last evalution",col="yellow")
histogram(~salary,data=hresource,main="frequency of salary",col="green")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(satisfaction_level~number_project, data=hresource,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of satisfaction level vs number of project",
xlab="number of project",
ylab="satisfaction level")
scatterplot(satisfaction_level~average_montly_hours,data=hresource,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of satisfaction level vs average working hours",
xlab="average working hours",
ylab="satisfaction level")
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(hresource, aes(satisfaction_level, average_montly_hours)) + geom_point(aes(color = left)) + scale_x_continuous("satisfaction level ") + scale_y_continuous("average monthly hours")+ labs(title="satisfaction of people leaving company")
ggplot(hresource, aes(satisfaction_level, average_montly_hours)) + geom_point(aes(color = left)) + scale_x_continuous("satisfaction level ") + scale_y_continuous("average monthly hours")+ labs(title="satisfaction of people leaving company with salary")+facet_wrap( ~ salary)
library(corrgram)
corrgram(hresource, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="human_resource analysis ")
cor.test(hresource$satisfaction_level,hresource$average_montly_hours)
##
## Pearson's product-moment correlation
##
## data: hresource$satisfaction_level and hresource$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.036040356 -0.004045605
## sample estimates:
## cor
## -0.02004811
cor.test(hresource$satisfaction_level,hresource$number_project)
##
## Pearson's product-moment correlation
##
## data: hresource$satisfaction_level and hresource$number_project
## t = -17.69, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1586105 -0.1272570
## sample estimates:
## cor
## -0.1429696
t.test(hresource$satisfaction_level,hresource$average_montly_hours)
##
## Welch Two Sample t-test
##
## data: hresource$satisfaction_level and hresource$average_montly_hours
## t = -491.51, df = 14999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -201.2368 -199.6382
## sample estimates:
## mean of x mean of y
## 0.6128335 201.0503367
t.test(hresource$satisfaction_level,hresource$number_project)
##
## Welch Two Sample t-test
##
## data: hresource$satisfaction_level and hresource$number_project
## t = -310.72, df = 16216, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.210345 -3.170095
## sample estimates:
## mean of x mean of y
## 0.6128335 3.8030535
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
first <- hresource %>% select(satisfaction_level:promotion_last_5years)
M <- cor(first)
library(corrplot)
## corrplot 0.84 loaded
corrplot(M, method="circle")
On average people who leave have a low satisfaction level, they work more and didn’t get promoted within the past five years.
cor(first)
## satisfaction_level last_evaluation number_project
## satisfaction_level 1.00000000 0.105021214 -0.142969586
## last_evaluation 0.10502121 1.000000000 0.349332589
## number_project -0.14296959 0.349332589 1.000000000
## average_montly_hours -0.02004811 0.339741800 0.417210634
## time_spend_company -0.10086607 0.131590722 0.196785891
## Work_accident 0.05869724 -0.007104289 -0.004740548
## left -0.38837498 0.006567120 0.023787185
## promotion_last_5years 0.02560519 -0.008683768 -0.006063958
## average_montly_hours time_spend_company
## satisfaction_level -0.020048113 -0.100866073
## last_evaluation 0.339741800 0.131590722
## number_project 0.417210634 0.196785891
## average_montly_hours 1.000000000 0.127754910
## time_spend_company 0.127754910 1.000000000
## Work_accident -0.010142888 0.002120418
## left 0.071287179 0.144822175
## promotion_last_5years -0.003544414 0.067432925
## Work_accident left promotion_last_5years
## satisfaction_level 0.058697241 -0.38837498 0.025605186
## last_evaluation -0.007104289 0.00656712 -0.008683768
## number_project -0.004740548 0.02378719 -0.006063958
## average_montly_hours -0.010142888 0.07128718 -0.003544414
## time_spend_company 0.002120418 0.14482217 0.067432925
## Work_accident 1.000000000 -0.15462163 0.039245435
## left -0.154621634 1.00000000 -0.061788107
## promotion_last_5years 0.039245435 -0.06178811 1.000000000
library(lattice)
leaving <- hresource[ which(hresource$left=='1'), ]
par(mfrow=c(1,3))
histogram(leaving$satisfaction_level,col="green",xlab="Degree of Satisfaction", main="Satisfaction distribution of left")
histogram(leaving$promotion_last_5years,col="blue",xlab="promotion of last 5 years", main="promotion over 5 years of left")
histogram(leaving$last_evaluation,col="yellow",xlab="last evaluation", main="evaluation of left ")
par( mfrow= c(1,2) )
histogram(leaving$Work_accident,col="lightblue", main = "Work accident",xlab ="workaccident")
histogram(leaving$salary,col="lightblue", main = "Salary",xlab="salary")
In the total of 15 000 employees that compose our database, here are the people that have left:
nrow(leaving)
## [1] 3571
goodleaving <- leaving %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
nrow(goodleaving)
## [1] 2014
goodleaving2<- hresource %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
goodleavingselected <- goodleaving2 %>% select(satisfaction_level:promotion_last_5years)
M <- cor(goodleavingselected)
corrplot(M, method="circle")
Here it’s much clearer. On average valuable employees that leave are not satisfied, work on many projects, spend many hours in the company each month and aren’t promoted.
cor(goodleavingselected)
## satisfaction_level last_evaluation number_project
## satisfaction_level 1.00000000 0.07462604 -0.40620511
## last_evaluation 0.07462604 1.00000000 0.07066455
## number_project -0.40620511 0.07066455 1.00000000
## average_montly_hours -0.19065675 0.19647605 0.29820758
## time_spend_company -0.11169444 -0.20566461 0.09466946
## Work_accident 0.04563333 -0.06070303 -0.06768055
## left -0.29268571 0.26396556 0.47224280
## promotion_last_5years 0.02629127 -0.05025390 -0.03627774
## average_montly_hours time_spend_company
## satisfaction_level -0.19065675 -0.1116944420
## last_evaluation 0.19647605 -0.2056646119
## number_project 0.29820758 0.0946694589
## average_montly_hours 1.00000000 0.0469311258
## time_spend_company 0.04693113 1.0000000000
## Work_accident -0.06425132 0.0007404257
## left 0.44804323 0.2014875881
## promotion_last_5years -0.01664902 0.0770800344
## Work_accident left promotion_last_5years
## satisfaction_level 0.0456333312 -0.29268571 0.02629127
## last_evaluation -0.0607030310 0.26396556 -0.05025390
## number_project -0.0676805487 0.47224280 -0.03627774
## average_montly_hours -0.0642513222 0.44804323 -0.01664902
## time_spend_company 0.0007404257 0.20148759 0.07708003
## Work_accident 1.0000000000 -0.14813587 0.04962916
## left -0.1481358726 1.00000000 -0.07299786
## promotion_last_5years 0.0496291604 -0.07299786 1.00000000
relation1 <- lm(hresource$satisfaction_level ~ hresource$left)
plot(hresource$left, hresource$satisfaction_level,
ylab = "Satisfaction Level",
xlab = "left",
main = "relation between left(0 or 1) and satisfaction level of people",
col = "lightblue")
abline (relation1, col = "black")
summary(relation1)
##
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$left)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5468 -0.1368 -0.0001 0.1732 0.4799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.666810 0.002143 311.12 <2e-16 ***
## hresource$left -0.226712 0.004393 -51.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2291 on 14997 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1508
## F-statistic: 2664 on 1 and 14997 DF, p-value: < 2.2e-16
relation2 <- lm(hresource$satisfaction_level ~ hresource$promotion_last_5years)
plot(hresource$promotion_last_5years, hresource$satisfaction_level,
ylab = "Satisfaction Level",
xlab = "promotion",
main = "relation between promotion(0 or 1) and satisfaction level ",
col = "lightblue")
abline (relation2, col = "black")
summary(relation2)
##
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$promotion_last_5years)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5460 -0.1719 0.0281 0.2081 0.3881
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.611895 0.002051 298.273 < 2e-16 ***
## hresource$promotion_last_5years 0.044124 0.014067 3.137 0.00171 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2486 on 14997 degrees of freedom
## Multiple R-squared: 0.0006556, Adjusted R-squared: 0.000589
## F-statistic: 9.839 on 1 and 14997 DF, p-value: 0.001712
relation3 <- lm(hresource$satisfaction_level ~ hresource$number_project)
plot(hresource$number_project, hresource$satisfaction_level,
ylab = "Satisfaction Level",
xlab = "number of projects",
main = "relation between number of project and satisfaction level ",
col = "lightblue")
abline (relation3, col = "black")
summary(relation3)
##
## Call:
## lm(formula = hresource$satisfaction_level ~ hresource$number_project)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56483 -0.21483 0.03169 0.20401 0.45052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.722509 0.006517 110.86 <2e-16 ***
## hresource$number_project -0.028839 0.001630 -17.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2461 on 14997 degrees of freedom
## Multiple R-squared: 0.02044, Adjusted R-squared: 0.02037
## F-statistic: 312.9 on 1 and 14997 DF, p-value: < 2.2e-16
relation4 <- lm(hresource$average_montly_hours ~ hresource$satisfaction_level)
plot(hresource$satisfaction_level, hresource$average_montly_hours,
ylab = "average monthly hours",
xlab = "Satisfaction Level",
main = "relation between Satisfaction Level and average monthly hours",
col = "lightblue")
abline (relation4, col = "black")
summary(relation4)
##
## Call:
## lm(formula = hresource$average_montly_hours ~ hresource$satisfaction_level)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.914 -45.176 -0.619 43.985 109.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 203.518 1.085 187.648 <2e-16 ***
## hresource$satisfaction_level -4.027 1.640 -2.456 0.0141 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.93 on 14997 degrees of freedom
## Multiple R-squared: 0.0004019, Adjusted R-squared: 0.0003353
## F-statistic: 6.03 on 1 and 14997 DF, p-value: 0.01408