There are three parts to Assignment 4. The data used for the assignment is available at https://www.kaggle.com/ludobenistant/hr-analytics. The data describes the human resources of a company; each row corresponds to an employee. Here is the list of libraries we are going to use.
library(dplyr)
library(tibble)
library(xtable)
library(ggplot2)
library(reshape2)
# Load the HR data set. stringsAsFactors is set explicitly because R >= 4.0 no
# longer converts text columns to factors by default, and the str() output
# shown below expects `sales` and `salary` to be factors.
# NOTE(review): the path is machine-specific; adjust it to where the CSV lives.
mydf <- read.csv(
  "C:/Users/tresz/Desktop/DNSC6211/ass04/HR_comma_sep.csv",
  stringsAsFactors = TRUE
)
Get the number of rows and columns
# Number of rows and columns. Call the dim() generic and let S3 dispatch pick
# the data-frame method instead of invoking dim.data.frame() directly.
dim(mydf)
## [1] 14999 10
Explore the data structure
# Compact overview of the data frame: each column's type and first few values.
utils::str(mydf)
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
Create a correlation heat map
# Correlate every column except the two categorical ones (sales, salary), then
# reshape the correlation matrix into long form (Var1, Var2, value).
melted_cormat <- mydf %>%
  select(-sales, -salary) %>%
  cor() %>%
  melt()
head(melted_cormat)
## Var1 Var2 value
## 1 satisfaction_level satisfaction_level 1.00000000
## 2 last_evaluation satisfaction_level 0.10502121
## 3 number_project satisfaction_level -0.14296959
## 4 average_montly_hours satisfaction_level -0.02004811
## 5 time_spend_company satisfaction_level -0.10086607
## 6 Work_accident satisfaction_level 0.05869724
# Heat map of the pairwise correlations: one tile per variable pair, coloured
# by the correlation value. The x labels are rotated so they do not overlap.
g <- ggplot(melted_cormat, aes(Var1, Var2)) +
  geom_raster(aes(fill = value)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
g
# Develop a 3-way table
# Add a readable status label: rows with left == 1 become "Left", all other
# rows become "Stay".
mydf <- mydf %>%
  mutate(status = ifelse(left == 1, "Left", "Stay"))
# Cross-tabulate status x salary x department; one status-by-salary slice is
# printed per level of `sales`.
table(mydf[["status"]], mydf[["salary"]], mydf[["sales"]])
## , , = accounting
##
##
## high low medium
## Left 5 99 100
## Stay 69 259 235
##
## , , = hr
##
##
## high low medium
## Left 6 92 117
## Stay 39 243 242
##
## , , = IT
##
##
## high low medium
## Left 4 172 97
## Stay 79 437 438
##
## , , = management
##
##
## high low medium
## Left 1 59 31
## Stay 224 121 194
##
## , , = marketing
##
##
## high low medium
## Left 9 126 68
## Stay 71 276 308
##
## , , = product_mng
##
##
## high low medium
## Left 6 105 87
## Stay 62 346 296
##
## , , = RandD
##
##
## high low medium
## Left 4 55 62
## Stay 47 309 310
##
## , , = sales
##
##
## high low medium
## Left 14 697 303
## Stay 255 1402 1469
##
## , , = support
##
##
## high low medium
## Left 8 389 158
## Stay 133 757 784
##
## , , = technical
##
##
## high low medium
## Left 25 378 294
## Stay 176 994 853
# Head count per department (`sales` column), drawn once for employees who
# left and once for those who stayed.
# The bars are coloured by department; the legend is hidden because it would
# only repeat the x axis. guides(fill = "none") replaces the deprecated
# guides(fill = FALSE), and the unused plot-level fill = status mapping
# (always overridden by the layer) is dropped.
ggplot(subset(mydf, status == "Left"), aes(x = sales, fill = sales)) +
  geom_bar() +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(subset(mydf, status == "Stay"), aes(x = sales, fill = sales)) +
  geom_bar() +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Head count per salary level, drawn once for employees who left and once for
# those who stayed.
# The bars are coloured by salary level; the legend is hidden because it would
# only repeat the x axis. guides(fill = "none") replaces the deprecated
# guides(fill = FALSE), and the unused plot-level fill = status mapping
# (always overridden by the layer) is dropped.
ggplot(subset(mydf, status == "Left"), aes(x = salary, fill = salary)) +
  geom_bar() +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(subset(mydf, status == "Stay"), aes(x = salary, fill = salary)) +
  geom_bar() +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Distribution of years spent at the company, faceted into a grid with salary
# levels as rows and departments as columns; drawn once for employees who left
# and once for those who stayed.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE), and the
# unused plot-level fill = status mapping (always overridden by the layer) is
# dropped.
ggplot(
  subset(mydf, status == "Left"),
  aes(x = time_spend_company, fill = sales)
) +
  geom_bar() +
  guides(fill = "none") +
  facet_grid(salary ~ sales) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(
  subset(mydf, status == "Stay"),
  aes(x = time_spend_company, fill = sales)
) +
  geom_bar() +
  guides(fill = "none") +
  facet_grid(salary ~ sales) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Years spent at the company for employees who left: overall, then split by
# department, then split by salary level.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE). Where the
# layer overrode the plot-level fill = status mapping, the mapping is now
# stated once at plot level.

# Overall distribution (single fill colour, since only "Left" rows are shown).
ggplot(
  subset(mydf, status == "Left"),
  aes(x = time_spend_company, fill = status)
) +
  geom_bar() +
  guides(fill = "none")

# One panel per department.
ggplot(
  subset(mydf, status == "Left"),
  aes(x = time_spend_company, fill = sales)
) +
  geom_bar() +
  guides(fill = "none") +
  facet_grid(. ~ sales)

# One panel per salary level.
ggplot(
  subset(mydf, status == "Left"),
  aes(x = time_spend_company, fill = salary)
) +
  geom_bar() +
  guides(fill = "none") +
  facet_grid(. ~ salary)
Summary
From the analysis in R, we can see that, in every department, employees with a high salary leave the company least often among the three salary levels. The sales department has the largest number of employees, both among those who left and among those who stayed. Employees with a high salary tend to stay at the company most, and employees with a low salary leave the company most. Employees tend to leave after having worked at the company for 3 years, across all salary levels and all departments. The more years employees work for the company, the more likely they are to stay, because we can see that employees who have worked there for more than 6 years don’t leave the company.