This is a programming assignment.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(xtable)
library(ggplot2)
Read in the data, using your own file path.
Get the number of rows and columns.
## [1] 14999 10
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Sun Oct 01 03:12:18 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrr}
## \hline
## & satisfaction\_level & last\_evaluation & number\_project & average\_montly\_hours & time\_spend\_company & Work\_accident & left & promotion\_last\_5years \\
## \hline
## satisfaction\_level & 1.00 & 0.11 & -0.14 & -0.02 & -0.10 & 0.06 & -0.39 & 0.03 \\
## last\_evaluation & 0.11 & 1.00 & 0.35 & 0.34 & 0.13 & -0.01 & 0.01 & -0.01 \\
## number\_project & -0.14 & 0.35 & 1.00 & 0.42 & 0.20 & -0.00 & 0.02 & -0.01 \\
## average\_montly\_hours & -0.02 & 0.34 & 0.42 & 1.00 & 0.13 & -0.01 & 0.07 & -0.00 \\
## time\_spend\_company & -0.10 & 0.13 & 0.20 & 0.13 & 1.00 & 0.00 & 0.14 & 0.07 \\
## Work\_accident & 0.06 & -0.01 & -0.00 & -0.01 & 0.00 & 1.00 & -0.15 & 0.04 \\
## left & -0.39 & 0.01 & 0.02 & 0.07 & 0.14 & -0.15 & 1.00 & -0.06 \\
## promotion\_last\_5years & 0.03 & -0.01 & -0.01 & -0.00 & 0.07 & 0.04 & -0.06 & 1.00 \\
## \hline
## \end{tabular}
## \end{table}
library(reshape2)
# Pairwise correlations between all columns except `sales` and `salary`,
# reshaped to long format (one row per pair of variables) for plotting.
melted_cormat <- mydf %>%
  subset(select = -c(sales, salary)) %>%
  cor() %>%
  melt()
head(melted_cormat)
## Var1 Var2 value
## 1 satisfaction_level satisfaction_level 1.00000000
## 2 last_evaluation satisfaction_level 0.10502121
## 3 number_project satisfaction_level -0.14296959
## 4 average_montly_hours satisfaction_level -0.02004811
## 5 time_spend_company satisfaction_level -0.10086607
## 6 Work_accident satisfaction_level 0.05869724
# Heatmap of the melted correlation matrix: one tile per variable pair,
# filled by the correlation value. The x-axis labels are rotated 90 degrees.
g <- ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_raster() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
g
# Label each row "Left" when `left` is 1 and "Stay" otherwise, then
# cross-tabulate status by salary level within each department.
mydf <- mutate(mydf, status = ifelse(left == 1, "Left", "Stay"))
table(mydf[["status"]], mydf[["salary"]], mydf[["sales"]])
## , , = accounting
##
##
## high low medium
## Left 5 99 100
## Stay 69 259 235
##
## , , = hr
##
##
## high low medium
## Left 6 92 117
## Stay 39 243 242
##
## , , = IT
##
##
## high low medium
## Left 4 172 97
## Stay 79 437 438
##
## , , = management
##
##
## high low medium
## Left 1 59 31
## Stay 224 121 194
##
## , , = marketing
##
##
## high low medium
## Left 9 126 68
## Stay 71 276 308
##
## , , = product_mng
##
##
## high low medium
## Left 6 105 87
## Stay 62 346 296
##
## , , = RandD
##
##
## high low medium
## Left 4 55 62
## Stay 47 309 310
##
## , , = sales
##
##
## high low medium
## Left 14 697 303
## Stay 255 1402 1469
##
## , , = support
##
##
## high low medium
## Left 8 389 158
## Stay 133 757 784
##
## , , = technical
##
##
## high low medium
## Left 25 378 294
## Stay 176 994 853
# Number of employees who left, per department (the `sales` column).
# Only "Left" rows are plotted, so the fill legend would hold a single
# entry and is hidden.
ggplot(filter(mydf, status == "Left"),
       aes(x = sales, fill = status)) +
  geom_bar() +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
- Complete this for employees who stay in the company by department
# Number of employees who stayed, per department (the `sales` column).
# Only "Stay" rows are plotted, so the fill legend would hold a single
# entry and is hidden.
ggplot(filter(mydf, status == "Stay"),
       aes(x = sales, fill = status)) +
  geom_bar() +
  labs(x = "DEPARTMENT", y = "STAY") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
# Reorder the salary levels to low, medium, high so the bars appear in
# that order on the x-axis.
mydf$salary <- factor(mydf$salary, levels = c("low", "medium", "high"))

# Number of employees who left, per salary level. A single status is
# plotted, so the fill legend is hidden.
ggplot(filter(mydf, status == "Left"),
       aes(x = salary, fill = status)) +
  geom_bar() +
  labs(x = "SALARY", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
# Number of employees who stayed, per salary level. A single status is
# plotted, so the fill legend is hidden.
ggplot(filter(mydf, status == "Stay"),
       aes(x = salary, fill = status)) +
  geom_bar() +
  labs(x = "SALARY", y = "STAY") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
# Treat years at the company as a categorical variable so each distinct
# value gets its own bar.
mydf$time_spend_company <- as.factor(mydf$time_spend_company)

# Number of employees who left, per number of years spent in the company.
# A single status is plotted, so the fill legend is hidden.
ggplot(filter(mydf, status == "Left"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Time Spent in the company", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))
# Number of employees who left, per number of years spent in the company,
# in a grid of panels: one row per salary level, one column per department.
ggplot(filter(mydf, status == "Left"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Time Spent in the company", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3)) +
  facet_grid(salary ~ sales)
# Number of employees who stayed, per number of years spent in the company,
# in a grid of panels: one row per salary level, one column per department.
ggplot(filter(mydf, status == "Stay"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Time Spent in the company", y = "STAY") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3)) +
  facet_grid(salary ~ sales)
# Ensure years at the company is categorical; `as.factor()` returns its
# input unchanged when the column is already a factor.
mydf$time_spend_company <- as.factor(mydf$time_spend_company)

# Number of employees who left, per number of years spent in the company.
# A single status is plotted, so the fill legend is hidden.
ggplot(filter(mydf, status == "Left"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Year stayed in the company", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3))
# Number of employees who left, per number of years spent in the company,
# with one panel (column) per department.
ggplot(filter(mydf, status == "Left"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Year stayed in the company", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3)) +
  facet_grid(. ~ sales)
# Number of employees who left, per number of years spent in the company,
# with one panel (column) per salary level.
ggplot(filter(mydf, status == "Left"),
       aes(x = time_spend_company, fill = status)) +
  geom_bar() +
  labs(x = "Year stayed in the company", y = "LEFT") +
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.3)) +
  facet_grid(. ~ salary)
The first step in the analysis is to define the problem. Since we focus on staff status (left or stayed), the people who left the company are the group we need to concentrate on.
The graph of people who left, broken down by salary level, indicates that people with higher salaries tend to stay in the company; most departures happen among people with low salaries. The sales department has the largest number of people leaving, but its proportion of departures is unremarkable, since it also has the largest number of staff. In relative terms, accounting and hr are the two departments with the highest leaving rates.
As for the number of years spent in the company before leaving, most people who leave seem to do so after three years. Few people leave after only two years, and people who have spent more than six years in the company tend to stay for a stable job. Staff with 3-5 years of experience in the company are the group with a high leaving rate.