Library

library(ggplot2)
library(ggthemes)
library(scales)

Import Data

The data is related with company attrition.

attr <- read.csv("attrition.csv")

Data Inspection

head(attr,5)
dim(attr)
## [1] 150  34

From our inspection we can conclude :

  • attrition data contain 150 rows and 34 columns
  • each columns names: “attrition”, “age”, “business_travel”, “daily-rate”, “department”, “distance_from_home”, “education”, “education_field”, “employee_count”, “employee_number”, “environment_satisfaction”, “gender”, “hourly_rate”, “job_involvement”, “job_level”, “job_role”, “job_satisfaction”, “marital_status”, “monthly income”, “monthly_rate”, “num_companies_worked”, “over-time”, “percent_salary_hike”, “performance_rating”, “relationship_satisfaction”, “standard_hours”, “stock_option_level”, “total_working_years”, “training_times_last_year”, “work_life_balance”, “years_at_company”, “years_in_current_role”, “years_since_last_promotion”, “years_with_curr_manager”

Data Structure & NA Checking

str(attr)
## 'data.frame':    150 obs. of  34 variables:
##  $ attrition                 : chr  "no" "no" "no" "no" ...
##  $ age                       : int  35 50 45 30 46 26 33 30 38 27 ...
##  $ business_travel           : chr  "travel_frequently" "travel_frequently" "travel_rarely" "travel_rarely" ...
##  $ daily_rate                : int  853 1115 1339 288 669 1355 1141 201 653 591 ...
##  $ department                : chr  "sales" "research_development" "research_development" "research_development" ...
##  $ distance_from_home        : int  18 1 7 2 9 25 1 5 29 2 ...
##  $ education                 : int  5 3 3 3 2 1 3 3 5 1 ...
##  $ education_field           : chr  "life_sciences" "life_sciences" "life_sciences" "life_sciences" ...
##  $ employee_count            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ employee_number           : int  74 141 86 117 118 177 52 197 79 7 ...
##  $ environment_satisfaction  : int  2 1 2 3 3 3 3 4 4 1 ...
##  $ gender                    : chr  "male" "female" "male" "male" ...
##  $ hourly_rate               : int  71 73 59 99 64 61 42 84 50 40 ...
##  $ job_involvement           : int  3 3 3 2 2 3 4 3 3 3 ...
##  $ job_level                 : int  3 5 3 2 3 1 2 1 2 1 ...
##  $ job_role                  : chr  "sales_executive" "research_director" "research_scientist" "healthcare_representative" ...
##  $ job_satisfaction          : int  1 2 1 4 4 3 1 1 4 2 ...
##  $ marital_status            : chr  "married" "married" "divorced" "married" ...
##  $ monthly_income            : int  9069 18172 9724 4152 9619 2942 5376 3204 2406 3468 ...
##  $ monthly_rate              : int  11031 9755 18787 15830 13596 8916 3193 10415 5456 16632 ...
##  $ num_companies_worked      : int  1 3 2 1 1 1 2 5 1 9 ...
##  $ over_time                 : chr  "no" "yes" "no" "no" ...
##  $ percent_salary_hike       : int  22 19 17 19 16 23 19 14 11 12 ...
##  $ performance_rating        : int  4 3 3 3 3 4 3 3 3 3 ...
##  $ relationship_satisfaction : int  4 1 3 1 4 4 1 4 4 4 ...
##  $ standard_hours            : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ stock_option_level        : int  1 0 1 3 0 1 2 1 0 1 ...
##  $ total_working_years       : int  9 28 25 11 9 8 10 8 10 6 ...
##  $ training_times_last_year  : int  3 1 2 3 3 3 3 3 2 3 ...
##  $ work_life_balance         : int  2 2 3 3 3 3 3 3 3 3 ...
##  $ years_at_company          : int  9 8 1 11 9 8 5 3 10 2 ...
##  $ years_in_current_role     : int  8 3 0 10 8 7 3 2 3 2 ...
##  $ years_since_last_promotion: int  1 0 0 10 4 5 1 2 9 2 ...
##  $ years_with_curr_manager   : int  8 7 0 8 7 7 3 2 9 2 ...

Based on STR, I want to change some columns into factor

attr$attrition <- factor(attr$attrition)
attr$business_travel <- factor(attr$business_travel)
attr$department <- factor(attr$department)
attr$education_field <- factor(attr$education_field)
attr$gender <- factor(attr$gender)
attr$job_role <- factor(attr$job_role)
attr$marital_status <- factor(attr$marital_status)
attr$over_time <- factor(attr$over_time)
str(attr)
## 'data.frame':    150 obs. of  34 variables:
##  $ attrition                 : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ age                       : int  35 50 45 30 46 26 33 30 38 27 ...
##  $ business_travel           : Factor w/ 3 levels "non_travel","travel_frequently",..: 2 2 3 3 3 3 2 3 2 3 ...
##  $ daily_rate                : int  853 1115 1339 288 669 1355 1141 201 653 591 ...
##  $ department                : Factor w/ 3 levels "human_resources",..: 3 2 2 2 3 1 3 2 2 2 ...
##  $ distance_from_home        : int  18 1 7 2 9 25 1 5 29 2 ...
##  $ education                 : int  5 3 3 3 2 1 3 3 5 1 ...
##  $ education_field           : Factor w/ 6 levels "human_resources",..: 2 2 2 2 4 2 2 6 2 4 ...
##  $ employee_count            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ employee_number           : int  74 141 86 117 118 177 52 197 79 7 ...
##  $ environment_satisfaction  : int  2 1 2 3 3 3 3 4 4 1 ...
##  $ gender                    : Factor w/ 2 levels "female","male": 2 1 2 2 2 1 1 1 1 2 ...
##  $ hourly_rate               : int  71 73 59 99 64 61 42 84 50 40 ...
##  $ job_involvement           : int  3 3 3 2 2 3 4 3 3 3 ...
##  $ job_level                 : int  3 5 3 2 3 1 2 1 2 1 ...
##  $ job_role                  : Factor w/ 9 levels "healthcare_representative",..: 8 6 7 1 8 2 8 7 3 3 ...
##  $ job_satisfaction          : int  1 2 1 4 4 3 1 1 4 2 ...
##  $ marital_status            : Factor w/ 3 levels "divorced","married",..: 2 2 1 2 3 2 2 1 3 2 ...
##  $ monthly_income            : int  9069 18172 9724 4152 9619 2942 5376 3204 2406 3468 ...
##  $ monthly_rate              : int  11031 9755 18787 15830 13596 8916 3193 10415 5456 16632 ...
##  $ num_companies_worked      : int  1 3 2 1 1 1 2 5 1 9 ...
##  $ over_time                 : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ percent_salary_hike       : int  22 19 17 19 16 23 19 14 11 12 ...
##  $ performance_rating        : int  4 3 3 3 3 4 3 3 3 3 ...
##  $ relationship_satisfaction : int  4 1 3 1 4 4 1 4 4 4 ...
##  $ standard_hours            : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ stock_option_level        : int  1 0 1 3 0 1 2 1 0 1 ...
##  $ total_working_years       : int  9 28 25 11 9 8 10 8 10 6 ...
##  $ training_times_last_year  : int  3 1 2 3 3 3 3 3 2 3 ...
##  $ work_life_balance         : int  2 2 3 3 3 3 3 3 3 3 ...
##  $ years_at_company          : int  9 8 1 11 9 8 5 3 10 2 ...
##  $ years_in_current_role     : int  8 3 0 10 8 7 3 2 3 2 ...
##  $ years_since_last_promotion: int  1 0 0 10 4 5 1 2 9 2 ...
##  $ years_with_curr_manager   : int  8 7 0 8 7 7 3 2 9 2 ...

After that, I want to check if there is any NA value in the data.

colSums(is.na(attr))
##                  attrition                        age 
##                          0                          0 
##            business_travel                 daily_rate 
##                          0                          0 
##                 department         distance_from_home 
##                          0                          0 
##                  education            education_field 
##                          0                          0 
##             employee_count            employee_number 
##                          0                          0 
##   environment_satisfaction                     gender 
##                          0                          0 
##                hourly_rate            job_involvement 
##                          0                          0 
##                  job_level                   job_role 
##                          0                          0 
##           job_satisfaction             marital_status 
##                          0                          0 
##             monthly_income               monthly_rate 
##                          0                          0 
##       num_companies_worked                  over_time 
##                          0                          0 
##        percent_salary_hike         performance_rating 
##                          0                          0 
##  relationship_satisfaction             standard_hours 
##                          0                          0 
##         stock_option_level        total_working_years 
##                          0                          0 
##   training_times_last_year          work_life_balance 
##                          0                          0 
##           years_at_company      years_in_current_role 
##                          0                          0 
## years_since_last_promotion    years_with_curr_manager 
##                          0                          0
anyNA(attr)
## [1] FALSE

Based on the result, the data don’t have any NA values, so we can continue to explore the data

Data Wrangling, Exploration & Visualisation

Employee Attrition

I want to see how many employees resign and how many employees currently in the company right now

attr_hist <- data.frame(sort(table("attrition" = attr$attrition), decreasing= T))
attr_hist

Visualisation with ggplot2

ggplot(data = attr_hist, mapping = aes(x = attrition, y = Freq)) +
  geom_col(mapping = aes(fill = attrition), alpha = 0.9) +
  geom_label(mapping = aes(label = Freq), 
            data = attr_hist, 
            size = 4) + 
  labs(title = "Employees Attrition",
       x = "",
       y = "Number of Employees") +
  theme_pander()+
  theme(legend.position = "none",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank())

Based on the data, there are 26 employees that resign from the company and 124 employees that still working for the company

Attrition Department

After know there are 26 employees that resign, I want to know from which department they are from.

yes_attr <- attr[attr$attrition == "yes",]
yes_dept_hist <- data.frame(sort(table("department" = yes_attr$department), decreasing= T))
yes_dept_hist

Visualisation with ggplot2

ggplot(data = yes_dept_hist, mapping = aes(x = department, y = Freq)) +
  geom_col(mapping = aes(fill = department), alpha = 0.9) +
  geom_label(mapping = aes(label = Freq), 
            data = yes_dept_hist, 
            nudge_y = 0, 
            size = 4) + 
  labs(title = "Number of Employees Resign",
       subtitle = "Based on Department",
       x = "",
       y = "Number of Employees") +
  theme_pander() +
  theme(legend.position = "none",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank())

Based on the data, there are 16 employees from RnD department, 9 employees from Sales department, and 1 employee from HR department that resign from the company.

Employee Age Distribution

I want to see age distribution from employees that still working in the company

First, I have to make a function

ag <- function(x){
  if(x < 31){
    x <- "<31 Year Old"
  }else if(x >= 31 & x <= 40){
    x <- "31-40 Year Old"
  }else if(x >= 41 & x <= 50){
    x <- "41-50 Year Old"
  }else{
    x <- ">51 Year Old"}
}   

Then I apply it to a column

attr$age_category <- as.factor(sapply(attr$age , ag))
head(attr)

To make visualisation, I have to subset the attrition column so employees that resign don’t include in the data

no_attr <- attr[attr$attrition == "no",]
age_hist <- data.frame(sort(table("age_category" = no_attr$age_category), decreasing= T))
age_hist

Visualisation with ggplot2

ggplot(data = age_hist, mapping = aes(x = age_category, y = Freq)) +
  geom_col(mapping = aes(fill = age_category), alpha = 0.9) +
  geom_label(mapping = aes(label = Freq), 
            data = age_hist, 
            nudge_y = 1, 
            size = 4) + 
  labs(title = "Number of Employees",
       subtitle = "Based on Age Distribution",
       x = "",
       y = "Number of Employees") +
  theme_pander()+
  theme(legend.position = "none",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_blank())

Based on the data, 53 employees are around 31-40 year old, 32 employees are <31 year old, 23 employees are 41-50 year old, and 16 employees are >51 year old.

Performance Rating and Salary

I already have data about employees that still work in the company.

With the data, I want to see if there is correlation between performance rating and salary.

ggplot(data = no_attr, mapping = aes(x = monthly_income ,y= performance_rating)) +
  geom_jitter(aes(x = monthly_income, y = performance_rating, color= department)) +
  geom_smooth() + 
  scale_x_continuous(labels = comma) +
  labs(title = "Performance Rating vs Salary",
       subtitle = "Based on Department",
       x = "Monthly Income",
       y = "Performance Rating") +
  theme_pander() +
  theme(legend.position = "top",
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Based on the data, there are no correlation between performance rating and monthly income.

Years Work and Salary

Also after that, I want to see if there is correlation between monthly income and years work at the company.

ggplot(data = no_attr, mapping = aes(x = years_at_company ,y= monthly_income)) +
  geom_jitter(aes(x = years_at_company, y = monthly_income, color= department)) +
  geom_smooth() +
  scale_y_continuous(labels = comma) +
  labs(title = "Salary Distribution",
       subtitle = "Based on Years at Company",
       x = "Years at Company",
       y = "Monthly Income") +
  theme_pander() +
  theme(legend.position = "top",
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Based on the data, there is correlation between years at company and monthly income. The longer we work at the company then more likely the employees have higher monthly income.

Work Life Balance

Lastly, I want to know which department have better work life balance.

ggplot(data = no_attr, mapping = aes(x = department, y = work_life_balance)) +
  geom_boxplot(mapping = aes(x = department, y = work_life_balance, fill = department)) +
  labs(title = "Work Life Balance",
       subtitle = "Based on Departement",
       x = "",
       y = "Work Life Balance") +
  theme_pander() +
  theme(legend.position = "none",
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank())

Based on the data, HR department have the most stable work life balance with RnD and Sales department have similar work life balance