By: Aaron Stearns

This dataset contains employee attrition data and is available here:

https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset/data

library(tidyverse)

# Load the IBM HR attrition data.
# `stringsAsFactors = TRUE` is set explicitly because R >= 4.0.0 changed the
# default to FALSE; the str() output further down shows the text columns
# (Attrition, Department, ...) as factors, so this keeps the result the same
# on every R version.
df <- read.csv("~/IBM_HR/ibm_hr.csv", stringsAsFactors = TRUE)

# Check for NA values: is.na(df) flags every cell and table() tallies the
# flags. Only FALSE appears in the output below, so no cell is missing.
table(is.na(df))
## 
## FALSE 
## 51450
# Check for duplicated rows: duplicated(df) marks rows that repeat an earlier
# row, so the sum is the number of duplicate rows.
sum(duplicated(df)) 
## [1] 0
# Show the data frame's dimensions and each column's data type.
str(df)
## 'data.frame':    1470 obs. of  35 variables:
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : int  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ MonthlyRate             : int  19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : int  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : int  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : int  5 7 0 0 2 6 0 0 8 7 ...
# Remove the EmployeeCount, Over18 and StandardHours columns; in the str()
# output above they appear to hold a single value each, so they add nothing.
# Columns are selected by name rather than by position so that this step
# stays correct if the column order changes and is safe to re-run (negative
# positions would drop three *different* columns on a second run).
constant_cols <- c("EmployeeCount", "Over18", "StandardHours")
df <- df[, setdiff(names(df), constant_cols), drop = FALSE]

# Convert the rating / level style integer columns (and OverTime) to factors
# so they are treated as discrete categories when grouping and plotting.
# JobLevel is converted as well: the original statement assigned the column
# to itself (`df$JobLevel <- df$JobLevel`), a no-op that left it an integer
# while every sibling column was converted.
factor_cols <- c(
  "JobSatisfaction",
  "OverTime",
  "PerformanceRating",
  "RelationshipSatisfaction",
  "StockOptionLevel",
  "TrainingTimesLastYear",
  "WorkLifeBalance",
  "Education",
  "EnvironmentSatisfaction",
  "JobInvolvement",
  "JobLevel"
)
df[factor_cols] <- lapply(df[factor_cols], as.factor)

# Compare the monthly income distributions of the two Attrition groups
# with overlaid, semi-transparent density curves.
df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_density(alpha = 0.6) +
  labs(title = "Attrition by income level", x = "Monthly Income", y = "") +
  theme_classic()

### What percentage of each department's employees are leaving?
# Count employees per Department x Attrition, then divide each count by the
# department's headcount. The headcount is derived from the counts themselves
# (sum of n within a department) instead of a hard-coded nested ifelse() over
# the department names, whose final fallback (`df$Department`) had the wrong
# length and type and would have produced meaningless totals for any
# department not listed.
dept <- df %>%
  group_by(Department, Attrition) %>%
  summarise(n = n()) %>%
  group_by(Department) %>%
  mutate(
    totals = sum(n),
    attr_perc = n / totals * 100
  ) %>%
  ungroup()

# Plot only the "Yes" rows: the share of each department that left.
dept %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = Department, y = attr_perc, fill = Department)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Department") +
  theme_classic()

## Do people who travel more for business leave at higher rates?
# Count employees per BusinessTravel x Attrition, then divide each count by
# the headcount of that travel category. The headcount is computed from the
# counts (sum of n within a category) rather than a hard-coded nested ifelse()
# over the level names; the old fallback returned the factor itself, so an
# unlisted level would silently have used its integer code as the total.
trav_attr <- df %>%
  group_by(BusinessTravel, Attrition) %>%
  summarise(n = n()) %>%
  group_by(BusinessTravel) %>%
  mutate(
    totals = sum(n),
    attr_perc = n / totals * 100
  ) %>%
  ungroup()

# Plot only the "Yes" rows: the share of each travel category that left.
trav_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = BusinessTravel, y = attr_perc, fill = BusinessTravel)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Amount of Business Travel") +
  theme_classic()

## Attrition rate within each education level
# Headcount per education level (the denominator of the rate).
edu_tots <- summarise(group_by(df, Education), num = n())

# Count per Education x Attrition, attach the headcount, compute the percent.
edu_attr <- df %>%
  group_by(Education, Attrition) %>%
  summarise(n = n()) %>%
  left_join(edu_tots, by = "Education") %>%
  mutate(attr_perc = n / num * 100)

# Bar chart of the "Yes" rows only.
edu_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = Education, y = attr_perc, fill = Education)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Education Level",
    x = "Education Level",
    y = "% of employees"
  ) +
  theme_classic()

### Attrition rate within each marital status
# Headcount per marital status (the denominator of the rate).
mar_tots <- summarise(group_by(df, MaritalStatus), nums = n())

# Count per MaritalStatus x Attrition, attach the headcount, compute percent.
mar_attr <- df %>%
  group_by(MaritalStatus, Attrition) %>%
  summarise(n = n()) %>%
  left_join(mar_tots, by = "MaritalStatus") %>%
  mutate(attr_perc = n / nums * 100)

# Bar chart of the "Yes" rows only.
mar_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = MaritalStatus, y = attr_perc, fill = MaritalStatus)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Marital Status",
    x = "",
    y = "% of employees"
  ) +
  theme_classic()

## Attrition by gender and marital status
# Count per Gender x MaritalStatus x Attrition, and headcount per
# Gender x MaritalStatus (the denominator of the rate).
gend_attr <- df %>%
  group_by(Gender, MaritalStatus, Attrition) %>%
  summarise(n = n())
gend_tots <- df %>%
  group_by(Gender, MaritalStatus) %>%
  summarise(nums = n())

# Join on explicit keys. Without `by`, left_join() falls back to a natural
# join on every shared column name (and prints a message); naming the keys
# matches the other joins in this script and guards against an accidental
# extra key if a shared column is ever added.
gend_attr <- left_join(gend_attr, gend_tots, by = c("Gender", "MaritalStatus"))
gend_attr <- gend_attr %>% mutate(attr_perc = n / nums * 100)

# Combined label such as "Married Female", used as the bar category.
gend_attr$type <- paste(gend_attr$MaritalStatus, gend_attr$Gender, sep = " ")

# Horizontal bar chart of the "Yes" rows only, coloured by gender.
gend_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = type, y = attr_perc, fill = Gender)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Gender and Marital Status") +
  theme_classic() +
  coord_flip()

# Attrition rate within each job involvement level
# Headcount per involvement level (the denominator of the rate).
jobtots <- summarise(group_by(df, JobInvolvement), nums = n())

# Count per JobInvolvement x Attrition, attach the headcount, compute percent.
jobinv <- df %>%
  group_by(JobInvolvement, Attrition) %>%
  summarise(n = n()) %>%
  left_join(jobtots, by = "JobInvolvement") %>%
  mutate(attr_perc = n / nums * 100)

# Bar chart of the "Yes" rows only.
jobinv %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = JobInvolvement, y = attr_perc, fill = JobInvolvement)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Job Involvement",
    x = "Job Involvement Level",
    y = "% of employee attrition"
  ) +
  theme_classic()

# Attrition rate for employees with and without overtime
# Headcount per overtime category (the denominator of the rate).
ovttots <- summarise(group_by(df, OverTime), nums = n())

# Count per OverTime x Attrition, attach the headcount, compute the percent.
ovt <- df %>%
  group_by(OverTime, Attrition) %>%
  summarise(n = n()) %>%
  left_join(ovttots, by = "OverTime") %>%
  mutate(attr_perc = n / nums * 100)

# Bar chart of the "Yes" rows only.
ovt %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = OverTime, y = attr_perc, fill = OverTime)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Overtime (Y/N)",
    x = "Overtime",
    y = "% of employee attrition"
  ) +
  theme_classic()

# Employee counts by years at the company, split into leavers and stayers.
# Attrition "Yes"/"No" is relabelled "Left"/"Stayed" for the legend.
yrcomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsAtCompany) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")
yrno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsAtCompany) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")
yearscomp <- bind_rows(yrcomp, yrno)

# Stacked bars: total height is the headcount at each tenure.
yearscomp %>%
  ggplot(aes(x = YearsAtCompany, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years worked for company",
    x = "Years With Company",
    y = "# of employees"
  ) +
  theme_classic()

# Employee counts by years in the current role, split into leavers and
# stayers. Attrition "Yes"/"No" is relabelled "Left"/"Stayed" for the legend.
yrscomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsInCurrentRole) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")
yrsno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsInCurrentRole) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")
jyearscomp <- bind_rows(yrscomp, yrsno)

# Stacked bars: total height is the headcount at each value.
jyearscomp %>%
  ggplot(aes(x = YearsInCurrentRole, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years in current role",
    x = "Years In Current Role",
    y = "# of employees"
  ) +
  theme_classic()

# Employee counts by years with the current manager, split into leavers and
# stayers. Attrition "Yes"/"No" is relabelled "Left"/"Stayed" for the legend.
mancomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsWithCurrManager) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")
manno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsWithCurrManager) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")
managercomp <- bind_rows(mancomp, manno)

# Stacked bars: total height is the headcount at each value.
managercomp %>%
  ggplot(aes(x = YearsWithCurrManager, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years with current manager",
    x = "Years With Current Manager",
    y = "# of employees"
  ) +
  theme_classic()