# Data source: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset/data
library(tidyverse)
# Load the IBM HR attrition data and run basic sanity checks ----
# stringsAsFactors is set explicitly: the default changed from TRUE to FALSE
# in R 4.0.0, and the str() output recorded below (text columns shown as
# Factor) is only reproduced when strings are read as factors.
df <- read.csv("~/IBM_HR/ibm_hr.csv", stringsAsFactors = TRUE)
# Check for NA values: a single FALSE count means no cell is missing.
table(is.na(df))
##
## FALSE
## 51450
# Check for duplicated rows.
sum(duplicated(df))
## [1] 0
# See data frame dimensions and column data types.
str(df)
## 'data.frame': 1470 obs. of 35 variables:
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
# Drop uninformative columns and encode categorical codes as factors ----
# EmployeeCount, StandardHours and Over18 show a single value in the str()
# output, so they carry no information. They are removed by name rather than
# by position so the step stays correct if the column order changes or the
# script is re-run on an already-cleaned data frame.
drop_cols <- c("EmployeeCount", "StandardHours", "Over18")
df <- df[, !(names(df) %in% drop_cols), drop = FALSE]

# Integer-coded ratings/levels (and the Yes/No OverTime flag) are categories,
# not quantities, so they are stored as factors.
# NOTE(review): JobLevel was previously assigned to itself (a no-op) at the
# end of this run of conversions; it is converted here on the assumption that
# as.factor() was intended -- confirm.
factor_cols <- c(
  "JobSatisfaction", "OverTime", "PerformanceRating",
  "RelationshipSatisfaction", "StockOptionLevel", "TrainingTimesLastYear",
  "WorkLifeBalance", "Education", "EnvironmentSatisfaction",
  "JobInvolvement", "JobLevel"
)
df[factor_cols] <- lapply(df[factor_cols], as.factor)
# Monthly income distribution, split by attrition ----
# Overlaid semi-transparent densities, one per Attrition value; the y-axis
# label is deliberately blank.
df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_density(alpha = 0.6) +
  labs(
    title = "Attrition by income level",
    x = "Monthly Income",
    y = ""
  ) +
  theme_classic()
# Attrition rate by department ----
# Count employees per (Department, Attrition) pair. summarise() leaves the
# result grouped by Department, so sum(n) inside mutate() is that department's
# headcount. Deriving the total from the counts avoids hard-coding department
# names and works for any set of departments.
dept <- df %>%
  group_by(Department, Attrition) %>%
  summarise(n = n()) %>%
  mutate(
    totals = sum(n),
    attr_perc = n / totals * 100
  ) %>%
  ungroup()

# Plot only the share of each department that left.
dept %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = Department, y = attr_perc, fill = Department)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Department") +
  theme_classic()
# Attrition rate by amount of business travel ----
# Do people who travel more for business leave at higher rates?
# Count employees per (BusinessTravel, Attrition) pair. summarise() leaves the
# result grouped by BusinessTravel, so sum(n) inside mutate() is the headcount
# of that travel category. This avoids hard-coding the category names.
trav_attr <- df %>%
  group_by(BusinessTravel, Attrition) %>%
  summarise(n = n()) %>%
  mutate(
    totals = sum(n),
    attr_perc = n / totals * 100
  ) %>%
  ungroup()

# Plot only the share of each travel category that left.
trav_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = BusinessTravel, y = attr_perc, fill = BusinessTravel)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Amount of Business Travel") +
  theme_classic()
# Attrition rate by education level ----
# Headcount per education level: the denominator of the attrition percentage.
edu_tots <- df %>%
  group_by(Education) %>%
  summarise(num = n())

# Count employees per (Education, Attrition) pair, attach the level headcount
# and express each count as a percentage of its level.
edu_attr <- df %>%
  group_by(Education, Attrition) %>%
  summarise(n = n()) %>%
  left_join(edu_tots, by = "Education") %>%
  mutate(attr_perc = n / num * 100)

# Plot only the share of each level that left.
edu_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = Education, y = attr_perc, fill = Education)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Education Level",
    x = "Education Level",
    y = "% of employees"
  ) +
  theme_classic()
# Attrition rate by marital status ----
# Headcount per marital status: the denominator of the attrition percentage.
mar_tots <- df %>%
  group_by(MaritalStatus) %>%
  summarise(nums = n())

# Count employees per (MaritalStatus, Attrition) pair, attach the status
# headcount and express each count as a percentage of its status group.
mar_attr <- df %>%
  group_by(MaritalStatus, Attrition) %>%
  summarise(n = n()) %>%
  left_join(mar_tots, by = "MaritalStatus") %>%
  mutate(attr_perc = n / nums * 100)

# Plot only the share of each status group that left.
mar_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = MaritalStatus, y = attr_perc, fill = MaritalStatus)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Marital Status",
    x = "",
    y = "% of employees"
  ) +
  theme_classic()
# Attrition rate by gender and marital status ----
# Headcount per (Gender, MaritalStatus) cell: the denominator of the
# attrition percentage.
gend_tots <- df %>%
  group_by(Gender, MaritalStatus) %>%
  summarise(nums = n())

# Count employees per (Gender, MaritalStatus, Attrition) combination and
# attach the cell headcount. The join keys are spelled out so the join cannot
# silently pick up an extra shared column and does not emit the
# "Joining, by = ..." message.
gend_attr <- df %>%
  group_by(Gender, MaritalStatus, Attrition) %>%
  summarise(n = n()) %>%
  left_join(gend_tots, by = c("Gender", "MaritalStatus")) %>%
  mutate(attr_perc = n / nums * 100)

# Combined axis label, e.g. "Married Female".
gend_attr$type <- paste(gend_attr$MaritalStatus, gend_attr$Gender, sep = " ")

# Plot only the share of each cell that left; bars are flipped to horizontal
# so the combined labels stay readable.
gend_attr %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = type, y = attr_perc, fill = Gender)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "", y = "% of employees") +
  ggtitle("Attrition % by Gender and Marital Status") +
  theme_classic() +
  coord_flip()
# Attrition rate by job involvement level ----
# Headcount per involvement level: the denominator of the attrition percentage.
jobtots <- df %>%
  group_by(JobInvolvement) %>%
  summarise(nums = n())

# Count employees per (JobInvolvement, Attrition) pair, attach the level
# headcount and express each count as a percentage of its level.
jobinv <- df %>%
  group_by(JobInvolvement, Attrition) %>%
  summarise(n = n()) %>%
  left_join(jobtots, by = "JobInvolvement") %>%
  mutate(attr_perc = n / nums * 100)

# Plot only the share of each level that left.
jobinv %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = JobInvolvement, y = attr_perc, fill = JobInvolvement)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Job Involvement",
    x = "Job Involvement Level",
    y = "% of employee attrition"
  ) +
  theme_classic()
# Attrition rate by overtime status ----
# Headcount per overtime status: the denominator of the attrition percentage.
ovttots <- df %>%
  group_by(OverTime) %>%
  summarise(nums = n())

# Count employees per (OverTime, Attrition) pair, attach the status headcount
# and express each count as a percentage of its status group.
ovt <- df %>%
  group_by(OverTime, Attrition) %>%
  summarise(n = n()) %>%
  left_join(ovttots, by = "OverTime") %>%
  mutate(attr_perc = n / nums * 100)

# Plot only the share of each group that left.
ovt %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = OverTime, y = attr_perc, fill = OverTime)) +
  geom_col(alpha = 0.9) +
  labs(
    title = "Attrition % by Overtime (Y/N)",
    x = "Overtime",
    y = "% of employee attrition"
  ) +
  theme_classic()
# Headcount by years at the company, split by attrition ----
# Leavers per tenure value, relabelled "Left" for the legend.
yrcomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsAtCompany) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")

# Stayers per tenure value, relabelled "Stayed" for the legend.
yrno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsAtCompany) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")

# Stack the two count tables (leavers first) and draw stacked bars.
yearscomp <- bind_rows(yrcomp, yrno)
yearscomp %>%
  ggplot(aes(x = YearsAtCompany, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years worked for company",
    x = "Years With Company",
    y = "# of employees"
  ) +
  theme_classic()
# Headcount by years in current role, split by attrition ----
# Leavers per years-in-role value, relabelled "Left" for the legend.
yrscomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsInCurrentRole) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")

# Stayers per years-in-role value, relabelled "Stayed" for the legend.
yrsno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsInCurrentRole) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")

# Stack the two count tables (leavers first) and draw stacked bars.
jyearscomp <- bind_rows(yrscomp, yrsno)
jyearscomp %>%
  ggplot(aes(x = YearsInCurrentRole, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years in current role",
    x = "Years In Current Role",
    y = "# of employees"
  ) +
  theme_classic()
# Headcount by years with current manager, split by attrition ----
# Leavers per years-with-manager value, relabelled "Left" for the legend.
mancomp <- df %>%
  filter(Attrition == "Yes") %>%
  group_by(YearsWithCurrManager) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Left")

# Stayers per years-with-manager value, relabelled "Stayed" for the legend.
manno <- df %>%
  filter(Attrition == "No") %>%
  group_by(YearsWithCurrManager) %>%
  summarise(n = n()) %>%
  mutate(Attrition = "Stayed")

# Stack the two count tables (leavers first) and draw stacked bars.
managercomp <- bind_rows(mancomp, manno)
managercomp %>%
  ggplot(aes(x = YearsWithCurrManager, y = n, fill = Attrition)) +
  geom_col(position = "stack", alpha = 0.9) +
  labs(
    title = "Attrition by years with current manager",
    x = "Years With Current Manager",
    y = "# of employees"
  ) +
  theme_classic()