Introduction

Write the introduction here.

Library

Install the packages if they are not already installed in R, then load them.

#install.packages("dplyr")
#install.packages("janitor")
#install.packages("ggplot2")
#install.packages("ggthemes")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(ggthemes)

Load Data

There are two datasets.

# Read each split from the file of the same name. The file names were
# previously swapped, which left `train` holding the smaller test split.
# Re-knit the document so the printed output reflects the corrected objects.
train <- read.csv("train.csv")
test <- read.csv("test.csv")

# Quick structure overview
glimpse(train)
## Rows: 14,900
## Columns: 24
## $ Employee.ID              <int> 52685, 30585, 54656, 33442, 15667, 3496, 4677…
## $ Age                      <int> 36, 35, 50, 58, 39, 45, 22, 34, 48, 55, 32, 2…
## $ Gender                   <chr> "Male", "Male", "Male", "Male", "Male", "Fema…
## $ Years.at.Company         <int> 13, 7, 7, 44, 24, 30, 5, 15, 40, 16, 12, 15, …
## $ Job.Role                 <chr> "Healthcare", "Education", "Education", "Medi…
## $ Monthly.Income           <int> 8029, 4563, 5583, 5525, 4604, 8104, 8700, 110…
## $ Work.Life.Balance        <chr> "Excellent", "Good", "Fair", "Fair", "Good", …
## $ Job.Satisfaction         <chr> "High", "High", "High", "Very High", "High", …
## $ Performance.Rating       <chr> "Average", "Average", "Average", "High", "Ave…
## $ Number.of.Promotions     <int> 1, 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 1, …
## $ Overtime                 <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No"…
## $ Distance.from.Home       <int> 83, 55, 14, 43, 47, 38, 2, 9, 65, 31, 28, 35,…
## $ Education.Level          <chr> "Master’s Degree", "Associate Degree", "Assoc…
## $ Marital.Status           <chr> "Married", "Single", "Divorced", "Single", "M…
## $ Number.of.Dependents     <int> 1, 4, 2, 4, 6, 0, 0, 4, 1, 1, 1, 1, 3, 0, 0, …
## $ Job.Level                <chr> "Mid", "Entry", "Senior", "Entry", "Mid", "Se…
## $ Company.Size             <chr> "Large", "Medium", "Medium", "Medium", "Large…
## $ Company.Tenure           <int> 22, 27, 76, 96, 45, 75, 48, 16, 52, 46, 57, 9…
## $ Remote.Work              <chr> "No", "No", "No", "No", "Yes", "No", "No", "N…
## $ Leadership.Opportunities <chr> "No", "No", "No", "No", "No", "No", "No", "No…
## $ Innovation.Opportunities <chr> "No", "No", "Yes", "No", "No", "No", "No", "N…
## $ Company.Reputation       <chr> "Poor", "Good", "Good", "Poor", "Good", "Good…
## $ Employee.Recognition     <chr> "Medium", "High", "Low", "Low", "High", "Low"…
## $ Attrition                <chr> "Stayed", "Left", "Stayed", "Left", "Stayed",…
glimpse(test)
## Rows: 59,598
## Columns: 24
## $ Employee.ID              <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age                      <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender                   <chr> "Male", "Female", "Female", "Female", "Male",…
## $ Years.at.Company         <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role                 <chr> "Education", "Media", "Healthcare", "Educatio…
## $ Monthly.Income           <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance        <chr> "Excellent", "Poor", "Good", "Good", "Fair", …
## $ Job.Satisfaction         <chr> "Medium", "High", "High", "High", "Very High"…
## $ Performance.Rating       <chr> "Average", "Low", "Low", "High", "Average", "…
## $ Number.of.Promotions     <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime                 <chr> "No", "No", "No", "No", "Yes", "No", "Yes", "…
## $ Distance.from.Home       <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level          <chr> "Associate Degree", "Master’s Degree", "Bache…
## $ Marital.Status           <chr> "Married", "Divorced", "Married", "Single", "…
## $ Number.of.Dependents     <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level                <chr> "Mid", "Mid", "Mid", "Mid", "Senior", "Mid", …
## $ Company.Size             <chr> "Medium", "Medium", "Medium", "Small", "Mediu…
## $ Company.Tenure           <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work              <chr> "No", "No", "No", "Yes", "No", "No", "No", "N…
## $ Leadership.Opportunities <chr> "No", "No", "No", "No", "No", "No", "No", "No…
## $ Innovation.Opportunities <chr> "No", "No", "No", "No", "No", "Yes", "No", "N…
## $ Company.Reputation       <chr> "Excellent", "Fair", "Poor", "Good", "Fair", …
## $ Employee.Recognition     <chr> "Medium", "Low", "Low", "Medium", "Medium", "…
## $ Attrition                <chr> "Stayed", "Stayed", "Stayed", "Stayed", "Stay…
summary(train)
##   Employee.ID         Age           Gender          Years.at.Company
##  Min.   :    5   Min.   :18.00   Length:14900       Min.   : 1.00   
##  1st Qu.:18826   1st Qu.:28.00   Class :character   1st Qu.: 7.00   
##  Median :37433   Median :38.00   Mode  :character   Median :13.00   
##  Mean   :37339   Mean   :38.39                      Mean   :15.59   
##  3rd Qu.:55858   3rd Qu.:49.00                      3rd Qu.:23.00   
##  Max.   :74471   Max.   :59.00                      Max.   :51.00   
##    Job.Role         Monthly.Income  Work.Life.Balance  Job.Satisfaction  
##  Length:14900       Min.   : 1226   Length:14900       Length:14900      
##  Class :character   1st Qu.: 5634   Class :character   Class :character  
##  Mode  :character   Median : 7332   Mode  :character   Mode  :character  
##                     Mean   : 7287                                        
##                     3rd Qu.: 8852                                        
##                     Max.   :15063                                        
##  Performance.Rating Number.of.Promotions   Overtime         Distance.from.Home
##  Length:14900       Min.   :0.0000       Length:14900       Min.   : 1.00     
##  Class :character   1st Qu.:0.0000       Class :character   1st Qu.:25.00     
##  Mode  :character   Median :1.0000       Mode  :character   Median :50.00     
##                     Mean   :0.8344                          Mean   :49.93     
##                     3rd Qu.:2.0000                          3rd Qu.:75.00     
##                     Max.   :4.0000                          Max.   :99.00     
##  Education.Level    Marital.Status     Number.of.Dependents  Job.Level        
##  Length:14900       Length:14900       Min.   :0.000        Length:14900      
##  Class :character   Class :character   1st Qu.:0.000        Class :character  
##  Mode  :character   Mode  :character   Median :1.000        Mode  :character  
##                                        Mean   :1.659                          
##                                        3rd Qu.:3.000                          
##                                        Max.   :6.000                          
##  Company.Size       Company.Tenure  Remote.Work        Leadership.Opportunities
##  Length:14900       Min.   :  2.0   Length:14900       Length:14900            
##  Class :character   1st Qu.: 36.0   Class :character   Class :character        
##  Mode  :character   Median : 56.0   Mode  :character   Mode  :character        
##                     Mean   : 55.6                                              
##                     3rd Qu.: 75.0                                              
##                     Max.   :127.0                                              
##  Innovation.Opportunities Company.Reputation Employee.Recognition
##  Length:14900             Length:14900       Length:14900        
##  Class :character         Class :character   Class :character    
##  Mode  :character         Mode  :character   Mode  :character    
##                                                                  
##                                                                  
##                                                                  
##   Attrition        
##  Length:14900      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(test)
##   Employee.ID         Age           Gender          Years.at.Company
##  Min.   :    1   Min.   :18.00   Length:59598       Min.   : 1.00   
##  1st Qu.:18580   1st Qu.:28.00   Class :character   1st Qu.: 7.00   
##  Median :37210   Median :39.00   Mode  :character   Median :13.00   
##  Mean   :37227   Mean   :38.57                      Mean   :15.75   
##  3rd Qu.:55877   3rd Qu.:49.00                      3rd Qu.:23.00   
##  Max.   :74498   Max.   :59.00                      Max.   :51.00   
##    Job.Role         Monthly.Income  Work.Life.Balance  Job.Satisfaction  
##  Length:59598       Min.   : 1316   Length:59598       Length:59598      
##  Class :character   1st Qu.: 5658   Class :character   Class :character  
##  Mode  :character   Median : 7354   Mode  :character   Mode  :character  
##                     Mean   : 7302                                        
##                     3rd Qu.: 8880                                        
##                     Max.   :16149                                        
##  Performance.Rating Number.of.Promotions   Overtime         Distance.from.Home
##  Length:59598       Min.   :0.0000       Length:59598       Min.   : 1.00     
##  Class :character   1st Qu.:0.0000       Class :character   1st Qu.:25.00     
##  Mode  :character   Median :1.0000       Mode  :character   Median :50.00     
##                     Mean   :0.8326                          Mean   :50.01     
##                     3rd Qu.:2.0000                          3rd Qu.:75.00     
##                     Max.   :4.0000                          Max.   :99.00     
##  Education.Level    Marital.Status     Number.of.Dependents  Job.Level        
##  Length:59598       Length:59598       Min.   :0.000        Length:59598      
##  Class :character   Class :character   1st Qu.:0.000        Class :character  
##  Mode  :character   Mode  :character   Median :1.000        Mode  :character  
##                                        Mean   :1.648                          
##                                        3rd Qu.:3.000                          
##                                        Max.   :6.000                          
##  Company.Size       Company.Tenure   Remote.Work       
##  Length:59598       Min.   :  2.00   Length:59598      
##  Class :character   1st Qu.: 36.00   Class :character  
##  Mode  :character   Median : 56.00   Mode  :character  
##                     Mean   : 55.76                     
##                     3rd Qu.: 76.00                     
##                     Max.   :128.00                     
##  Leadership.Opportunities Innovation.Opportunities Company.Reputation
##  Length:59598             Length:59598             Length:59598      
##  Class :character         Class :character         Class :character  
##  Mode  :character         Mode  :character         Mode  :character  
##                                                                      
##                                                                      
##                                                                      
##  Employee.Recognition  Attrition        
##  Length:59598         Length:59598      
##  Class :character     Class :character  
##  Mode  :character     Mode  :character  
##                                         
##                                         
## 
# Check missing values
colSums(is.na(train))
##              Employee.ID                      Age                   Gender 
##                        0                        0                        0 
##         Years.at.Company                 Job.Role           Monthly.Income 
##                        0                        0                        0 
##        Work.Life.Balance         Job.Satisfaction       Performance.Rating 
##                        0                        0                        0 
##     Number.of.Promotions                 Overtime       Distance.from.Home 
##                        0                        0                        0 
##          Education.Level           Marital.Status     Number.of.Dependents 
##                        0                        0                        0 
##                Job.Level             Company.Size           Company.Tenure 
##                        0                        0                        0 
##              Remote.Work Leadership.Opportunities Innovation.Opportunities 
##                        0                        0                        0 
##       Company.Reputation     Employee.Recognition                Attrition 
##                        0                        0                        0
colSums(is.na(test))
##              Employee.ID                      Age                   Gender 
##                        0                        0                        0 
##         Years.at.Company                 Job.Role           Monthly.Income 
##                        0                        0                        0 
##        Work.Life.Balance         Job.Satisfaction       Performance.Rating 
##                        0                        0                        0 
##     Number.of.Promotions                 Overtime       Distance.from.Home 
##                        0                        0                        0 
##          Education.Level           Marital.Status     Number.of.Dependents 
##                        0                        0                        0 
##                Job.Level             Company.Size           Company.Tenure 
##                        0                        0                        0 
##              Remote.Work Leadership.Opportunities Innovation.Opportunities 
##                        0                        0                        0 
##       Company.Reputation     Employee.Recognition                Attrition 
##                        0                        0                        0
# Check duplicates
get_dupes(train)
## No variable names specified - using all columns.
## No duplicate combinations found of: Employee.ID, Age, Gender, Years.at.Company, Job.Role, Monthly.Income, Work.Life.Balance, Job.Satisfaction, Performance.Rating, ... and 15 other variables
##  [1] Employee.ID              Age                      Gender                  
##  [4] Years.at.Company         Job.Role                 Monthly.Income          
##  [7] Work.Life.Balance        Job.Satisfaction         Performance.Rating      
## [10] Number.of.Promotions     Overtime                 Distance.from.Home      
## [13] Education.Level          Marital.Status           Number.of.Dependents    
## [16] Job.Level                Company.Size             Company.Tenure          
## [19] Remote.Work              Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation       Employee.Recognition     Attrition               
## [25] dupe_count              
## <0 rows> (or 0-length row.names)
get_dupes(test)
## No variable names specified - using all columns.
## 
## No duplicate combinations found of: Employee.ID, Age, Gender, Years.at.Company, Job.Role, Monthly.Income, Work.Life.Balance, Job.Satisfaction, Performance.Rating, ... and 15 other variables
##  [1] Employee.ID              Age                      Gender                  
##  [4] Years.at.Company         Job.Role                 Monthly.Income          
##  [7] Work.Life.Balance        Job.Satisfaction         Performance.Rating      
## [10] Number.of.Promotions     Overtime                 Distance.from.Home      
## [13] Education.Level          Marital.Status           Number.of.Dependents    
## [16] Job.Level                Company.Size             Company.Tenure          
## [19] Remote.Work              Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation       Employee.Recognition     Attrition               
## [25] dupe_count              
## <0 rows> (or 0-length row.names)
# The metadata describes Company.Tenure as years, yet values such as 89 or 95
# are implausible as years, so the column is treated as months here and
# converted to years (rounded to 2 decimal places).
train[["Company.Tenure"]] <- round(train[["Company.Tenure"]] / 12, digits = 2)
test[["Company.Tenure"]] <- round(test[["Company.Tenure"]] / 12, digits = 2)

Check logical consistency

## Check Age Vs Company.Tenure (must not exceed age)
train %>% filter(Company.Tenure > Age)
##  [1] Employee.ID              Age                      Gender                  
##  [4] Years.at.Company         Job.Role                 Monthly.Income          
##  [7] Work.Life.Balance        Job.Satisfaction         Performance.Rating      
## [10] Number.of.Promotions     Overtime                 Distance.from.Home      
## [13] Education.Level          Marital.Status           Number.of.Dependents    
## [16] Job.Level                Company.Size             Company.Tenure          
## [19] Remote.Work              Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation       Employee.Recognition     Attrition               
## <0 rows> (or 0-length row.names)
test %>% filter(Company.Tenure > Age)
##  [1] Employee.ID              Age                      Gender                  
##  [4] Years.at.Company         Job.Role                 Monthly.Income          
##  [7] Work.Life.Balance        Job.Satisfaction         Performance.Rating      
## [10] Number.of.Promotions     Overtime                 Distance.from.Home      
## [13] Education.Level          Marital.Status           Number.of.Dependents    
## [16] Job.Level                Company.Size             Company.Tenure          
## [19] Remote.Work              Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation       Employee.Recognition     Attrition               
## <0 rows> (or 0-length row.names)
## Check Age Vs Years.at.Company (must not exceed age)
checkdata2 <- function(df) {
  # Number of rows whose Years.at.Company is larger than Age
  n_invalid <- nrow(filter(df, Years.at.Company > Age))

  # Report the outcome: either the count of offending rows or an all-clear
  if (n_invalid > 0) {
    print(paste(n_invalid, "rows have years_at_company > age"))
  } else {
    print("No problem.No one has years_at_company greater than age.")
  }
}
checkdata2(train)
## [1] "No problem.No one has years_at_company greater than age."
checkdata2(test)
## [1] "No problem.No one has years_at_company greater than age."
# Convert categorical variables to factors
type_change <- function(df) {
  # Columns that hold categorical values and should be stored as factors
  factor_candidates <- c(
    "Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
    "Performance.Rating", "Marital.Status", "Overtime", "Education.Level",
    "Job.Level", "Company.Size", "Remote.Work", "Leadership.Opportunities",
    "Innovation.Opportunities", "Company.Reputation", "Employee.Recognition",
    "Attrition"
  )

  # Only convert the candidates that are actually present in `df`;
  # every other column is returned untouched
  for (nm in intersect(factor_candidates, names(df))) {
    df[[nm]] <- as.factor(df[[nm]])
  }

  df
}
train <- type_change(train)
test <- type_change(test)
glimpse(train)
## Rows: 14,900
## Columns: 24
## $ Employee.ID              <int> 52685, 30585, 54656, 33442, 15667, 3496, 4677…
## $ Age                      <int> 36, 35, 50, 58, 39, 45, 22, 34, 48, 55, 32, 2…
## $ Gender                   <fct> Male, Male, Male, Male, Male, Female, Female,…
## $ Years.at.Company         <int> 13, 7, 7, 44, 24, 30, 5, 15, 40, 16, 12, 15, …
## $ Job.Role                 <fct> Healthcare, Education, Education, Media, Educ…
## $ Monthly.Income           <int> 8029, 4563, 5583, 5525, 4604, 8104, 8700, 110…
## $ Work.Life.Balance        <fct> Excellent, Good, Fair, Fair, Good, Fair, Good…
## $ Job.Satisfaction         <fct> High, High, High, Very High, High, High, High…
## $ Performance.Rating       <fct> Average, Average, Average, High, Average, Ave…
## $ Number.of.Promotions     <int> 1, 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 1, …
## $ Overtime                 <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, …
## $ Distance.from.Home       <int> 83, 55, 14, 43, 47, 38, 2, 9, 65, 31, 28, 35,…
## $ Education.Level          <fct> Master’s Degree, Associate Degree, Associate …
## $ Marital.Status           <fct> Married, Single, Divorced, Single, Married, D…
## $ Number.of.Dependents     <int> 1, 4, 2, 4, 6, 0, 0, 4, 1, 1, 1, 1, 3, 0, 0, …
## $ Job.Level                <fct> Mid, Entry, Senior, Entry, Mid, Senior, Mid, …
## $ Company.Size             <fct> Large, Medium, Medium, Medium, Large, Large, …
## $ Company.Tenure           <dbl> 1.83, 2.25, 6.33, 8.00, 3.75, 6.25, 4.00, 1.3…
## $ Remote.Work              <fct> No, No, No, No, Yes, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, Yes, No, No, No, No, No, No, No, No, …
## $ Company.Reputation       <fct> Poor, Good, Good, Poor, Good, Good, Poor, Goo…
## $ Employee.Recognition     <fct> Medium, High, Low, Low, High, Low, High, Low,…
## $ Attrition                <fct> Stayed, Left, Stayed, Left, Stayed, Stayed, S…
glimpse(test)
## Rows: 59,598
## Columns: 24
## $ Employee.ID              <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age                      <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender                   <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company         <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role                 <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income           <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance        <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction         <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating       <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions     <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime                 <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home       <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level          <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status           <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents     <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level                <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size             <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure           <dbl> 7.42, 1.75, 6.17, 4.17, 5.67, 3.92, 7.75, 7.3…
## $ Remote.Work              <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation       <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition     <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition                <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…
# Check outliers for Monthly.Income with Boxplot
ggplot(train, aes(y=Monthly.Income)) + geom_boxplot()

ggplot(test, aes(y=Monthly.Income)) + geom_boxplot()

# Remove Monthly.Income outliers from the training set using Tukey's fences:
# rows at or beyond 1.5 * (Q3 - Q1) from the quartiles are dropped.
Q1 <- quantile(train$Monthly.Income, 0.25)
Q3 <- quantile(train$Monthly.Income, 0.75)
# Stored as `income_iqr` rather than `IQR` to avoid assigning over the name
# of the stats::IQR() function.
income_iqr <- Q3 - Q1
lower_fence <- Q1 - 1.5 * income_iqr
upper_fence <- Q3 + 1.5 * income_iqr
train_clean <- train %>%
  filter(Monthly.Income > lower_fence & Monthly.Income < upper_fence)

ggplot(train_clean, aes(y=Monthly.Income)) + geom_boxplot()

# Remove Monthly.Income outliers from the testing set using Tukey's fences:
# rows at or beyond 1.5 * (Q3 - Q1) from the quartiles are dropped.
# NOTE(review): the fences are computed from the test set itself; confirm
# whether the test set should be filtered with its own statistics.
Q1 <- quantile(test$Monthly.Income, 0.25)
Q3 <- quantile(test$Monthly.Income, 0.75)
# Stored as `income_iqr` rather than `IQR` to avoid assigning over the name
# of the stats::IQR() function.
income_iqr <- Q3 - Q1
lower_fence <- Q1 - 1.5 * income_iqr
upper_fence <- Q3 + 1.5 * income_iqr
test_clean <- test %>%
  filter(Monthly.Income > lower_fence & Monthly.Income < upper_fence)

ggplot(test_clean, aes(y=Monthly.Income)) + geom_boxplot()

# Final versions
#View(train_clean)
#View(test_clean)

# Export cleaned data
#write.csv(train_clean, "train_clean.csv", row.names = FALSE)
#write.csv(test_clean, "test_clean.csv", row.names = FALSE)

Data Visualization

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.