Write the introduction here.
Install the packages, if the library is not exist in R. Load the library, if there are exist.
#install.packages("dplyr")
#install.packages("janitor")
#install.packages("ggplot2")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(ggthemes)
There are two datasets.
train<-read.csv("test.csv")
test <-read.csv("train.csv")
# Quick structure overview
glimpse(train)
## Rows: 14,900
## Columns: 24
## $ Employee.ID <int> 52685, 30585, 54656, 33442, 15667, 3496, 4677…
## $ Age <int> 36, 35, 50, 58, 39, 45, 22, 34, 48, 55, 32, 2…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Fema…
## $ Years.at.Company <int> 13, 7, 7, 44, 24, 30, 5, 15, 40, 16, 12, 15, …
## $ Job.Role <chr> "Healthcare", "Education", "Education", "Medi…
## $ Monthly.Income <int> 8029, 4563, 5583, 5525, 4604, 8104, 8700, 110…
## $ Work.Life.Balance <chr> "Excellent", "Good", "Fair", "Fair", "Good", …
## $ Job.Satisfaction <chr> "High", "High", "High", "Very High", "High", …
## $ Performance.Rating <chr> "Average", "Average", "Average", "High", "Ave…
## $ Number.of.Promotions <int> 1, 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 1, …
## $ Overtime <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No"…
## $ Distance.from.Home <int> 83, 55, 14, 43, 47, 38, 2, 9, 65, 31, 28, 35,…
## $ Education.Level <chr> "Master’s Degree", "Associate Degree", "Assoc…
## $ Marital.Status <chr> "Married", "Single", "Divorced", "Single", "M…
## $ Number.of.Dependents <int> 1, 4, 2, 4, 6, 0, 0, 4, 1, 1, 1, 1, 3, 0, 0, …
## $ Job.Level <chr> "Mid", "Entry", "Senior", "Entry", "Mid", "Se…
## $ Company.Size <chr> "Large", "Medium", "Medium", "Medium", "Large…
## $ Company.Tenure <int> 22, 27, 76, 96, 45, 75, 48, 16, 52, 46, 57, 9…
## $ Remote.Work <chr> "No", "No", "No", "No", "Yes", "No", "No", "N…
## $ Leadership.Opportunities <chr> "No", "No", "No", "No", "No", "No", "No", "No…
## $ Innovation.Opportunities <chr> "No", "No", "Yes", "No", "No", "No", "No", "N…
## $ Company.Reputation <chr> "Poor", "Good", "Good", "Poor", "Good", "Good…
## $ Employee.Recognition <chr> "Medium", "High", "Low", "Low", "High", "Low"…
## $ Attrition <chr> "Stayed", "Left", "Stayed", "Left", "Stayed",…
glimpse(test)
## Rows: 59,598
## Columns: 24
## $ Employee.ID <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender <chr> "Male", "Female", "Female", "Female", "Male",…
## $ Years.at.Company <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role <chr> "Education", "Media", "Healthcare", "Educatio…
## $ Monthly.Income <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance <chr> "Excellent", "Poor", "Good", "Good", "Fair", …
## $ Job.Satisfaction <chr> "Medium", "High", "High", "High", "Very High"…
## $ Performance.Rating <chr> "Average", "Low", "Low", "High", "Average", "…
## $ Number.of.Promotions <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime <chr> "No", "No", "No", "No", "Yes", "No", "Yes", "…
## $ Distance.from.Home <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level <chr> "Associate Degree", "Master’s Degree", "Bache…
## $ Marital.Status <chr> "Married", "Divorced", "Married", "Single", "…
## $ Number.of.Dependents <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level <chr> "Mid", "Mid", "Mid", "Mid", "Senior", "Mid", …
## $ Company.Size <chr> "Medium", "Medium", "Medium", "Small", "Mediu…
## $ Company.Tenure <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work <chr> "No", "No", "No", "Yes", "No", "No", "No", "N…
## $ Leadership.Opportunities <chr> "No", "No", "No", "No", "No", "No", "No", "No…
## $ Innovation.Opportunities <chr> "No", "No", "No", "No", "No", "Yes", "No", "N…
## $ Company.Reputation <chr> "Excellent", "Fair", "Poor", "Good", "Fair", …
## $ Employee.Recognition <chr> "Medium", "Low", "Low", "Medium", "Medium", "…
## $ Attrition <chr> "Stayed", "Stayed", "Stayed", "Stayed", "Stay…
summary(train)
## Employee.ID Age Gender Years.at.Company
## Min. : 5 Min. :18.00 Length:14900 Min. : 1.00
## 1st Qu.:18826 1st Qu.:28.00 Class :character 1st Qu.: 7.00
## Median :37433 Median :38.00 Mode :character Median :13.00
## Mean :37339 Mean :38.39 Mean :15.59
## 3rd Qu.:55858 3rd Qu.:49.00 3rd Qu.:23.00
## Max. :74471 Max. :59.00 Max. :51.00
## Job.Role Monthly.Income Work.Life.Balance Job.Satisfaction
## Length:14900 Min. : 1226 Length:14900 Length:14900
## Class :character 1st Qu.: 5634 Class :character Class :character
## Mode :character Median : 7332 Mode :character Mode :character
## Mean : 7287
## 3rd Qu.: 8852
## Max. :15063
## Performance.Rating Number.of.Promotions Overtime Distance.from.Home
## Length:14900 Min. :0.0000 Length:14900 Min. : 1.00
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.:25.00
## Mode :character Median :1.0000 Mode :character Median :50.00
## Mean :0.8344 Mean :49.93
## 3rd Qu.:2.0000 3rd Qu.:75.00
## Max. :4.0000 Max. :99.00
## Education.Level Marital.Status Number.of.Dependents Job.Level
## Length:14900 Length:14900 Min. :0.000 Length:14900
## Class :character Class :character 1st Qu.:0.000 Class :character
## Mode :character Mode :character Median :1.000 Mode :character
## Mean :1.659
## 3rd Qu.:3.000
## Max. :6.000
## Company.Size Company.Tenure Remote.Work Leadership.Opportunities
## Length:14900 Min. : 2.0 Length:14900 Length:14900
## Class :character 1st Qu.: 36.0 Class :character Class :character
## Mode :character Median : 56.0 Mode :character Mode :character
## Mean : 55.6
## 3rd Qu.: 75.0
## Max. :127.0
## Innovation.Opportunities Company.Reputation Employee.Recognition
## Length:14900 Length:14900 Length:14900
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Attrition
## Length:14900
## Class :character
## Mode :character
##
##
##
summary(test)
## Employee.ID Age Gender Years.at.Company
## Min. : 1 Min. :18.00 Length:59598 Min. : 1.00
## 1st Qu.:18580 1st Qu.:28.00 Class :character 1st Qu.: 7.00
## Median :37210 Median :39.00 Mode :character Median :13.00
## Mean :37227 Mean :38.57 Mean :15.75
## 3rd Qu.:55877 3rd Qu.:49.00 3rd Qu.:23.00
## Max. :74498 Max. :59.00 Max. :51.00
## Job.Role Monthly.Income Work.Life.Balance Job.Satisfaction
## Length:59598 Min. : 1316 Length:59598 Length:59598
## Class :character 1st Qu.: 5658 Class :character Class :character
## Mode :character Median : 7354 Mode :character Mode :character
## Mean : 7302
## 3rd Qu.: 8880
## Max. :16149
## Performance.Rating Number.of.Promotions Overtime Distance.from.Home
## Length:59598 Min. :0.0000 Length:59598 Min. : 1.00
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.:25.00
## Mode :character Median :1.0000 Mode :character Median :50.00
## Mean :0.8326 Mean :50.01
## 3rd Qu.:2.0000 3rd Qu.:75.00
## Max. :4.0000 Max. :99.00
## Education.Level Marital.Status Number.of.Dependents Job.Level
## Length:59598 Length:59598 Min. :0.000 Length:59598
## Class :character Class :character 1st Qu.:0.000 Class :character
## Mode :character Mode :character Median :1.000 Mode :character
## Mean :1.648
## 3rd Qu.:3.000
## Max. :6.000
## Company.Size Company.Tenure Remote.Work
## Length:59598 Min. : 2.00 Length:59598
## Class :character 1st Qu.: 36.00 Class :character
## Mode :character Median : 56.00 Mode :character
## Mean : 55.76
## 3rd Qu.: 76.00
## Max. :128.00
## Leadership.Opportunities Innovation.Opportunities Company.Reputation
## Length:59598 Length:59598 Length:59598
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Employee.Recognition Attrition
## Length:59598 Length:59598
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Check missing values
colSums(is.na(train))
## Employee.ID Age Gender
## 0 0 0
## Years.at.Company Job.Role Monthly.Income
## 0 0 0
## Work.Life.Balance Job.Satisfaction Performance.Rating
## 0 0 0
## Number.of.Promotions Overtime Distance.from.Home
## 0 0 0
## Education.Level Marital.Status Number.of.Dependents
## 0 0 0
## Job.Level Company.Size Company.Tenure
## 0 0 0
## Remote.Work Leadership.Opportunities Innovation.Opportunities
## 0 0 0
## Company.Reputation Employee.Recognition Attrition
## 0 0 0
colSums(is.na(test))
## Employee.ID Age Gender
## 0 0 0
## Years.at.Company Job.Role Monthly.Income
## 0 0 0
## Work.Life.Balance Job.Satisfaction Performance.Rating
## 0 0 0
## Number.of.Promotions Overtime Distance.from.Home
## 0 0 0
## Education.Level Marital.Status Number.of.Dependents
## 0 0 0
## Job.Level Company.Size Company.Tenure
## 0 0 0
## Remote.Work Leadership.Opportunities Innovation.Opportunities
## 0 0 0
## Company.Reputation Employee.Recognition Attrition
## 0 0 0
# Check duplicates
get_dupes(train)
## No variable names specified - using all columns.
## No duplicate combinations found of: Employee.ID, Age, Gender, Years.at.Company, Job.Role, Monthly.Income, Work.Life.Balance, Job.Satisfaction, Performance.Rating, ... and 15 other variables
## [1] Employee.ID Age Gender
## [4] Years.at.Company Job.Role Monthly.Income
## [7] Work.Life.Balance Job.Satisfaction Performance.Rating
## [10] Number.of.Promotions Overtime Distance.from.Home
## [13] Education.Level Marital.Status Number.of.Dependents
## [16] Job.Level Company.Size Company.Tenure
## [19] Remote.Work Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation Employee.Recognition Attrition
## [25] dupe_count
## <0 rows> (or 0-length row.names)
get_dupes(test)
## No variable names specified - using all columns.
##
## No duplicate combinations found of: Employee.ID, Age, Gender, Years.at.Company, Job.Role, Monthly.Income, Work.Life.Balance, Job.Satisfaction, Performance.Rating, ... and 15 other variables
## [1] Employee.ID Age Gender
## [4] Years.at.Company Job.Role Monthly.Income
## [7] Work.Life.Balance Job.Satisfaction Performance.Rating
## [10] Number.of.Promotions Overtime Distance.from.Home
## [13] Education.Level Marital.Status Number.of.Dependents
## [16] Job.Level Company.Size Company.Tenure
## [19] Remote.Work Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation Employee.Recognition Attrition
## [25] dupe_count
## <0 rows> (or 0-length row.names)
# metadata states tenure in years but there are unrealistic values(e.g. 89,95)
# Convert tenure from months to years
train$Company.Tenure <- round(train$Company.Tenure /12,2)
test$Company.Tenure <- round(test$Company.Tenure /12,2)
## Check Age Vs Company.Tenure (must not exceed age)
train %>% filter(Company.Tenure > Age)
## [1] Employee.ID Age Gender
## [4] Years.at.Company Job.Role Monthly.Income
## [7] Work.Life.Balance Job.Satisfaction Performance.Rating
## [10] Number.of.Promotions Overtime Distance.from.Home
## [13] Education.Level Marital.Status Number.of.Dependents
## [16] Job.Level Company.Size Company.Tenure
## [19] Remote.Work Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation Employee.Recognition Attrition
## <0 rows> (or 0-length row.names)
test %>% filter(Company.Tenure > Age)
## [1] Employee.ID Age Gender
## [4] Years.at.Company Job.Role Monthly.Income
## [7] Work.Life.Balance Job.Satisfaction Performance.Rating
## [10] Number.of.Promotions Overtime Distance.from.Home
## [13] Education.Level Marital.Status Number.of.Dependents
## [16] Job.Level Company.Size Company.Tenure
## [19] Remote.Work Leadership.Opportunities Innovation.Opportunities
## [22] Company.Reputation Employee.Recognition Attrition
## <0 rows> (or 0-length row.names)
## Check Age Vs Years.at.Company (must not exceed age)
checkdata2<-function(df){
# Filter and count rows where Age < Years.at.Company
issue_count<- df %>%
filter(Age<Years.at.Company) %>%
nrow()
# Check the count
if(issue_count== 0) {
print("No problem.No one has years_at_company greater than age.")
} else {
print(paste(issue_count,"rows have years_at_company > age"))
}
}
checkdata2(train)
## [1] "No problem.No one has years_at_company greater than age."
checkdata2(test)
## [1] "No problem.No one has years_at_company greater than age."
# Convert categorical variables to factors
type_change <- function(df){
categorical_cols<-c("Gender","Job.Role","Work.Life.Balance","Job.Satisfaction",
"Performance.Rating","Marital.Status","Overtime","Education.Level",
"Job.Level","Company.Size","Remote.Work","Leadership.Opportunities",
"Innovation.Opportunities","Company.Reputation","Employee.Recognition",
"Attrition")
for (col in categorical_cols){
if(col %in% names(df)){
df[[col]] <- as.factor(df[[col]])
}
}
return(df)
}
train <- type_change(train)
test <- type_change(test)
glimpse(train)
## Rows: 14,900
## Columns: 24
## $ Employee.ID <int> 52685, 30585, 54656, 33442, 15667, 3496, 4677…
## $ Age <int> 36, 35, 50, 58, 39, 45, 22, 34, 48, 55, 32, 2…
## $ Gender <fct> Male, Male, Male, Male, Male, Female, Female,…
## $ Years.at.Company <int> 13, 7, 7, 44, 24, 30, 5, 15, 40, 16, 12, 15, …
## $ Job.Role <fct> Healthcare, Education, Education, Media, Educ…
## $ Monthly.Income <int> 8029, 4563, 5583, 5525, 4604, 8104, 8700, 110…
## $ Work.Life.Balance <fct> Excellent, Good, Fair, Fair, Good, Fair, Good…
## $ Job.Satisfaction <fct> High, High, High, Very High, High, High, High…
## $ Performance.Rating <fct> Average, Average, Average, High, Average, Ave…
## $ Number.of.Promotions <int> 1, 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 1, …
## $ Overtime <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, …
## $ Distance.from.Home <int> 83, 55, 14, 43, 47, 38, 2, 9, 65, 31, 28, 35,…
## $ Education.Level <fct> Master’s Degree, Associate Degree, Associate …
## $ Marital.Status <fct> Married, Single, Divorced, Single, Married, D…
## $ Number.of.Dependents <int> 1, 4, 2, 4, 6, 0, 0, 4, 1, 1, 1, 1, 3, 0, 0, …
## $ Job.Level <fct> Mid, Entry, Senior, Entry, Mid, Senior, Mid, …
## $ Company.Size <fct> Large, Medium, Medium, Medium, Large, Large, …
## $ Company.Tenure <dbl> 1.83, 2.25, 6.33, 8.00, 3.75, 6.25, 4.00, 1.3…
## $ Remote.Work <fct> No, No, No, No, Yes, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, Yes, No, No, No, No, No, No, No, No, …
## $ Company.Reputation <fct> Poor, Good, Good, Poor, Good, Good, Poor, Goo…
## $ Employee.Recognition <fct> Medium, High, Low, Low, High, Low, High, Low,…
## $ Attrition <fct> Stayed, Left, Stayed, Left, Stayed, Stayed, S…
glimpse(test)
## Rows: 59,598
## Columns: 24
## $ Employee.ID <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure <dbl> 7.42, 1.75, 6.17, 4.17, 5.67, 3.92, 7.75, 7.3…
## $ Remote.Work <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…
# Check outliers for Monthly.Income with Boxplot
ggplot(train, aes(y=Monthly.Income)) + geom_boxplot()
ggplot(test, aes(y=Monthly.Income)) + geom_boxplot()
# Remove extreme outliers from training set
Q1 <- quantile(train$Monthly.Income, 0.25)
Q3 <- quantile(train$Monthly.Income, 0.75)
IQR <- Q3 - Q1
train_clean <- train %>%
filter(Monthly.Income >(Q1-1.5*IQR) & Monthly.Income < (Q3+1.5*IQR))
ggplot(train_clean, aes(y=Monthly.Income)) + geom_boxplot()
# Remove extreme outliers from testing set
Q1 <- quantile(test$Monthly.Income, 0.25)
Q3 <- quantile(test$Monthly.Income, 0.75)
IQR <- Q3 - Q1
test_clean <- test %>% filter(Monthly.Income >(Q1-1.5*IQR) & Monthly.Income < (Q3+1.5*IQR))
ggplot(test_clean, aes(y=Monthly.Income)) + geom_boxplot()
# Final versions
#View(train_clean)
#View(test_clean)
# Export cleaned data
#write.csv(train_clean, "train_clean.csv",row.names = F)
#write.csv(test_clean, "test_clean.csv",row.names = F)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.