library(tidyverse)
library(tidyquant)
# Read the HR employee-attrition CSV from the sibling data directory.
attrition <- readr::read_csv(
  file = "../00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

# Print a per-column overview (missing values, completeness, distribution
# statistics) of the raw import.
attrition %>%
  skimr::skim()
Name | Piped data |
Number of rows | 1470 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 9 |
numeric | 26 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Attrition | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
BusinessTravel | 0 | 1 | 10 | 17 | 0 | 3 | 0 |
Department | 0 | 1 | 5 | 22 | 0 | 3 | 0 |
EducationField | 0 | 1 | 5 | 16 | 0 | 6 | 0 |
Gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
JobRole | 0 | 1 | 7 | 25 | 0 | 9 | 0 |
MaritalStatus | 0 | 1 | 6 | 8 | 0 | 3 | 0 |
Over18 | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
OverTime | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Age | 0 | 1 | 36.92 | 9.14 | 18 | 30.00 | 36.0 | 43.00 | 60 | ▂▇▇▃▂ |
DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 465.00 | 802.0 | 1157.00 | 1499 | ▇▇▇▇▇ |
DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 2.00 | 7.0 | 14.00 | 29 | ▇▅▂▂▂ |
Education | 0 | 1 | 2.91 | 1.02 | 1 | 2.00 | 3.0 | 4.00 | 5 | ▂▃▇▆▁ |
EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1.00 | 1.0 | 1.00 | 1 | ▁▁▇▁▁ |
EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 491.25 | 1020.5 | 1555.75 | 2068 | ▇▇▇▇▇ |
EnvironmentSatisfaction | 0 | 1 | 2.72 | 1.09 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 48.00 | 66.0 | 83.75 | 100 | ▇▇▇▇▇ |
JobInvolvement | 0 | 1 | 2.73 | 0.71 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▁ |
JobLevel | 0 | 1 | 2.06 | 1.11 | 1 | 1.00 | 2.0 | 3.00 | 5 | ▇▇▃▂▁ |
JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 2911.00 | 4919.0 | 8379.00 | 19999 | ▇▅▂▁▂ |
MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 8047.00 | 14235.5 | 20461.50 | 26999 | ▇▇▇▇▇ |
NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 1.00 | 2.0 | 4.00 | 9 | ▇▃▂▂▁ |
PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 12.00 | 14.0 | 18.00 | 25 | ▇▅▃▂▁ |
PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 3.00 | 3.0 | 3.00 | 4 | ▇▁▁▁▂ |
RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80.00 | 80.0 | 80.00 | 80 | ▁▁▇▁▁ |
StockOptionLevel | 0 | 1 | 0.79 | 0.85 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▂▁ |
TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 6.00 | 10.0 | 15.00 | 40 | ▇▇▂▁▁ |
TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 2.00 | 3.0 | 3.00 | 6 | ▂▇▇▂▃ |
WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▂ |
YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 3.00 | 5.0 | 9.00 | 40 | ▇▂▁▁▁ |
YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 2.00 | 3.0 | 7.00 | 18 | ▇▃▂▁▁ |
YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 0.00 | 1.0 | 3.00 | 15 | ▇▁▁▁▁ |
YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 2.00 | 3.0 | 7.00 | 17 | ▇▂▅▁▁ |
# Derive the analysis data set from the raw import.
data <- attrition %>%
  # Drop the columns that hold a single value for every row
  select(-c(Over18, EmployeeCount, StandardHours)) %>%
  mutate(
    # Text columns become factors
    across(where(is.character), factor),
    # Categorical variables that were read in as numbers become factors too
    across(c(Education, JobLevel, StockOptionLevel), factor)
  )
***Identify variables that are correlated with the target variable.***
# Number of rows in each class of the outcome
count(data, Attrition)
## # A tibble: 2 × 2
## Attrition n
## <fct> <int>
## 1 No 1233
## 2 Yes 237
Among the income-related variables (MonthlyIncome, MonthlyRate, DailyRate, HourlyRate), monthly income looks like the best predictor of attrition.
# Box plots of each pay-related variable, split by attrition class
ggplot(data, aes(x = Attrition, y = MonthlyIncome)) +
  geom_boxplot()

ggplot(data, aes(x = Attrition, y = MonthlyRate)) +
  geom_boxplot()

ggplot(data, aes(x = Attrition, y = DailyRate)) +
  geom_boxplot()

ggplot(data, aes(x = Attrition, y = HourlyRate)) +
  geom_boxplot()

# Same comparison for distance from home
ggplot(data, aes(x = Attrition, y = DistanceFromHome)) +
  geom_boxplot()
# Tile plot of row counts for every Attrition x WorkLifeBalance combination;
# the fill colour encodes the count `n` produced by count().
count(data, Attrition, WorkLifeBalance) %>%
  ggplot(aes(x = Attrition, y = WorkLifeBalance, fill = n)) +
  geom_tile()
library(tidymodels)

# Continue with a random 100-row subsample of the cleaned data.
# The seed is set *before* drawing the subsample: without it the selected rows
# (and therefore every split, resample and result below) change on each run.
# NOTE(review): the subsample size of 100 is presumably chosen to keep the
# later steps fast -- confirm it is intended for the final analysis.
set.seed(12)
data <- slice_sample(data, n = 100)

# Train/test split, stratified by the outcome
set.seed(123)
data_split <- initial_split(data, strata = Attrition)
data_train <- training(data_split)
data_test <- testing(data_split)

# Bootstrap resamples of the training set, stratified by the outcome
set.seed(234)
data_folds <- bootstraps(data_train, strata = Attrition)
data_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [74/28]> Bootstrap01
## 2 <split [74/30]> Bootstrap02
## 3 <split [74/26]> Bootstrap03
## 4 <split [74/26]> Bootstrap04
## 5 <split [74/27]> Bootstrap05
## 6 <split [74/26]> Bootstrap06
## 7 <split [74/29]> Bootstrap07
## 8 <split [74/34]> Bootstrap08
## 9 <split [74/27]> Bootstrap09
## 10 <split [74/29]> Bootstrap10
## # ℹ 15 more rows
library(themis)
# Preprocessing recipe for predicting Attrition from all other columns:
#   1. one-hot style dummy encoding of the nominal predictors,
#   2. removal of predictors that are constant in the data the recipe is
#      prepped on,
#   3. centring and scaling of the numeric predictors (dummies included),
#   4. SMOTE over-sampling of the outcome classes.
# NOTE(review): EmployeeNumber is still treated as a predictor by
# `Attrition ~ .`; if it is only a row identifier, consider excluding it.
data_rec <-
  recipe(Attrition ~ ., data = data_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  # With a small training set a rare factor level can be absent from a
  # resample, leaving an all-zero dummy column. Normalizing such a column
  # divides by a zero standard deviation and yields NaN, which SMOTE cannot
  # handle, so constant columns are dropped first.
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_smote(Attrition)

# Preview the processed training data
data_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 122
## Columns: 54
## $ Age <dbl> -1.5368207, 1.1735999, -1.0850840, 0…
## $ DailyRate <dbl> 0.65716498, 1.03008808, -0.12234789,…
## $ DistanceFromHome <dbl> 0.19091820, -0.95073408, -0.95073408…
## $ EmployeeNumber <dbl> -0.32114650, -0.55762847, 0.43623497…
## $ EnvironmentSatisfaction <dbl> 0.2510663, 1.1800115, 1.1800115, 0.2…
## $ HourlyRate <dbl> -0.2217389, -1.1514090, -0.9654749, …
## $ JobInvolvement <dbl> -1.144000, -1.144000, 0.341193, 0.34…
## $ JobSatisfaction <dbl> -0.5096229, -1.4294302, 0.4101843, 0…
## $ MonthlyIncome <dbl> -0.788043354, 1.879046788, -0.901375…
## $ MonthlyRate <dbl> 1.638378424, 0.491854588, -1.0032907…
## $ NumCompaniesWorked <dbl> -0.72227243, 1.60156060, -1.10957793…
## $ PercentSalaryHike <dbl> -1.1410380, -1.1410380, -1.1410380, …
## $ PerformanceRating <dbl> -0.4369587, -0.4369587, -0.4369587, …
## $ RelationshipSatisfaction <dbl> 0.3767387, -1.3128774, 0.3767387, -0…
## $ TotalWorkingYears <dbl> -1.36965471, 1.26978406, -0.84176696…
## $ TrainingTimesLastYear <dbl> 1.188705, 0.350951, -0.486803, 2.026…
## $ WorkLifeBalance <dbl> 0.2798988, -2.6790313, -1.1995663, 0…
## $ YearsAtCompany <dbl> -1.06903964, -0.71900011, -0.5439803…
## $ YearsInCurrentRole <dbl> -1.2942719, -0.7102712, -0.7102712, …
## $ YearsSinceLastPromotion <dbl> -0.6926947, -0.6926947, -0.6926947, …
## $ YearsWithCurrManager <dbl> -1.2785445, -0.6998761, -0.6998761, …
## $ BusinessTravel_Travel_Frequently <dbl> -0.4369587, -0.4369587, 2.2576197, -…
## $ BusinessTravel_Travel_Rarely <dbl> -1.4335900, 0.6881232, -1.4335900, -…
## $ Department_Research...Development <dbl> 0.583769, 0.583769, 0.583769, 0.5837…
## $ Department_Sales <dbl> -0.5424161, -0.5424161, -0.5424161, …
## $ Education_X2 <dbl> -0.4369587, -0.4369587, -0.4369587, …
## $ Education_X3 <dbl> 1.2372368, -0.7973304, -0.7973304, -…
## $ Education_X4 <dbl> -0.6669978, 1.4789951, 1.4789951, -0…
## $ Education_X5 <dbl> -0.1162476, -0.1162476, -0.1162476, …
## $ EducationField_Life.Sciences <dbl> 1.1378768, 1.1378768, -0.8669537, -0…
## $ EducationField_Marketing <dbl> -0.2950304, -0.2950304, -0.2950304, …
## $ EducationField_Medical <dbl> -0.6881232, -0.6881232, 1.4335900, 1…
## $ EducationField_Other <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ EducationField_Technical.Degree <dbl> -0.3210386, -0.3210386, -0.3210386, …
## $ Gender_Male <dbl> 0.9409083, -1.0484407, 0.9409083, -1…
## $ JobLevel_X2 <dbl> -0.8201246, -0.8201246, -0.8201246, …
## $ JobLevel_X3 <dbl> -0.4150225, -0.4150225, -0.4150225, …
## $ JobLevel_X4 <dbl> -0.2673659, 3.6896495, -0.2673659, -…
## $ JobLevel_X5 <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ JobRole_Human.Resources <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ JobRole_Laboratory.Technician <dbl> 2.1514874, -0.4585137, 2.1514874, -0…
## $ JobRole_Manager <dbl> -0.2673659, 3.6896495, -0.2673659, -…
## $ JobRole_Manufacturing.Director <dbl> -0.3926048, -0.3926048, -0.3926048, …
## $ JobRole_Research.Director <dbl> -0.2374251, -0.2374251, -0.2374251, …
## $ JobRole_Research.Scientist <dbl> -0.5424161, -0.5424161, -0.5424161, …
## $ JobRole_Sales.Executive <dbl> -0.500801, -0.500801, -0.500801, -0.…
## $ JobRole_Sales.Representative <dbl> -0.1655367, -0.1655367, -0.1655367, …
## $ MaritalStatus_Married <dbl> -0.9157038, 1.0772986, -0.9157038, -…
## $ MaritalStatus_Single <dbl> -0.6669978, -0.6669978, 1.4789951, -…
## $ OverTime_Yes <dbl> -0.7309903, -0.7309903, 1.3495206, -…
## $ StockOptionLevel_X1 <dbl> -0.6881232, 1.4335900, -0.6881232, 1…
## $ StockOptionLevel_X2 <dbl> -0.3926048, -0.3926048, -0.3926048, …
## $ StockOptionLevel_X3 <dbl> 3.0727976, -0.3210386, -0.3210386, -…
## $ Attrition <fct> No, No, No, No, No, No, No, No, No, …