library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw HR attrition data from a CSV file in the working directory.
employee_raw <- read_csv(file = "HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode the attrition label as a numeric indicator: 1 for "Yes", 0 otherwise
# (missing labels stay NA), so it can be summarised numerically in the plots.
employee_raw$Attrition <- as.numeric(employee_raw$Attrition == "Yes")
# Exploratory plot: the attrition indicator summarised within hexagonal bins
# over Age (x) and Education (y); the fill legend is formatted as a percentage.
ggplot(employee_raw, aes(x = Age, y = Education, z = Attrition)) +
  stat_summary_hex(bins = 4, alpha = 0.8) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "Attrition")
# Exploratory plot: the attrition indicator summarised within hexagonal bins
# over HourlyRate (x) and YearsAtCompany (y), using the default bin count.
ggplot(employee_raw, aes(x = HourlyRate, y = YearsAtCompany, z = Attrition)) +
  stat_summary_hex(alpha = 0.8) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "Attrition")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Train/test split ----
# Turn the 1/0 indicator into a "Left"/"Stayed" factor outcome, then split the
# data with stratification on that outcome. The seed makes the split repeatable.
set.seed(123)
employee_split <- employee_raw %>%
  mutate(
    Attrition = factor(if_else(as.logical(Attrition), "Left", "Stayed"))
  ) %>%
  initial_split(strata = Attrition)
employee_train <- employee_split %>% training()
employee_test <- employee_split %>% testing()

# Resampling folds ----
# Stratified cross-validation folds built from the training set only; printed
# so the fold sizes can be inspected.
set.seed(234)
employee_folds <- employee_train %>% vfold_cv(strata = Attrition)
employee_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [990/111]> Fold01
## 2 <split [990/111]> Fold02
## 3 <split [990/111]> Fold03
## 4 <split [990/111]> Fold04
## 5 <split [991/110]> Fold05
## 6 <split [991/110]> Fold06
## 7 <split [991/110]> Fold07
## 8 <split [992/109]> Fold08
## 9 <split [992/109]> Fold09
## 10 <split [992/109]> Fold10
# Preprocessing recipe ----
# Models Attrition from ten hand-picked predictors. Nominal predictors (if any)
# get an explicit "unknown" level and are one-hot encoded; near-zero-variance
# predictors are then dropped.
employee_rec <-
  recipe(
    Attrition ~ DailyRate + DistanceFromHome + HourlyRate + JobLevel +
      MonthlyIncome + PerformanceRating + StandardHours + YearsAtCompany +
      WorkLifeBalance + YearsInCurrentRole,
    data = employee_train
  ) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_nzv(all_predictors())

# Train the recipe on its template data and print it to see what each step
# selected or removed.
employee_rec %>% prep()
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 10
##
## ── Training information
## Training data contained 1101 data points and no incomplete rows.
##
## ── Operations
## • Unknown factor level assignment for: <none> | Trained
## • Dummy variables from: <none> | Trained
## • Sparse, unbalanced variable filter removed: StandardHours | Trained
# Model specification ----
# Boosted-tree classifier on the xgboost engine. The learning rate is fixed at
# 0.01; mtry, trees and min_n are left as placeholders to be tuned.
xgb_spec <-
  boost_tree(
    mtry = tune(),
    trees = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the recipe and the model specification into a single workflow.
xgb_wf <- workflow(preprocessor = employee_rec, spec = xgb_spec)
library(finetune)
# Hyperparameter tuning ----
# Register a parallel backend, then race 15 candidate parameter sets across the
# cross-validation folds, scoring on mean log loss and reporting eliminations
# as they happen.
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- xgb_wf %>%
  tune_race_anova(
    resamples = employee_folds,
    grid = 15,
    metrics = metric_set(mn_log_loss),
    control = control_race(verbose_elim = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 6 eliminated; 9 candidates remain.
##
## ℹ Fold07: 3 eliminated; 6 candidates remain.
##
## ℹ Fold03: 3 eliminated; 3 candidates remain.
##
## ℹ Fold05: 0 eliminated; 3 candidates remain.
##
## ℹ Fold09: 0 eliminated; 3 candidates remain.
##
## ℹ Fold04: 0 eliminated; 3 candidates remain.
##
## ℹ Fold06: 0 eliminated; 3 candidates remain.
# Visualise how the candidates fared during the race, then list the
# best-scoring parameter sets.
xgb_rs %>% plot_race()
xgb_rs %>% show_best()
## # A tibble: 3 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 4 442 39 mn_log_loss binary 0.414 10 0.00683 Preprocessor1_Mo…
## 2 1 599 8 mn_log_loss binary 0.415 10 0.00853 Preprocessor1_Mo…
## 3 2 1805 31 mn_log_loss binary 0.417 10 0.00868 Preprocessor1_Mo…
# Final fit ----
# Plug the best parameter set (by mean log loss) into the workflow, then fit on
# the training portion of the split and evaluate on the test portion.
# `metric` is passed by name: newer tune releases place it after `...`, so a
# positional metric string is no longer accepted there.
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
  last_fit(employee_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [1101/369]> train/test split <tibble> <tibble> <tibble> <workflow>
# Mean log loss of the final model's held-out predictions, using the predicted
# probability of the "Left" class against the true Attrition label.
mn_log_loss(collect_predictions(xgb_last), truth = Attrition, .pred_Left)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mn_log_loss binary 0.409
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable importance ----
# Pull the fitted parsnip model out of the final workflow and plot up to 15
# features as points.
xgb_last %>%
  extract_workflow() %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 15)
What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The data preprocessing steps mentioned in the code-along video that I applied to the new dataset are mutate and factor. I used these two steps to transform the data so that it would be easier to read and understand. These steps also help train the model.
What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model I used in the analysis was XGBoost. I used boost_tree and set_engine. XGBoost was used to predict attrition based on various variables such as hourly rate, education, distance from home, job level, monthly income, years at company, work-life balance, and standard hours. XGBoost is great at handling classification tasks, which is why this was an appropriate machine learning model to use.