library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw HR attrition data; readr guesses the column types.
employee_raw <- read_csv(file = "HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Encode the outcome numerically ("Yes" -> 1, anything else -> 0) so the
# exploratory hex plots below can average it into an attrition rate.
employee_raw <- employee_raw %>%
  mutate(Attrition = if_else(Attrition == "Yes", 1, 0))
# Attrition rate (the per-hexagon summary of the 0/1 outcome) by age and
# education level.
ggplot(employee_raw, aes(x = Age, y = Education, z = Attrition)) +
  stat_summary_hex(bins = 4, alpha = 0.8) +
  labs(fill = "Attrition") +
  scale_fill_viridis_c(labels = scales::percent)

# Attrition rate (the per-hexagon summary of the 0/1 outcome) by hourly rate
# and years at the company.
ggplot(employee_raw, aes(x = HourlyRate, y = YearsAtCompany, z = Attrition)) +
  stat_summary_hex(alpha = 0.8) +
  labs(fill = "Attrition") +
  scale_fill_viridis_c(labels = scales::percent)

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Recode the 0/1 outcome as a labelled factor, then make a train/test split
# stratified on the outcome. factor() orders levels alphabetically, so "Left"
# is the first level.
set.seed(123)
employee_split <- initial_split(
  mutate(
    employee_raw,
    Attrition = factor(if_else(as.logical(Attrition), "Left", "Stayed"))
  ),
  strata = Attrition
)
employee_train <- training(employee_split)
employee_test <- testing(employee_split)

# 10-fold cross-validation on the training set, stratified on the outcome.
set.seed(234)
employee_folds <- employee_train %>%
  vfold_cv(v = 10, strata = Attrition)
employee_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits            id    
##    <list>            <chr> 
##  1 <split [990/111]> Fold01
##  2 <split [990/111]> Fold02
##  3 <split [990/111]> Fold03
##  4 <split [990/111]> Fold04
##  5 <split [991/110]> Fold05
##  6 <split [991/110]> Fold06
##  7 <split [991/110]> Fold07
##  8 <split [992/109]> Fold08
##  9 <split [992/109]> Fold09
## 10 <split [992/109]> Fold10
# Preprocessing recipe: model Attrition from ten predictors. The nominal steps
# act on whatever nominal predictors are present, and step_nzv() drops
# predictors with (near-)zero variance.
employee_rec <- recipe(
  Attrition ~
    DailyRate + DistanceFromHome + HourlyRate + JobLevel + MonthlyIncome +
    PerformanceRating + StandardHours + YearsAtCompany + WorkLifeBalance +
    YearsInCurrentRole,
  data = employee_train
) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_nzv(all_predictors())

# Estimate the recipe on the training data to inspect what each step does.
employee_rec %>%
  prep()
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 10
## 
## ── Training information
## Training data contained 1101 data points and no incomplete rows.
## 
## ── Operations
## • Unknown factor level assignment for: <none> | Trained
## • Dummy variables from: <none> | Trained
## • Sparse, unbalanced variable filter removed: StandardHours | Trained
# Boosted-tree classifier fitted with xgboost. The number of trees, the
# minimum node size, and the number of sampled predictors are tuned; the
# learning rate is fixed at 0.01.
xgb_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  learn_rate = 0.01
)

# Bundle the preprocessing recipe and the model specification.
xgb_wf <- workflow() %>%
  add_recipe(employee_rec) %>%
  add_model(xgb_spec)
library(finetune)
# Register a parallel backend for the tuning run.
doParallel::registerDoParallel()

# Tune with ANOVA racing over a 15-candidate grid, scoring candidates by mean
# log loss and reporting eliminations as the race progresses.
set.seed(345)
xgb_rs <- xgb_wf %>%
  tune_race_anova(
    resamples = employee_folds,
    grid = 15,
    metrics = metric_set(mn_log_loss),
    control = control_race(verbose_elim = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 6 eliminated; 9 candidates remain.
## 
## ℹ Fold07: 3 eliminated; 6 candidates remain.
## 
## ℹ Fold03: 3 eliminated; 3 candidates remain.
## 
## ℹ Fold05: 0 eliminated; 3 candidates remain.
## 
## ℹ Fold09: 0 eliminated; 3 candidates remain.
## 
## ℹ Fold04: 0 eliminated; 3 candidates remain.
## 
## ℹ Fold06: 0 eliminated; 3 candidates remain.
# Show how each candidate fared across the resamples during the race.
xgb_rs %>%
  plot_race()

# Best surviving candidates, ranked by mean log loss.
xgb_rs %>%
  show_best(metric = "mn_log_loss")
## # A tibble: 3 × 9
##    mtry trees min_n .metric     .estimator  mean     n std_err .config          
##   <int> <int> <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
## 1     4   442    39 mn_log_loss binary     0.414    10 0.00683 Preprocessor1_Mo…
## 2     1   599     8 mn_log_loss binary     0.415    10 0.00853 Preprocessor1_Mo…
## 3     2  1805    31 mn_log_loss binary     0.417    10 0.00868 Preprocessor1_Mo…
# Finalize the workflow with the best hyperparameters by log loss, fit it on
# the full training set, and evaluate once on the held-out test set.
# `metric` is passed by name: newer tune releases place it after `...` and
# reject a positional metric, while the named form works on older versions too.
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
  last_fit(employee_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [1101/369]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Log loss on the held-out test set, using the predicted probability of the
# "Left" class.
mn_log_loss(
  collect_predictions(xgb_last),
  truth = Attrition,
  .pred_Left
)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss binary         0.409
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Variable-importance plot for the final fitted model (up to 15 features).
vip(
  extract_fit_parsnip(extract_workflow(xgb_last)),
  geom = "point",
  num_features = 15
)

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. Can I predict if an employee has left a company (attrition) based off various variables in the data set such as hourly rate, education level, age, etc.?
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The original data set has 1,470 observations of 35 variables. The data set contains variables regarding details on the employees. Some of these are education field, business travel, gender, job involvement, job level and more.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The key variables in this data set are Attrition, DailyRate, DistanceFromHome, HourlyRate, JobLevel, MonthlyIncome, PerformanceRating, StandardHours, YearsAtCompany, WorkLifeBalance, and YearsInCurrentRole. Attrition is what we are trying to predict and is numerical data which is either a 0 or a 1. Attrition indicates whether the employee left or not. The other variables indicate different things about the employees and are all numerical data.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The data transformation step we used was mutate(Attrition = if_else(as.logical(Attrition), "Left", "Stayed"), Attrition = factor(Attrition)). I used mutate to change the 1 and 0 to "Left" and "Stayed", respectively. This makes the data easier to read and gives the outcome meaningful class labels. I used factor to change Attrition from numeric to a factor so that the model treats it as a categorical outcome for classification rather than as a number.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The data preprocessing steps mentioned in the code along video that I applied to the new data set are mutate and factor. I used these two steps to transform the data so that it would be easier to read and understand. These steps also help train the model.

    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model I used in the analysis was XGBoost, specified with boost_tree and set_engine. XGBoost was used to predict Attrition based on variables such as daily rate, hourly rate, distance from home, job level, monthly income, performance rating, years at company, years in current role, work-life balance, and standard hours. XGBoost is great at handling classification tasks, which is why this was an appropriate machine learning model to use.

  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metric I used was log loss, computed with collect_predictions(xgb_last) %>% mn_log_loss(Attrition, .pred_Left). Log loss is a metric that evaluates the predicted class probabilities of a classifier; lower values are better, and 0 is a perfect score. It is suitable for binary and multi-class classification problems. Because the research question aims to predict a binary outcome (whether an employee left or stayed at the company), log loss is an appropriate metric to use. It considers the correctness of a prediction and also factors in the confidence of the prediction. Log loss helps assess how well a model can predict the likelihood of an outcome.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new data set. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. The major findings are that these variables are not great predictors of whether or not an employee left the company. The log loss on the test set was 0.409. Because lower log loss is better (0 is a perfect score), a value this far from 0 means the predicted probabilities are often not close to the actual outcomes. The model is not good at predicting whether an employee left or not using things like hourly rate, daily rate, distance from home, monthly income, and more.