# Load the tidyverse: readr for the import, dplyr for the verbs used below.
library(tidyverse)

# Import the HR employee-attrition CSV.
# The path is delimited with plain ASCII double quotes; typographic quotes
# are not valid string delimiters in R and stop the script at parse time.
attrition_raw_tbl <- read_csv("00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Inspect column names, types and the first few values of every column.
attrition_raw_tbl %>% glimpse()

# Same inspection on a zero-row slice: shows names and types only.
attrition_raw_tbl %>%
  slice(0) %>%
  glimpse()
I have a dataset called attrition_raw_tbl that looks like this.
attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age
41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 
151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…
The goal is to help predict attrition for employees.
Please write R code to create a predictive model that predicts the probability of attrition.
# Baseline model: logistic regression, split and scored with caret ----
library(tidyr)
library(dplyr)
library(caret)

# Prepare the modelling table.
# * Over18, EmployeeCount and StandardHours are constant columns; a factor
#   with a single level makes `glm(Attrition ~ .)` fail when contrasts are
#   built, so they are dropped up front.
# * The outcome is an explicit factor with "No" as the first level, so the
#   fitted probabilities are P(Attrition == "Yes").
# * Remaining character columns become factors before the split so that
#   train, test and new data all share the same level sets.
model_data <- attrition_raw_tbl %>%
  select(-Over18, -EmployeeCount, -StandardHours) %>%
  mutate(Attrition = factor(Attrition, levels = c("No", "Yes"))) %>%
  mutate(across(where(is.character), as.factor))

# Stratified 70/30 split on the outcome.
set.seed(123) # for reproducibility
# `[, 1]` turns the one-column index matrix into a plain integer vector,
# which is a valid row subscript for a tibble.
train_index <- createDataPartition(
  model_data$Attrition,
  p = 0.7,
  list = FALSE
)[, 1]
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# Fit the logistic regression on every remaining predictor.
model <- glm(Attrition ~ ., data = train_data, family = binomial)

# Score the hold-out set: probabilities first, then classes at 0.5.
predictions <- predict(model, newdata = test_data, type = "response")
# confusionMatrix() needs factors that share the levels of the reference.
predicted_classes <- factor(
  ifelse(predictions > 0.5, "Yes", "No"), # Threshold of 0.5
  levels = levels(test_data$Attrition)
)
# positive = "Yes" reports sensitivity/specificity for the attrition class.
conf_matrix <- confusionMatrix(
  predicted_classes,
  test_data$Attrition,
  positive = "Yes"
)
accuracy <- conf_matrix$overall["Accuracy"]

print(conf_matrix)
print(paste("Accuracy:", accuracy))

# Predict for new employees.
# predict() needs every predictor the model was trained on, so the example
# starts from three existing rows and overrides the fields of interest.
# Replace the borrowed values with real data for genuine new employees.
new_data <- test_data %>%
  slice(seq_len(3)) %>%
  select(-Attrition) %>%
  mutate(
    Age = c(30, 35, 40),
    BusinessTravel = factor(
      c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely"),
      levels = levels(BusinessTravel)
    )
  )
new_predictions <- predict(model, newdata = new_data, type = "response")
print(new_predictions)
Please update the code to use tidymodels instead of caret and to use an h2o model instead of glm.
# Logistic regression with tidymodels preprocessing and an H2O GLM ----
library(tidymodels)
library(h2o)

# Start (or connect to) a local H2O cluster.
h2o.init()

# Stratified 70/30 split on the outcome.
set.seed(123) # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

# Preprocessing recipe.
# step_zv() removes zero-variance predictors first: a constant column has a
# standard deviation of 0, so scaling it would divide by zero.
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  step_zv(all_predictors()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

# Estimate the preprocessing parameters on the training data only.
attrition_prep <- prep(attrition_recipe, training = train_data, retain = TRUE)

train_data_processed <- bake(attrition_prep, new_data = train_data)
test_data_processed <- bake(attrition_prep, new_data = test_data)

# Move the data into H2O and make sure the response is categorical, which
# a binomial GLM requires.
train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)
train_h2o[, "Attrition"] <- as.factor(train_h2o[, "Attrition"])
test_h2o[, "Attrition"] <- as.factor(test_h2o[, "Attrition"])

attrition_glm <- h2o.glm(
  x = setdiff(names(train_h2o), "Attrition"),
  y = "Attrition",
  training_frame = train_h2o,
  family = "binomial"
)

# Score the hold-out set. For a binomial model the prediction frame holds a
# `predict` column plus one probability column per class level, so the 0.5
# threshold is applied to the "Yes" probability, not to the whole frame.
predictions <- h2o.predict(attrition_glm, newdata = test_h2o)
prediction_tbl <- as.data.frame(predictions)
predicted_classes <- factor(
  ifelse(prediction_tbl$Yes > 0.5, "Yes", "No"), # Threshold of 0.5
  levels = levels(test_data_processed$Attrition)
)

# Evaluate with yardstick (attached by tidymodels) instead of caret.
results_tbl <- tibble(
  truth = test_data_processed$Attrition,
  estimate = predicted_classes
)
conf_matrix <- conf_mat(results_tbl, truth = truth, estimate = estimate)
accuracy <- yardstick::accuracy(
  results_tbl,
  truth = truth,
  estimate = estimate
)$.estimate

print(conf_matrix)
print(paste("Accuracy:", accuracy))

# Predict for new employees.
# bake() needs every predictor column the recipe was trained on, so the
# example starts from three existing rows and overrides the fields of
# interest. Replace the borrowed values with real data for new employees.
new_data <- test_data %>%
  slice(seq_len(3)) %>%
  select(-Attrition) %>%
  mutate(
    Age = c(30, 35, 40),
    BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  )
new_data_processed <- bake(
  attrition_prep,
  new_data = new_data,
  all_predictors()
)
new_data_h2o <- as.h2o(new_data_processed)
new_predictions <- h2o.predict(attrition_glm, newdata = new_data_h2o)
print(new_predictions)
Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :
Please update the code to use h2o.performance in Step 5, instead of mean.
# H2O GLM evaluated with h2o.performance() ----
library(tidymodels)
library(h2o)

# Start (or connect to) a local H2O cluster.
h2o.init()

# Stratified 70/30 split on the outcome.
set.seed(123) # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

# Preprocessing recipe.
# step_zv() removes zero-variance predictors first: a constant column has a
# standard deviation of 0, so scaling it would divide by zero.
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  step_zv(all_predictors()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

# Estimate the preprocessing parameters on the training data only.
attrition_prep <- prep(attrition_recipe, training = train_data, retain = TRUE)

train_data_processed <- bake(attrition_prep, new_data = train_data)
test_data_processed <- bake(attrition_prep, new_data = test_data)

# Move the data into H2O and make sure the response is categorical, which
# a binomial GLM requires.
train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)
train_h2o[, "Attrition"] <- as.factor(train_h2o[, "Attrition"])
test_h2o[, "Attrition"] <- as.factor(test_h2o[, "Attrition"])

attrition_glm <- h2o.glm(
  x = setdiff(names(train_h2o), "Attrition"),
  y = "Attrition",
  training_frame = train_h2o,
  family = "binomial"
)

# Score the hold-out set. The prediction frame holds a `predict` column plus
# one probability column per class level, so the 0.5 threshold is applied to
# the "Yes" probability rather than to the whole frame.
predictions <- h2o.predict(attrition_glm, newdata = test_h2o)
predicted_classes <- ifelse(
  as.data.frame(predictions)$Yes > 0.5, # Threshold of 0.5
  "Yes",
  "No"
)

# Evaluate on the hold-out set with H2O's own metrics.
perf <- h2o.performance(model = attrition_glm, newdata = test_h2o)
# h2o.accuracy(perf) returns a table with one accuracy value per candidate
# threshold, not a single number. Report the row whose threshold is closest
# to the 0.5 cut-off used for `predicted_classes`.
accuracy_by_threshold <- h2o.accuracy(perf)
closest_row <- which.min(abs(accuracy_by_threshold$threshold - 0.5))
accuracy <- accuracy_by_threshold$accuracy[closest_row]

print(perf)
print(paste("Accuracy:", accuracy))

# Predict for new employees.
# bake() needs every predictor column the recipe was trained on, so the
# example starts from three existing rows and overrides the fields of
# interest. Replace the borrowed values with real data for new employees.
new_data <- test_data %>%
  slice(seq_len(3)) %>%
  select(-Attrition) %>%
  mutate(
    Age = c(30, 35, 40),
    BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  )
new_data_processed <- bake(
  attrition_prep,
  new_data = new_data,
  all_predictors()
)
new_data_h2o <- as.h2o(new_data_processed)
new_predictions <- h2o.predict(attrition_glm, newdata = new_data_h2o)
print(new_predictions)
# Attrition classifier with tidymodels preprocessing and H2O AutoML ----

# Packages are attached before first use: read_csv() comes from readr, which
# is loaded by the tidyverse.
library(tidyverse)
library(tidymodels)
library(h2o)

# Import the HR employee-attrition CSV (plain ASCII quotes around the path).
attrition_raw_tbl <- read_csv("00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Stratified 80/20 split on the outcome.
set.seed(123) # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.8, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

# Preprocessing recipe.
# The outcome is excluded from step_dummy(). One-hot encoding it would create
# Attrition_No and Attrition_Yes, and with Attrition_Yes as the response the
# Attrition_No column would remain among the predictors, leaking the target.
# Keeping Attrition as a single factor also makes AutoML treat the task as
# classification rather than regression.
attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  # Remove columns with zero variance
  step_rm(Over18, EmployeeCount, StandardHours) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

# Estimate the preprocessing parameters on the training data only.
attrition_recipe_prep <- prep(attrition_recipe, training = train_data)

train_data_processed <- bake(attrition_recipe_prep, new_data = train_data)
test_data_processed <- bake(attrition_recipe_prep, new_data = test_data)

# Start (or connect to) a local H2O cluster.
h2o.init()

train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)

response <- "Attrition"
predictors <- setdiff(names(train_data_processed), response)

# Make sure the response is categorical on the H2O side.
train_h2o[, response] <- as.factor(train_h2o[, response])
test_h2o[, response] <- as.factor(test_h2o[, response])

automl <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  max_runtime_secs = 3600, # Maximum runtime in seconds
  seed = 123
)

# Score the hold-out set with the best model on the leaderboard.
# `predict` holds the predicted class; `Yes` holds the attrition probability.
prediction_frame <- h2o.predict(automl@leader, newdata = test_h2o)
predictions <- prediction_frame$predict
attrition_probability <- prediction_frame$Yes

performance <- h2o.performance(automl@leader, newdata = test_h2o)
print(performance)

summary(automl@leader)

# prompt = FALSE: do not wait for interactive confirmation in a script.
h2o.shutdown(prompt = FALSE)