I have a dataset called attrition_raw_tbl that looks like this.
attrition_raw_tbl %>% glimpse() Rows: 1,470 Columns: 35 $ Age
41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, … $ Attrition “Yes”, “No”, “Yes”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”, “No”… $ BusinessTravel “Travel_Rarely”, “Travel_Frequently”, “Travel_Rarely”, “Travel_Frequently”, “Travel_… $ DailyRate 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1… $ Department ”Sales”, “Research & Development”, “Research & Development”, “Research & Development… $ DistanceFromHome 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, … $ Education 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, … $ EducationField ”Life Sciences”, “Life Sciences”, “Other”, “Life Sciences”, “Medical”, “Life Science… $ EmployeeCount 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … $ EmployeeNumber 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28… $ EnvironmentSatisfaction 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, … $ Gender ”Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Female”, “Male”, “Male”, “Male”… $ HourlyRate 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, … $ JobInvolvement 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, … $ JobLevel 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, … $ JobRole “Sales Executive”, “Research Scientist”, “Laboratory Technician”, “Research Scientis… $ JobSatisfaction 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, … $ MaritalStatus ”Single”, “Married”, “Single”, “Married”, “Married”, “Single”, “Married”, “Divorced”… $ MonthlyIncome 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, … $ MonthlyRate 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 
151… $ NumCompaniesWorked 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, … $ Over18 “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”, “Y”,… $ OverTime “Yes”, “No”, “Yes”, “Yes”, “No”, “No”, “Yes”, “No”, “No”, “No”, “No”, “Yes”, “No”, “… $ PercentSalaryHike 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, … $ PerformanceRating 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, … $ RelationshipSatisfaction 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, … $ StandardHours 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, … $ StockOptionLevel 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, … $ TotalWorkingYears 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, … $ TrainingTimesLastYear 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, … $ WorkLifeBalance 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, … $ YearsAtCompany 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1… $ YearsInCurrentRole 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,… $ YearsSinceLastPromotion 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, … $ YearsWithCurrManager 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…
The goal is to help predict attrition for employees.
Please write R code to create a predictive model that predicts the probability of attrition.
Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.
# Predict employee attrition with an H2O logistic regression (GLM).
# tidymodels supplies the train/test split (rsample) and the confusion
# matrix (yardstick); caret is not used.
library(tidymodels)
library(h2o)
library(dplyr)

# Step 1: start (or connect to) a local H2O cluster ----
h2o.init()

# Step 2: prepare the data ----
# The target and the character predictors become factors so that H2O
# treats them as categorical columns.
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = as.factor(Attrition),
    BusinessTravel = as.factor(BusinessTravel),
    Department = as.factor(Department),
    EducationField = as.factor(EducationField),
    Gender = as.factor(Gender),
    JobRole = as.factor(JobRole),
    MaritalStatus = as.factor(MaritalStatus),
    Over18 = as.factor(Over18),
    OverTime = as.factor(OverTime)
  )

# 70/30 train/test split; the seed makes the split reproducible.
set.seed(123)
split <- initial_split(attrition_raw_tbl, prop = 0.7)
trainData <- training(split)
testData <- testing(split)

train_h2o <- as.h2o(trainData)
test_h2o <- as.h2o(testData)

# Step 3: fit the model ----
target <- "Attrition"
# NOTE(review): every other column is used as a predictor, including
# EmployeeNumber, which looks like a row identifier -- confirm intent.
predictors <- setdiff(names(trainData), target)

model <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train_h2o,
  family = "binomial", # logistic regression
  lambda = 0,          # no regularization
  seed = 123
)

summary(model)

# Step 4: predict on the test set ----
pred_probs_h2o <- h2o.predict(model, test_h2o)
pred_probs_df <- as.data.frame(pred_probs_h2o)

# The truth column comes straight from the local test split, which is
# already a factor with the full set of class levels.
class_levels <- levels(testData[[target]])
pred_probs_df$Actual <- testData[[target]]

# For a factor response H2O names the probability columns after the class
# labels, so the attrition probability is in "Yes" (there is no "p1").
stopifnot("Yes" %in% names(pred_probs_df))
pred_probs_df$Predicted <- factor(
  ifelse(pred_probs_df[["Yes"]] > 0.5, "Yes", "No"),
  levels = class_levels
)

# Step 5: evaluate ----
# yardstick::conf_mat() replaces caret::confusionMatrix(), which was called
# without caret being loaded.
confusion <- conf_mat(pred_probs_df, truth = Actual, estimate = Predicted)
print(confusion)

perf <- h2o.performance(model, newdata = test_h2o)
auc_value <- h2o.auc(perf)
cat("AUC:", auc_value, "\n")

# Step 6: shut the H2O cluster down without an interactive prompt ----
h2o.shutdown(prompt = FALSE)
Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :
# Attrition model, revision 2: H2O GLM evaluated with yardstick.
library(tidymodels)
library(h2o)
library(dplyr)
library(yardstick)

# Step 1: initialise H2O ----
h2o.init()

# Step 2: data preparation ----
# Categorical columns (including the target) are converted to factors.
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = as.factor(Attrition),
    BusinessTravel = as.factor(BusinessTravel),
    Department = as.factor(Department),
    EducationField = as.factor(EducationField),
    Gender = as.factor(Gender),
    JobRole = as.factor(JobRole),
    MaritalStatus = as.factor(MaritalStatus),
    Over18 = as.factor(Over18),
    OverTime = as.factor(OverTime)
  )

# Reproducible 70/30 split.
set.seed(123)
split <- initial_split(attrition_raw_tbl, prop = 0.7)
trainData <- training(split)
testData <- testing(split)

# Upload both partitions to the H2O cluster.
train_h2o <- as.h2o(trainData)
test_h2o <- as.h2o(testData)

# Step 3: model ----
target <- "Attrition"
predictors <- setdiff(names(trainData), target)

model <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train_h2o,
  family = "binomial", # logistic regression
  lambda = 0,          # no regularization
  seed = 123
)

summary(model)

# Step 4: predictions ----
pred_probs_h2o <- h2o.predict(model, test_h2o)
pred_probs_df <- as.data.frame(pred_probs_h2o)

# Re-level the observed classes against the local test split so truth and
# estimate are guaranteed to share one set of factor levels.
class_levels <- levels(testData[[target]])
pred_probs_df$Actual <- factor(
  as.character(as.data.frame(test_h2o)[[target]]),
  levels = class_levels
)

# H2O labels the probability columns with the response levels ("No",
# "Yes"); "p1" only exists for a 0/1 response.
stopifnot("Yes" %in% names(pred_probs_df))
pred_classes <- ifelse(pred_probs_df[["Yes"]] > 0.5, "Yes", "No")
pred_classes <- factor(pred_classes, levels = class_levels)

# conf_mat() selects truth/estimate as columns of `data`, so the predicted
# classes have to be stored in the data frame first.
pred_probs_df$Predicted <- pred_classes

# Step 5: evaluation ----
# The result is not named `conf_mat`, which would mask yardstick's function.
conf_mat_result <- conf_mat(
  data = pred_probs_df,
  truth = Actual,
  estimate = Predicted
)
print(conf_mat_result)

perf <- h2o.performance(model, newdata = test_h2o)
auc_value <- h2o.auc(perf)
cat("AUC:", auc_value, "\n")

# Step 6: stop H2O ----
h2o.shutdown(prompt = FALSE)
Please update the code to use h2o.performance in Step 5, instead of mean.
# Attrition model, revision 3: test-set AUC via h2o.performance() and a
# confusion matrix via yardstick::conf_mat().
library(tidymodels)
library(h2o)
library(dplyr)
library(yardstick)

# Step 1: H2O session ----
h2o.init()

# Step 2: factor conversion and train/test split ----
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = as.factor(Attrition), # target
    BusinessTravel = as.factor(BusinessTravel),
    Department = as.factor(Department),
    EducationField = as.factor(EducationField),
    Gender = as.factor(Gender),
    JobRole = as.factor(JobRole),
    MaritalStatus = as.factor(MaritalStatus),
    Over18 = as.factor(Over18),
    OverTime = as.factor(OverTime)
  )

set.seed(123) # reproducible split
split <- initial_split(attrition_raw_tbl, prop = 0.7)
trainData <- training(split)
testData <- testing(split)

train_h2o <- as.h2o(trainData)
test_h2o <- as.h2o(testData)

# Step 3: logistic regression in H2O ----
target <- "Attrition"
predictors <- setdiff(names(trainData), target)

model <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train_h2o,
  family = "binomial",
  lambda = 0, # no regularization
  seed = 123
)

summary(model)

# Step 4: score the test set ----
pred_probs_h2o <- h2o.predict(model, test_h2o)
pred_probs_df <- as.data.frame(pred_probs_h2o)

class_levels <- levels(testData[[target]])
pred_probs_df$Actual <- factor(
  as.character(as.data.frame(test_h2o)[[target]]),
  levels = class_levels
)

# Probability of attrition is in the column named after the positive class
# label ("Yes"); the prediction frame has no "p1" column for this response.
stopifnot("Yes" %in% names(pred_probs_df))
pred_classes <- factor(
  ifelse(pred_probs_df[["Yes"]] > 0.5, "Yes", "No"),
  levels = class_levels
)

# The estimate must be a column of the data passed to conf_mat(); a vector
# in the calling environment is not picked up by its column selection.
pred_probs_df$Predicted <- pred_classes

# Step 5: evaluation ----
conf_mat_result <- conf_mat(
  data = pred_probs_df,
  truth = Actual,
  estimate = Predicted
)
print(conf_mat_result)

perf <- h2o.performance(model, newdata = test_h2o)
auc_value <- h2o.auc(perf)
cat("AUC:", auc_value, "\n")

# Step 6: shut down H2O ----
h2o.shutdown(prompt = FALSE)
# Attrition model, revision 4: predicted classes are stored in the
# prediction data frame before the confusion matrix is built.
library(tidymodels)
library(h2o)
library(dplyr)
library(yardstick)

# Step 1: connect to H2O ----
h2o.init()

# Step 2: prepare and split the data ----
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = as.factor(Attrition), # target variable
    BusinessTravel = as.factor(BusinessTravel),
    Department = as.factor(Department),
    EducationField = as.factor(EducationField),
    Gender = as.factor(Gender),
    JobRole = as.factor(JobRole),
    MaritalStatus = as.factor(MaritalStatus),
    Over18 = as.factor(Over18),
    OverTime = as.factor(OverTime)
  )

set.seed(123) # seed for a reproducible split
split <- initial_split(attrition_raw_tbl, prop = 0.7)
trainData <- training(split)
testData <- testing(split)

train_h2o <- as.h2o(trainData)
test_h2o <- as.h2o(testData)

# Step 3: train the GLM ----
target <- "Attrition"
predictors <- setdiff(names(trainData), target)

model <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train_h2o,
  family = "binomial", # logistic regression
  lambda = 0,          # no regularization
  seed = 123
)

summary(model)

# Step 4: predictions on the test frame ----
pred_probs_h2o <- h2o.predict(model, test_h2o)

pred_probs_df <- as.data.frame(pred_probs_h2o)

class_levels <- levels(testData[[target]])
pred_probs_df$Actual <- factor(
  as.character(as.data.frame(test_h2o)[[target]]),
  levels = class_levels
)

# `pred_probs_df$p1` is NULL here: with a "No"/"Yes" response the
# probability columns are named "No" and "Yes". Reading "p1" produced a
# zero-length vector and the column assignment below then failed.
stopifnot("Yes" %in% names(pred_probs_df))
attrition_prob <- pred_probs_df[["Yes"]]
pred_classes <- ifelse(attrition_prob > 0.5, "Yes", "No")
pred_classes <- factor(pred_classes, levels = class_levels)

pred_probs_df$Predicted <- pred_classes

# Step 5: evaluation ----
conf_mat_result <- conf_mat(
  data = pred_probs_df,
  truth = Actual,
  estimate = Predicted
)

print(conf_mat_result)

perf <- h2o.performance(model, newdata = test_h2o)
auc_value <- h2o.auc(perf)
cat("AUC:", auc_value, "\n")

# Step 6: shut down the cluster ----
h2o.shutdown(prompt = FALSE)
# Attrition model, revision 5: the class labels come from H2O's own
# `predict` column instead of thresholding a probability column.
library(tidymodels)
library(h2o)
library(dplyr)
library(yardstick)

# Step 1: start H2O ----
h2o.init()

# Step 2: prepare and split the data ----
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    Attrition = as.factor(Attrition), # target variable
    BusinessTravel = as.factor(BusinessTravel),
    Department = as.factor(Department),
    EducationField = as.factor(EducationField),
    Gender = as.factor(Gender),
    JobRole = as.factor(JobRole),
    MaritalStatus = as.factor(MaritalStatus),
    Over18 = as.factor(Over18),
    OverTime = as.factor(OverTime)
  )

set.seed(123) # seed for a reproducible split
split <- initial_split(attrition_raw_tbl, prop = 0.7)
trainData <- training(split)
testData <- testing(split)

train_h2o <- as.h2o(trainData)
test_h2o <- as.h2o(testData)

# Step 3: train the GLM ----
target <- "Attrition"
predictors <- setdiff(names(trainData), target)

model <- h2o.glm(
  x = predictors,
  y = target,
  training_frame = train_h2o,
  family = "binomial", # logistic regression
  lambda = 0,          # no regularization
  seed = 123
)

summary(model)

# Step 4: predictions ----
pred_probs_h2o <- h2o.predict(model, test_h2o)

pred_probs_df <- as.data.frame(pred_probs_h2o)

# Inspect the prediction frame: `predict` holds the class label and the
# remaining columns hold the per-class probabilities.
str(pred_probs_df)
stopifnot("predict" %in% names(pred_probs_df))

# conf_mat() requires truth and estimate to be factors with identical
# levels. Both are re-levelled against the test split so a class that was
# never predicted still appears as a level.
class_levels <- levels(testData[[target]])

pred_probs_df$Actual <- factor(
  as.character(as.data.frame(test_h2o)[[target]]),
  levels = class_levels
)

pred_classes <- factor(
  as.character(pred_probs_df$predict),
  levels = class_levels
)

pred_probs_df$Predicted <- pred_classes

# Step 5: evaluation ----
conf_mat_result <- conf_mat(
  data = pred_probs_df,
  truth = Actual,
  estimate = Predicted
)

print(conf_mat_result)

perf <- h2o.performance(model, newdata = test_h2o)
auc_value <- h2o.auc(perf)
cat("AUC:", auc_value, "\n")

# Step 6: shut down the cluster ----
h2o.shutdown(prompt = FALSE)