Code Along 12 ----

STEP 2: Write ChatGPT Prompts ----

- Goal: Create ChatGPT Prompts to build an ML model

library(tidyverse)

attrition_raw_tbl <- read_csv("00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

If data is not sensitive:

attrition_raw_tbl %>% glimpse()

If data is sensitive:

attrition_raw_tbl %>% slice(0) %>% glimpse()
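Printing zero rows this way shares only the schema (column names and types) with ChatGPT, so no employee-level values leave your machine.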

Prompt 1:

I have a dataset called attrition_raw_tbl that looks like this.

attrition_raw_tbl %>% glimpse()
Rows: 1,470
Columns: 35
$ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 29, 32, 22, 53, 38, 24, …
$ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No"…
$ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel_Rarely", "Travel_Frequently", "Travel_…
$ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358, 216, 1299, 809, 153, 670, 1346, 103, 1…
$ Department               <chr> "Sales", "Research & Development", "Research & Development", "Research & Development…
$ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, 19, 24, 21, 5, 16, 2, 2, 11, 9, 7, 15, …
$ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, 4, 2, 2, 4, 3, 2, 4, 4, 2, 1, 3, 1, 4, …
$ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "Life Sciences", "Medical", "Life Science…
$ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28…
$ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, 2, 1, 4, 1, 4, 1, 3, 1, 3, 2, 3, 2, 3, …
$ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "Male", "Female", "Male", "Male", "Male"…
$ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, 49, 31, 93, 50, 51, 80, 96, 78, 45, 96, …
$ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, 4, 4, 4, 2, 3, 4, 2, 3, 3, 3, 3, 1, 3, …
$ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 3, 1, 1, 5, 1, 2, …
$ JobRole                  <chr> "Sales Executive", "Research Scientist", "Laboratory Technician", "Research Scientis…
$ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, 1, 2, 4, 4, 4, 3, 1, 2, 4, 1, 3, 1, 2, …
$ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Married", "Single", "Married", "Divorced"…
$ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193, 2911, 2661, …
$ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 9964, 13335, 8787, 16577, 16479, 12682, 151…
$ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, 1, 0, 1, 2, 5, 0, 7, 0, 1, 2, 4, 1, 0, …
$ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
$ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "No", "…
$ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 12, 17, 11, 14, 11, 12, 13, 16, 11, 18, …
$ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, …
$ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, 3, 4, 2, 3, 3, 4, 2, 3, 4, 3, 4, 2, 4, …
$ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, …
$ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, …
$ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3, 6, 10, 7, 1, 31, 6, 5, 10, 13, 0, 8, …
$ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, 1, 5, 2, 3, 3, 5, 4, 4, 6, 2, 3, 5, 2, …
$ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, …
$ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4, 10, 6, 1, 25, 3, 4, 5, 12, 0, 4, 14, 1…
$ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2, 9, 2, 0, 8, 2, 2, 3, 6, 0, 2, 13, 2, 7,…
$ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0, 8, 0, 0, 3, 1, 1, 0, 2, 0, 1, 4, 6, 4, …
$ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3, 8, 5, 0, 7, 2, 3, 3, 11, 0, 3, 8, 7, 2,…

The goal is to help predict attrition for employees.

Please write R code to create a predictive model that predicts the probability of attrition.

Load required libraries

library(tidyr)
library(dplyr)
library(caret)

Assuming you’ve already loaded and prepared the dataset as attrition_raw_tbl

Split data into training and testing sets

set.seed(123)  # for reproducibility
train_index <- createDataPartition(attrition_raw_tbl$Attrition, p = 0.7, list = FALSE)
train_data <- attrition_raw_tbl[train_index, ]
test_data <- attrition_raw_tbl[-train_index, ]

Preprocess the data (handle missing values, encode categorical variables)

For simplicity, we’ll assume there are no missing values and all categorical variables are already encoded
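The response glosses over that step. If your copy of the data does need it, a minimal sketch using caret's dummyVars() could look like the following (the object names attrition_complete and attrition_encoded are ours, not part of the ChatGPT response):

# Sketch only: drop incomplete rows, then one-hot encode the categorical predictors
# Remove rows with missing values (preProcess(..., method = "medianImpute") is an alternative)
attrition_complete <- na.omit(attrition_raw_tbl)

# One-hot encode every character/factor predictor, leaving Attrition as the outcome
dummies <- dummyVars(Attrition ~ ., data = attrition_complete)
attrition_encoded <- predict(dummies, newdata = attrition_complete) %>%
  as.data.frame() %>%
  mutate(Attrition = attrition_complete$Attrition)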

Train a logistic regression model

# glm() needs a factor (or 0/1) response; Attrition is character after read_csv, so convert it first
train_data <- train_data %>% mutate(Attrition = factor(Attrition))
test_data <- test_data %>% mutate(Attrition = factor(Attrition))

model <- glm(Attrition ~ ., data = train_data, family = binomial)

Evaluate the model

predictions <- predict(model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "Yes", "No")  # Threshold of 0.5
conf_matrix <- confusionMatrix(
  factor(predicted_classes, levels = levels(test_data$Attrition)),
  test_data$Attrition
)
accuracy <- conf_matrix$overall["Accuracy"]

Output the evaluation metrics

print(conf_matrix)
print(paste("Accuracy:", accuracy))

Use the model to predict attrition probability for new data points

new_data <- data.frame(
  Age = c(30, 35, 40),
  BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  # Add other relevant features for new data points
)
new_predictions <- predict(model, newdata = new_data, type = "response")
print(new_predictions)
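Note that predict() on a model fit with Attrition ~ . needs every predictor column present in new_data, so the two-column frame above is only a placeholder. One workaround (our sketch, not part of the ChatGPT response) is to start from real rows and overwrite the fields you want to vary:

# Sketch: build new_data from existing rows so every predictor column is present
new_data <- test_data %>%
  slice(1:3) %>%
  mutate(
    Age = c(30, 35, 40),
    BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  )
new_predictions <- predict(model, newdata = new_data, type = "response")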

Prompt 2:

Please update the code to use tidymodels instead of caret and to use an h2o model instead of glm.

Load required libraries

library(tidymodels)
library(h2o)

Set up h2o

h2o.init()

Assuming you’ve already loaded and prepared the dataset as attrition_raw_tbl

Split data into training and testing sets

set.seed(123)  # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

Define recipe for preprocessing

attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

Preprocess the data

attrition_prep <- prep(attrition_recipe, training = train_data, retain = TRUE)

Apply preprocessing to training and testing data

train_data_processed <- bake(attrition_prep, new_data = train_data)
test_data_processed <- bake(attrition_prep, new_data = test_data)

Convert to h2o frames

train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)

Train a GLM model using h2o

attrition_glm <- h2o.glm(
  x = setdiff(names(train_h2o), "Attrition"),
  y = "Attrition",
  training_frame = train_h2o,
  family = "binomial"
)

Make predictions on test data

predictions <- h2o.predict(attrition_glm, newdata = test_h2o)  # columns: predict, No, Yes
predicted_classes <- ifelse(as.vector(predictions$Yes) > 0.5, "Yes", "No")  # Threshold of 0.5

Evaluate the model

conf_matrix <- confusionMatrix(  # confusionMatrix() comes from caret
  factor(predicted_classes, levels = c("No", "Yes")),
  factor(test_data$Attrition, levels = c("No", "Yes"))
)
accuracy <- conf_matrix$overall["Accuracy"]

Output the evaluation metrics

print(conf_matrix)
print(paste("Accuracy:", accuracy))

Use the model to predict attrition probability for new data points

new_data <- data.frame(
  Age = c(30, 35, 40),
  BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  # Add other relevant features for new data points
)
new_data_processed <- bake(attrition_prep, new_data = new_data)
new_data_h2o <- as.h2o(new_data_processed)
new_predictions <- h2o.predict(attrition_glm, newdata = new_data_h2o)
print(new_predictions)

Prompt 3:

Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = page, :

Prompt 4:

Please update the code to use h2o.performance in Step 5, instead of mean.

Load required libraries

library(tidymodels)
library(h2o)

Set up h2o

h2o.init()

Assuming you’ve already loaded and prepared the dataset as attrition_raw_tbl

Split data into training and testing sets

set.seed(123)  # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

Define recipe for preprocessing

attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

Preprocess the data

attrition_prep <- prep(attrition_recipe, training = train_data, retain = TRUE)

Apply preprocessing to training and testing data

train_data_processed <- bake(attrition_prep, new_data = train_data)
test_data_processed <- bake(attrition_prep, new_data = test_data)

Convert to h2o frames

train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)

Train a GLM model using h2o

attrition_glm <- h2o.glm(
  x = setdiff(names(train_h2o), "Attrition"),
  y = "Attrition",
  training_frame = train_h2o,
  family = "binomial"
)

Make predictions on test data

predictions <- h2o.predict(attrition_glm, newdata = test_h2o)  # columns: predict, No, Yes
predicted_classes <- ifelse(as.vector(predictions$Yes) > 0.5, "Yes", "No")  # Threshold of 0.5

Evaluate the model using h2o.performance

perf <- h2o.performance(model = attrition_glm, newdata = test_h2o)
accuracy <- h2o.accuracy(perf, thresholds = 0.5)[[1]]  # accuracy at the 0.5 threshold

Output the evaluation metrics

print(perf)
print(paste("Accuracy:", accuracy))
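h2o.performance() returns an H2OBinomialMetrics object, so other standard metrics can be pulled from perf with h2o's accessor functions if you want more than accuracy; for example:

# A few standard accessors on the performance object
h2o.auc(perf)              # area under the ROC curve
h2o.logloss(perf)          # log loss on the test frame
h2o.confusionMatrix(perf)  # confusion matrix at the default threshold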

Use the model to predict attrition probability for new data points

new_data <- data.frame(
  Age = c(30, 35, 40),
  BusinessTravel = c("Travel_Rarely", "Travel_Frequently", "Travel_Rarely")
  # Add other relevant features for new data points
)
new_data_processed <- bake(attrition_prep, new_data = new_data)
new_data_h2o <- as.h2o(new_data_processed)
new_predictions <- h2o.predict(attrition_glm, newdata = new_data_h2o)
print(new_predictions)

attrition_raw_tbl <- read_csv("00_data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

FINAL CHATGPT RESPONSE ----

Load required libraries

library(tidyverse)
library(tidymodels)
library(h2o)

Assuming you have your data loaded as ‘attrition_raw_tbl’

Step 1: Split the data into training and testing sets

set.seed(123)  # for reproducibility
split <- initial_split(attrition_raw_tbl, prop = 0.8, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)

Step 2: Preprocess the data using tidymodels

For simplicity, let's use a recipe to preprocess the data

attrition_recipe <- recipe(Attrition ~ ., data = train_data) %>%
  step_rm(Over18, EmployeeCount, StandardHours) %>%  # Remove columns with zero variance
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

Fit the recipe

attrition_recipe_prep <- prep(attrition_recipe, training = train_data)

Apply the recipe to the data

train_data_processed <- bake(attrition_recipe_prep, new_data = train_data)
test_data_processed <- bake(attrition_recipe_prep, new_data = test_data)

Step 3: Train a model using h2o.automl

h2o.init()

Convert data to H2OFrame

train_h2o <- as.h2o(train_data_processed)
test_h2o <- as.h2o(test_data_processed)

Define predictors and response variable

# Exclude both one-hot outcome columns so Attrition_No cannot leak into the predictors
predictors <- setdiff(names(train_data_processed), c("Attrition_Yes", "Attrition_No"))
response <- "Attrition_Yes"

# Encode the 0/1 response as a factor so AutoML treats this as classification
train_h2o$Attrition_Yes <- as.factor(train_h2o$Attrition_Yes)
test_h2o$Attrition_Yes <- as.factor(test_h2o$Attrition_Yes)

Train AutoML model

automl <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  max_runtime_secs = 3600,  # Maximum runtime in seconds
  seed = 123
)
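Before scoring, you can inspect what AutoML trained and pull out the leading model (standard h2o accessors; we assign it to best_model for the steps below):

# Leaderboard of all models AutoML trained, ranked by the default metric
print(automl@leaderboard)

# The top-ranked model
best_model <- automl@leader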

Step 4: Evaluate the best model from AutoML

predictions <- h2o.predict(best_model, newdata = test_h2o)$predict

Step 5: Assess model performance using h2o.performance

performance <- h2o.performance(best_model, newdata = test_h2o)
print(performance)

Optionally, you can also inspect the best model

summary(best_model)

Shut down H2O

h2o.shutdown()
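By default h2o.shutdown() asks for confirmation in an interactive session; h2o.shutdown(prompt = FALSE) skips the prompt if you want the script to run unattended.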