In this step, we will install and load the required libraries for data preprocessing and model building.
# Install packages (only if not already installed).
# requireNamespace() returns FALSE instead of erroring when a package is
# missing, so each package is downloaded at most once rather than on every run.
for (pkg in c("tidymodels", "h2o")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org/")
  }
}
# Load libraries
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.10
## ✔ dials 1.2.1 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# attrition_raw_tbl is already loaded into the environment.
# Inspect it before preprocessing: per-column summary statistics, then the
# column types and overall structure.
summary(attrition_raw_tbl)
str(attrition_raw_tbl)
# Convert the listed columns to factors in a single across() call, then
# drop every row that contains a missing value.
attrition_raw_tbl <- attrition_raw_tbl %>%
  mutate(
    across(
      all_of(c(
        "Attrition", "BusinessTravel", "Department", "EducationField",
        "Gender", "JobRole", "MaritalStatus", "OverTime", "Over18"
      )),
      as.factor
    )
  ) %>%
  drop_na()
# Set seed for reproducibility of the random split.
set.seed(123)

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
split <- initial_split(attrition_raw_tbl, prop = 0.8)
train_data <- training(split)
test_data <- testing(split)

# Check the number of rows and columns in each partition.
dim(train_data)
dim(test_data)
# Start (or connect to) a local H2O cluster.
# nthreads = -1 lets H2O use all available CPU cores; max_mem_size caps the
# memory available to the H2O JVM at 4 GB.
h2o.init(nthreads = -1, max_mem_size = "4G")

# Upload the training and test partitions to the cluster as H2OFrames.
train_h2o <- as.h2o(train_data)
test_h2o <- as.h2o(test_data)

# Preview the first rows of the uploaded training frame.
h2o.head(train_h2o)
# Target column, and every other column of the training frame as a predictor.
response <- "Attrition"
predictors <- setdiff(names(train_h2o), response)

# Fit a random forest on the training frame.
rf_model <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  ntrees = 50,    # number of trees in the forest
  max_depth = 20, # maximum depth of each tree
  min_rows = 10,  # minimum number of observations required in a leaf
  seed = 123      # fixed seed so the fit is repeatable
)

# Print the fitted model's details.
rf_model
# Score the held-out test frame and pull the predictions into a local
# data frame.
predictions <- h2o.predict(rf_model, test_h2o)
predictions_df <- as.data.frame(predictions)

# Confusion matrix of the model evaluated on the test frame.
confusion_matrix <- h2o.confusionMatrix(rf_model, test_h2o)
print(confusion_matrix)

# Performance metrics of the model on the test frame.
perf <- h2o.performance(rf_model, test_h2o)
print(perf)

# Full model summary.
summary(rf_model)

# Shut down the H2O cluster without asking for interactive confirmation.
h2o.shutdown(prompt = FALSE)