Setup
library(ggplot2)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tibble 3.2.1
## ✔ dplyr 1.1.4 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ✔ recipes 1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
#install.packages("discrim")
#install.packages("naivebayes")
#install.packages("ranger")
library(ranger)
library(naivebayes)
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(discrim)
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
Step One: Data Preparation
data("diamonds") # diamonds dataset from the ggplot2 package
diamonds <- diamonds %>%
  mutate(ppc = price / carat) # price per carat
diamonds <- diamonds %>%
  mutate(value = ifelse(ppc < median(ppc), "Low", "High")) %>%
  mutate(value = factor(value, levels = c("Low", "High"))) # convert 'value' to a factor
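As a quick sanity check (not part of the original code), the engineered columns can be summarized before splitting; a median split on ppc should give a roughly balanced value factor.
# Illustrative check of the engineered columns
summary(diamonds$ppc)      # distribution of price per carat
diamonds %>% count(value)  # the median split should be close to 50/50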
Step Two: Split The Data
# Setting a random seed for reproducibility
set.seed(123)
# Splitting the data into 80% training and 20% testing sets
data_split <- initial_split(diamonds, prop = 0.8, strata = value)
# Extracting the training set
train_data <- training(data_split)
# Extracting the testing set
test_data <- testing(data_split)
# Ensure the factor levels of 'value' are consistent across the splits
train_data$value <- factor(train_data$value, levels = c("Low", "High"))
test_data$value <- factor(test_data$value, levels = c("Low", "High"))
# Check the size of the training and testing sets
cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 43152
cat("Testing Set Size:", nrow(test_data), "\n")
## Testing Set Size: 10788
# Check the class distribution of 'value' in each set
table(train_data$value)
##
## Low High
## 21576 21576
table(test_data$value)
##
## Low High
## 5394 5394
# Recipe for data preprocessing
diamonds_recipe <- recipe(value ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())
# Prep the recipe so the preprocessing parameters are estimated from the training data
prepared_recipe <- prep(diamonds_recipe)
# Inspect the recipe's structure (variables, roles, and steps)
print(head(diamonds_recipe))
## $var_info
## # A tibble: 12 × 4
## variable type role source
## <chr> <list> <chr> <chr>
## 1 carat <chr [2]> predictor original
## 2 cut <chr [2]> predictor original
## 3 color <chr [2]> predictor original
## 4 clarity <chr [2]> predictor original
## 5 depth <chr [2]> predictor original
## 6 table <chr [2]> predictor original
## 7 price <chr [2]> predictor original
## 8 x <chr [2]> predictor original
## 9 y <chr [2]> predictor original
## 10 z <chr [2]> predictor original
## 11 ppc <chr [2]> predictor original
## 12 value <chr [3]> outcome original
##
## $term_info
## # A tibble: 12 × 4
## variable type role source
## <chr> <list> <chr> <chr>
## 1 carat <chr [2]> predictor original
## 2 cut <chr [2]> predictor original
## 3 color <chr [2]> predictor original
## 4 clarity <chr [2]> predictor original
## 5 depth <chr [2]> predictor original
## 6 table <chr [2]> predictor original
## 7 price <chr [2]> predictor original
## 8 x <chr [2]> predictor original
## 9 y <chr [2]> predictor original
## 10 z <chr [2]> predictor original
## 11 ppc <chr [2]> predictor original
## 12 value <chr [3]> outcome original
##
## $steps
## $steps[[1]]
## • Centering and scaling for: all_numeric_predictors()
##
## $steps[[2]]
## • Dummy variables from: all_nominal_predictors() and -all_outcomes()
##
##
## $template
## # A tibble: 43,152 × 12
## carat cut color clarity depth table price x y z ppc value
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5 3939. High
## 2 0.71 Very Good E VS2 62.4 57 2759 5.68 5.73 3.56 3886. High
## 3 0.78 Very Good G SI2 63.8 56 2759 5.81 5.85 3.72 3537. High
## 4 0.7 Good E VS2 57.5 58 2759 5.85 5.9 3.38 3941. High
## 5 0.73 Very Good E SI1 61.6 59 2760 5.77 5.78 3.56 3781. High
## 6 0.75 Premium E SI1 59.9 54 2760 6 5.96 3.58 3680 High
## 7 0.74 Ideal G SI1 61.6 55 2760 5.8 5.85 3.59 3730. High
## 8 0.75 Premium G VS2 61.7 58 2760 5.85 5.79 3.59 3680 High
## 9 0.74 Ideal I VVS2 62.3 55 2761 5.77 5.81 3.61 3731. High
## 10 0.59 Ideal E VVS2 62 55 2761 5.38 5.43 3.35 4680. High
## # ℹ 43,142 more rows
##
## $levels
## NULL
##
## $retained
## [1] NA
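To see what the preprocessing actually produces, the prepped recipe can be applied back to the retained training data. The short check below is illustrative and was not part of the original output; it assumes prepared_recipe from above.
# Illustrative check: bake the prepped recipe on the retained training data
baked_train <- bake(prepared_recipe, new_data = NULL)
# Numeric predictors are now centered and scaled; ordered factors are dummy-encoded
glimpse(baked_train)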
# 1. Random Forest model
rf_model <- rand_forest(trees = 100) %>%
  set_engine("ranger") %>%
  set_mode("classification")
# 2. Decision Tree model
tree_model <- decision_tree(tree_depth = 5, cost_complexity = 0.01) %>%
  set_engine("rpart") %>%
  set_mode("classification")
# 3. K-Nearest Neighbors model
knn_model <- nearest_neighbor(neighbors = 5) %>%
  set_engine("kknn") %>%
  set_mode("classification")
# 4. Naive Bayes model
nb_model <- naive_Bayes(smoothness = 1, Laplace = 1) %>%
  set_engine("naivebayes") %>%
  set_mode("classification")
print(rf_model)
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 100
##
## Computational engine: ranger
print(tree_model)
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 0.01
## tree_depth = 5
##
## Computational engine: rpart
print(knn_model)
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 5
##
## Computational engine: kknn
print(nb_model)
## Naive Bayes Model Specification (classification)
##
## Main Arguments:
## smoothness = 1
## Laplace = 1
##
## Computational engine: naivebayes
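The hyperparameters above (trees = 100, neighbors = 5, and so on) are fixed by hand. As an aside, parsnip also allows marking them for tuning; the snippet below is an illustrative sketch, not part of the original analysis, and the object name knn_tune_model is hypothetical.
# Illustrative alternative: mark a hyperparameter for tuning instead of fixing it
knn_tune_model <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")
# tune_grid() with a resampling object and a grid could then select 'neighbors'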
# Create a workflow for each model, pairing the recipe with the model specification
rf_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(rf_model)
tree_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(tree_model)
knn_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(knn_model)
nb_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(nb_model)
print(rf_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 100
##
## Computational engine: ranger
print(tree_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 0.01
## tree_depth = 5
##
## Computational engine: rpart
print(knn_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 5
##
## Computational engine: kknn
print(nb_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: naive_Bayes()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Naive Bayes Model Specification (classification)
##
## Main Arguments:
## smoothness = 1
## Laplace = 1
##
## Computational engine: naivebayes
# Fit each workflow on the training data
rf_fit <- fit(rf_workflow, data = train_data)
tree_fit <- fit(tree_workflow, data = train_data)
knn_fit <- fit(knn_workflow, data = train_data)
nb_fit <- fit(nb_workflow, data = train_data)
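As an alternative to fitting each workflow separately, the four models could be bundled into a single workflow set and resampled together. The sketch below is illustrative only; the object names, the 5-fold cross-validation, and the metric choice are assumptions rather than part of the original analysis.
# Illustrative alternative using workflowsets (loaded with tidymodels)
diamond_wf_set <- workflow_set(
  preproc = list(rec = diamonds_recipe),
  models = list(
    random_forest = rf_model,
    decision_tree = tree_model,
    knn = knn_model,
    naive_bayes = nb_model
  )
)
set.seed(123)
folds <- vfold_cv(train_data, v = 5, strata = value)
wf_set_results <- diamond_wf_set %>%
  workflow_map("fit_resamples", resamples = folds, metrics = metric_set(roc_auc))
rank_results(wf_set_results)  # ranks the workflows by cross-validated ROC AUC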
AUC Calculations
rf_auc <- predict(rf_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
tree_auc <- predict(tree_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
knn_auc <- predict(knn_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
nb_auc <- predict(nb_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
# Summarize the AUC results
auc_results <- tibble(
  Model = c("Decision Tree", "KNN", "Random Forest", "Naive Bayes"),
  AUC = c(
    tree_auc$.estimate,
    knn_auc$.estimate,
    rf_auc$.estimate,
    nb_auc$.estimate
  )
)
# Print the AUC summary
print(auc_results)
## # A tibble: 4 × 2
## Model AUC
## <chr> <dbl>
## 1 Decision Tree 1
## 2 KNN 0.988
## 3 Random Forest 1
## 4 Naive Bayes 0.985
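AUC summarizes the probability rankings; it can also help to look at hard class predictions. The check below is an illustrative addition (not in the original output) for the random forest fit, using yardstick's accuracy() and conf_mat().
# Illustrative follow-up: hard-class metrics for the random forest fit
rf_class_results <- predict(rf_fit, test_data) %>%  # returns .pred_class
  bind_cols(test_data)
rf_class_results %>% accuracy(truth = value, estimate = .pred_class)
rf_class_results %>% conf_mat(truth = value, estimate = .pred_class)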
ROC Curves
rf_roc <- predict(rf_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Random Forest")
tree_roc <- predict(tree_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Decision Tree")
knn_roc <- predict(knn_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "KNN")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
nb_roc <- predict(nb_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Naive Bayes")
# Combine the ROC data from all four models
all_roc_data <- bind_rows(tree_roc, knn_roc, rf_roc, nb_roc)
ggplot(all_roc_data, aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_line(linewidth = 1) +
  geom_abline(linetype = "dashed", color = "darkgray") + # reference line for random chance
  labs(
    title = "ROC Curves for Decision Tree, KNN, Random Forest, and Naive Bayes",
    x = "1 - Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# Bar plot comparing AUC values across models
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(label = round(AUC, 4)), vjust = -0.3) +
  labs(
    title = "AUC Comparison of Classification Models",
    x = "Model",
    y = "AUC (Area Under the Curve)"
  ) +
  coord_cartesian(ylim = c(0.98, 1.00)) +
  theme_minimal() +
  theme(legend.position = "none")
Results
The models with the best AUC results were the Random Forest and Decision Tree models, both reaching an AUC of 1 on the test set, followed closely by KNN (0.988) and Naive Bayes (0.985). These near-perfect scores are largely expected here: value is defined directly from ppc, and the recipe keeps ppc (along with price and carat) as predictors, so the models can essentially read the label off the features. Dropping those columns would make the comparison more meaningful; a sketch follows.
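A minimal sketch of a recipe without the leaking columns, assuming the same split and workflow structure as above (the step_rm() call and the object name are illustrative):
# Illustrative leakage-free recipe: drop ppc and price before modeling
diamonds_recipe_noleak <- recipe(value ~ ., data = train_data) %>%
  step_rm(ppc, price) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())
# Refitting the workflows with this recipe would give a more realistic comparison.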