Diamonds and ggplot2

Setup

library(ggplot2)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tibble       3.2.1
## ✔ dplyr        1.1.4     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ✔ recipes      1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
#install.packages("discrim")
#install.packages("naivebayes")
#install.packages("ranger")
library(ranger)
library(naivebayes)
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness

Tidymodels Approach to Classification

Step One: Data Preparation

data("diamonds") # diamonds dataset found in ggplot2 library
diamonds <- diamonds %>%
  mutate(ppc=price/carat) # price per carat
diamonds <- diamonds %>%
  mutate(value = ifelse(ppc < median(ppc), "Low", "High")) %>%
  mutate(value = factor(value, levels = c("Low", "High")))  # Convert 'value' to a factor
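
Because value comes from a median split on ppc, the two classes should be almost perfectly balanced; a quick check with dplyr (loaded with tidymodels) confirms this before splitting:

# Tally each class; a median split should give a near 50/50 balance
diamonds %>%
  count(value)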

Step Two: Split The Data

# Setting a random seed for reproducibility
set.seed(123)

# Splitting the data into 80% training and 20% testing sets
data_split <- initial_split(diamonds, prop = 0.8, strata = value)

# Extracting the training set
train_data <- training(data_split)

# Extracting the testing set
test_data <- testing(data_split)
# Re-apply the factor levels so both splits use the same 'Low'/'High' ordering
train_data$value <- factor(train_data$value, levels = c("Low", "High"))
test_data$value <- factor(test_data$value, levels = c("Low", "High"))
# Check the size of the training and testing sets
cat("Training Set Size:", nrow(train_data), "\n")
## Training Set Size: 43152
cat("Testing Set Size:", nrow(test_data), "\n")
## Testing Set Size: 10788
# Check the distribution of 'value' in each split
table(train_data$value)
## 
##   Low  High 
## 21576 21576
table(test_data$value)
## 
##  Low High 
## 5394 5394
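
The tables above show raw counts; expressing them as proportions makes it explicit that strata = value preserved the 50/50 class balance in each split. A minimal sketch for the training split:

# Class proportions within the training split; stratified sampling
# should keep these very close to the overall rates
train_data %>%
  count(value) %>%
  mutate(prop = n / sum(n))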

Step Three: Define Pre-Processing Steps

# Recipe for data pre-processing
diamonds_recipe <- recipe(value ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors()) %>%  
  step_dummy(all_nominal_predictors(), -all_outcomes())  
# Prep the recipe: estimate the normalization statistics from the training data
prepared_recipe <- prep(diamonds_recipe)
# Inspect the recipe's components
print(head(diamonds_recipe))
## $var_info
## # A tibble: 12 × 4
##    variable type      role      source  
##    <chr>    <list>    <chr>     <chr>   
##  1 carat    <chr [2]> predictor original
##  2 cut      <chr [2]> predictor original
##  3 color    <chr [2]> predictor original
##  4 clarity  <chr [2]> predictor original
##  5 depth    <chr [2]> predictor original
##  6 table    <chr [2]> predictor original
##  7 price    <chr [2]> predictor original
##  8 x        <chr [2]> predictor original
##  9 y        <chr [2]> predictor original
## 10 z        <chr [2]> predictor original
## 11 ppc      <chr [2]> predictor original
## 12 value    <chr [3]> outcome   original
## 
## $term_info
## # A tibble: 12 × 4
##    variable type      role      source  
##    <chr>    <list>    <chr>     <chr>   
##  1 carat    <chr [2]> predictor original
##  2 cut      <chr [2]> predictor original
##  3 color    <chr [2]> predictor original
##  4 clarity  <chr [2]> predictor original
##  5 depth    <chr [2]> predictor original
##  6 table    <chr [2]> predictor original
##  7 price    <chr [2]> predictor original
##  8 x        <chr [2]> predictor original
##  9 y        <chr [2]> predictor original
## 10 z        <chr [2]> predictor original
## 11 ppc      <chr [2]> predictor original
## 12 value    <chr [3]> outcome   original
## 
## $steps
## $steps[[1]]
## • Centering and scaling for: all_numeric_predictors()
## 
## $steps[[2]]
## • Dummy variables from: all_nominal_predictors() and -all_outcomes()
## 
## 
## $template
## # A tibble: 43,152 × 12
##    carat cut       color clarity depth table price     x     y     z   ppc value
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <fct>
##  1  0.7  Ideal     G     VS2      61.6    56  2757  5.7   5.67  3.5  3939. High 
##  2  0.71 Very Good E     VS2      62.4    57  2759  5.68  5.73  3.56 3886. High 
##  3  0.78 Very Good G     SI2      63.8    56  2759  5.81  5.85  3.72 3537. High 
##  4  0.7  Good      E     VS2      57.5    58  2759  5.85  5.9   3.38 3941. High 
##  5  0.73 Very Good E     SI1      61.6    59  2760  5.77  5.78  3.56 3781. High 
##  6  0.75 Premium   E     SI1      59.9    54  2760  6     5.96  3.58 3680  High 
##  7  0.74 Ideal     G     SI1      61.6    55  2760  5.8   5.85  3.59 3730. High 
##  8  0.75 Premium   G     VS2      61.7    58  2760  5.85  5.79  3.59 3680  High 
##  9  0.74 Ideal     I     VVS2     62.3    55  2761  5.77  5.81  3.61 3731. High 
## 10  0.59 Ideal     E     VVS2     62      55  2761  5.38  5.43  3.35 4680. High 
## # ℹ 43,142 more rows
## 
## $levels
## NULL
## 
## $retained
## [1] NA
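
Printing the recipe shows its specification, not the transformed data. To look at the data exactly as the models will receive it (centered and scaled numeric columns plus dummy variables), the prepared recipe can be baked; a minimal sketch using recipes::bake():

# new_data = NULL returns the retained, fully processed training set
bake(prepared_recipe, new_data = NULL) %>%
  head()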

Step Four: Define the Models

# 1. Random Forest Model
rf_model <- rand_forest(trees = 100) %>%
  set_engine("ranger") %>%
  set_mode("classification")
# 2. Decision Tree Model
tree_model <- decision_tree(tree_depth = 5, cost_complexity = 0.01) %>%
  set_engine("rpart") %>%
  set_mode("classification")
# 3. K-Nearest Neighbors Model
knn_model <- nearest_neighbor(neighbors = 5) %>%
  set_engine("kknn") %>%
  set_mode("classification")
# 4. Naive Bayes Model
nb_model <- naive_Bayes(smoothness = 1, Laplace = 1) %>%
  set_engine("naivebayes") %>%
  set_mode("classification")

Printing the Models

print(rf_model)
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   trees = 100
## 
## Computational engine: ranger
print(tree_model)
## Decision Tree Model Specification (classification)
## 
## Main Arguments:
##   cost_complexity = 0.01
##   tree_depth = 5
## 
## Computational engine: rpart
print(knn_model)
## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 5
## 
## Computational engine: kknn
print(nb_model)
## Naive Bayes Model Specification (classification)
## 
## Main Arguments:
##   smoothness = 1
##   Laplace = 1
## 
## Computational engine: naivebayes
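
All hyperparameters above (trees, tree_depth, cost_complexity, neighbors, smoothness, Laplace) are fixed by hand. If tuning were preferred, parsnip accepts tune() placeholders that tune_grid() can later resolve over a resampling object; a sketch for the decision tree only, where tree_tune_spec is a new, illustrative name and no tuning is actually run here:

# Tunable counterpart of tree_model: the placeholders would be filled in
# by tune_grid() over resamples such as vfold_cv(train_data), not run here
tree_tune_spec <- decision_tree(
  tree_depth = tune(),
  cost_complexity = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("classification")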

Step Five: Create Workflows for Models

# Create workflows for each model
rf_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(rf_model)

tree_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(tree_model)

knn_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(knn_model)

nb_workflow <- workflow() %>%
  add_recipe(diamonds_recipe) %>%
  add_model(nb_model)

Printing the Workflows for Each Model

print(rf_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   trees = 100
## 
## Computational engine: ranger
print(tree_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
## 
## Main Arguments:
##   cost_complexity = 0.01
##   tree_depth = 5
## 
## Computational engine: rpart
print(knn_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 5
## 
## Computational engine: kknn
print(nb_workflow)
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: naive_Bayes()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Naive Bayes Model Specification (classification)
## 
## Main Arguments:
##   smoothness = 1
##   Laplace = 1
## 
## Computational engine: naivebayes
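
Since all four workflows pair the same recipe with different model specifications, they could equally be bundled through the workflowsets package (attached with tidymodels above); a minimal sketch, where wf_set is a new name and is not used in the steps that follow:

# One recipe crossed with four model specs; the resulting set can be
# fit or tuned in one call with workflow_map()
wf_set <- workflow_set(
  preproc = list(recipe = diamonds_recipe),
  models = list(
    random_forest = rf_model,
    decision_tree = tree_model,
    knn = knn_model,
    naive_bayes = nb_model
  )
)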

Step Six: Fit Models with the Training Data

rf_fit <- fit(rf_workflow, data = train_data)
tree_fit <- fit(tree_workflow, data = train_data)
knn_fit <- fit(knn_workflow, data = train_data)
nb_fit <- fit(nb_workflow, data = train_data)
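
If the underlying engine objects are of interest, they can be pulled out of the fitted workflows; for example, printing the ranger fit reports its out-of-bag prediction error. A minimal sketch:

# Extract the fitted ranger model from the random forest workflow
extract_fit_engine(rf_fit)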

Step Seven: Evaluate Models with Test Data

AUC Calculations

# Probability predictions on the test set, joined back to the true labels;
# roc_auc() uses .pred_Low because "Low" is the first factor level (the event class)
rf_auc <- predict(rf_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
tree_auc <- predict(tree_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
knn_auc <- predict(knn_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
nb_auc <- predict(nb_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_auc(truth = value, .pred_Low)
# Summarize AUC Results
auc_results <- tibble(
  Model = c("Decision Tree", "KNN", "Random Forest", "Naive Bayes"),
  AUC = c(
    tree_auc$.estimate,
    knn_auc$.estimate,
    rf_auc$.estimate,
    nb_auc$.estimate
  )
)
# Print AUC summary
print(auc_results)
## # A tibble: 4 × 2
##   Model           AUC
##   <chr>         <dbl>
## 1 Decision Tree 1    
## 2 KNN           0.988
## 3 Random Forest 1    
## 4 Naive Bayes   0.985
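
AUC measures how well the predicted probabilities rank the two classes; hard-class performance can also be checked with a confusion matrix and accuracy from yardstick. A minimal sketch for the random forest fit, where rf_class_preds is a new, illustrative name:

# Class predictions on the test set, joined to the true labels
rf_class_preds <- predict(rf_fit, test_data) %>%
  bind_cols(test_data %>% select(value))

# Confusion matrix and overall accuracy
conf_mat(rf_class_preds, truth = value, estimate = .pred_class)
accuracy(rf_class_preds, truth = value, estimate = .pred_class)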

ROC Curves

# ROC curves for each model, tagged with a model label for plotting
rf_roc <- predict(rf_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Random Forest")
tree_roc <- predict(tree_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Decision Tree")
knn_roc <- predict(knn_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "KNN")
## Warning in model.matrix.default(mt2, test, contrasts.arg = contrasts.arg):
## variable '..y' is absent, its contrast will be ignored
nb_roc <- predict(nb_fit, test_data, type = "prob") %>%
  bind_cols(test_data) %>%
  roc_curve(truth = value, .pred_Low) %>%
  mutate(model = "Naive Bayes")
# Combine all ROC data
all_roc_data <- bind_rows(tree_roc, knn_roc, rf_roc, nb_roc)

Plotting the ROC Curves

ggplot(all_roc_data, aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_line(linewidth = 1) +
  geom_abline(linetype = "dashed", color = "darkgray") +  # Reference line for random chance
  labs(
    title = "ROC Curves for Decision Tree, KNN, Random Forest, and Naive Bayes",
    x = "1 - Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Bar plot comparing the AUC values of the four models
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
  geom_bar(stat = "identity", width = 0.6) +  
  geom_text(aes(label = round(AUC, 4)), vjust = -0.3) +  
  labs(
    title = "AUC Comparison of Classification Models",
    x = "Model",
    y = "AUC (Area Under the Curve)"
  ) +
  coord_cartesian(ylim = c(0.98, 1.00)) +  
  theme_minimal() +
  theme(legend.position = "none")  

Results

The models with the best AUC results were the Random Forest and Decision Tree models, both reaching an AUC of 1.0 on the test set, followed closely by KNN (0.988) and Naive Bayes (0.985).
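
These near-perfect scores follow largely from the setup: value is defined as a median split on ppc, and both ppc and price are kept as predictors by the value ~ . formula, so the label can be recovered almost directly from the features. A more demanding comparison would drop those columns before fitting; a minimal sketch of an adjusted recipe (diamonds_recipe_noleak is a new name, and the rest of the pipeline would be re-run unchanged):

# Remove the columns the label was derived from, so the models must rely
# on the remaining physical attributes of each diamond
diamonds_recipe_noleak <- recipe(value ~ ., data = train_data) %>%
  step_rm(price, ppc) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())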