library("mlbench")
data(Sonar)
library("rpart")
m <- rpart(Class ~ ., data = Sonar,
           method = "class")
library("rpart.plot")
rpart.plot(m)

p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##    
## p    M  R
##   M 95 10
##   R 16 87
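The table above is the resubstitution confusion matrix: the tree is scored on the same data it was grown on, so the numbers are optimistic. As a quick sanity check, the overall accuracy it implies can be computed directly; a minimal sketch using the objects defined above:

# Overall accuracy implied by the confusion matrix: (95 + 87) / 208 = 0.875
# (measured on the training data, so optimistic)
mean(p == Sonar$Class)
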
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.8090731  0.6131571
##    2    extratrees  0.8136902  0.6234492
##   31    gini        0.7736954  0.5423516
##   31    extratrees  0.8285153  0.6521921
##   60    gini        0.7597299  0.5140905
##   60    extratrees  0.8157646  0.6255929
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 1.
plot(model)
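
The winning combination reported by print(model) can also be pulled straight out of the train object; a quick sketch using standard caret accessors:

model$bestTune                 # mtry, splitrule and min.node.size of the final model
max(model$results$Accuracy)    # its resampled (bootstrap) accuracy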

model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 166, 167, 167, 167, 165 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    5    gini        0.8076277  0.6098253
##    5    extratrees  0.8416579  0.6784745
##   10    gini        0.7927667  0.5799348
##   10    extratrees  0.8418848  0.6791453
##   20    gini        0.7882316  0.5718852
##   20    extratrees  0.8516355  0.6991879
##   40    gini        0.7880048  0.5716461
##   40    extratrees  0.8371229  0.6695638
##   60    gini        0.7833482  0.5613525
##   60    extratrees  0.8322448  0.6599318
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 1.
plot(model)

set.seed(42)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
plot(model)
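
Because both cross-validated fits used set.seed(42) and the same 5-fold setup, their resampling results can be compared head to head with caret::resamples(). A sketch, assuming the grid-based and tuneLength-based fits were kept in separate objects (here called model_grid and model_tl; the code above overwrites model each time):

# Hypothetical object names: model_grid (custom grid fit), model_tl (tuneLength fit)
res <- resamples(list(grid = model_grid, tuneLength = model_tl))
summary(res)    # accuracy and kappa across the 5 folds
bwplot(res)     # lattice box-and-whisker comparison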

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::select()        masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(parsnip)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object is masked from 'package:parsnip':
## 
##     fit
## 
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
chocolate <- read_csv("chocolate_tibble.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(chocolate) <- make.names(names(chocolate))

kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
Table 1. First 5 rows

 final_grade  review_date  cocoa_percent  company_location  bean_type  broad_bean_origin
        3.75         2016           0.63  France            NA         Sao Tome
        2.75         2015           0.70  France            NA         Togo
        3.00         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Peru
set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = .8,
                                  list = FALSE)
chocolate_train <- chocolate[ trainIndex,]
chocolate_test <- chocolate[-trainIndex,]
spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
## 
## Computational engine: rpart
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
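The parsnip fit can be evaluated on the held-out chocolate_test split. A minimal sketch (for regression, parsnip's predict() returns a tibble with a .pred column; preds is just an illustrative name):

preds <- predict(model, new_data = chocolate_test)
sqrt(mean((chocolate_test$final_grade - preds$.pred)^2))   # test-set RMSE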

model2 <- rpart(final_grade ~ cocoa_percent + company_location, data = chocolate_train, method = "anova")

model3 <- ctree(final_grade ~ cocoa_percent + as.factor(company_location), data = chocolate_train)
rpart.plot(model2, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

plot(model3)

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
## 
## n= 1438 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1438 321.2239 3.183936  
##   2) cocoa_percent>=0.885 31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
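
To see how parsnip's generic arguments map onto the rpart engine (tree_depth is passed through as rpart's maxdepth), translate() shows the call parsnip will build; a quick sketch:

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  translate()   # prints the rpart::rpart() fit template with maxdepth = 1
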
library(MASS)
data(Boston, package = "MASS")
boston <- Boston
dim(boston)
## [1] 506  14
names(boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
plot(Boston.rf)
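
Only the 300 rows indexed by train were used to grow the forest, so the remaining rows give an honest test set. A minimal sketch of the held-out mean squared error:

# Predict on the observations not used for training and compute test MSE
yhat <- predict(Boston.rf, newdata = Boston[-train, ])
mean((yhat - Boston$medv[-train])^2)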

importance(Boston.rf)
##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370
varImpPlot(Boston.rf)
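
The importance() output above only contains IncNodePurity because the forest was grown with the default importance = FALSE. A sketch of refitting with permutation importance as well (Boston.rf2 is just an illustrative name):

# Refit with importance = TRUE to also obtain permutation importance (%IncMSE)
Boston.rf2 <- randomForest(medv ~ ., data = Boston, subset = train,
                           importance = TRUE)
importance(Boston.rf2)    # now reports %IncMSE and IncNodePurity
varImpPlot(Boston.rf2)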