library("mlbench")
data(Sonar)
library("rpart")
m <- rpart(Class ~ ., data = Sonar,
method = "class")
library("rpart.plot")
rpart.plot(m)
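A tree grown this way can usually be simplified. The sketch below (not part of the original code) uses rpart's own cross-validation table to pick a pruning point; `printcp()` and `prune()` are standard rpart functions, and `best_cp` and `m_pruned` are illustrative names.

# Show the complexity-parameter table from rpart's internal cross-validation,
# then prune at the cp with the lowest cross-validated error (xerror).
printcp(m)
best_cp <- m$cptable[which.min(m$cptable[, "xerror"]), "CP"]
m_pruned <- prune(m, cp = best_cp)
rpart.plot(m_pruned)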

# Resubstitution predictions: the tree is scored on its own training data.
p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##
## p    M  R
##   M 95 10
##   R 16 87
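As a quick sanity check (added here, not in the original), the overall accuracy can be read off the confusion matrix; because the tree is scored on its own training data, this is a resubstitution estimate and will be optimistic.

# Proportion of correct predictions (diagonal of the confusion matrix).
tab <- table(p, Sonar$Class)
sum(diag(tab)) / sum(tab)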
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(12)
# Tune a random forest (via the ranger engine) using caret's default
# bootstrap resampling.
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
## Resampling results across tuning parameters:
##
##   mtry  splitrule   Accuracy   Kappa
##    2    gini        0.8090731  0.6131571
##    2    extratrees  0.8136902  0.6234492
##   31    gini        0.7736954  0.5423516
##   31    extratrees  0.8285153  0.6521921
##   60    gini        0.7597299  0.5140905
##   60    extratrees  0.8157646  0.6255929
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
## and min.node.size = 1.
plot(model)
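Besides plotting, the selected tuning values and the full resampling grid can be read directly off the train object; a small sketch using caret's documented `bestTune` and `results` components:

# Winning combination of tuning parameters and the per-combination results.
model$bestTune
head(model$results)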

# tuneLength = 5 asks caret to try 5 values per tuning parameter.
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
# Custom tuning grid, evaluated with 5-fold cross-validation.
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
print(model)
## Random Forest
##
## 208 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 166, 167, 167, 167, 165
## Resampling results across tuning parameters:
##
##   mtry  splitrule   Accuracy   Kappa
##    5    gini        0.8076277  0.6098253
##    5    extratrees  0.8416579  0.6784745
##   10    gini        0.7927667  0.5799348
##   10    extratrees  0.8418848  0.6791453
##   20    gini        0.7882316  0.5718852
##   20    extratrees  0.8516355  0.6991879
##   40    gini        0.7880048  0.5716461
##   40    extratrees  0.8371229  0.6695638
##   60    gini        0.7833482  0.5613525
##   60    extratrees  0.8322448  0.6599318
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
## and min.node.size = 1.
plot(model)

set.seed(42)
# Same 5-fold CV, but letting tuneLength choose the grid of candidates.
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
plot(model)
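To get class predictions out of the tuned model, predict() works on the train object directly, and caret's confusionMatrix() adds per-class statistics. A sketch (p_rf is an illustrative name); note these are again resubstitution predictions on the training data:

# Predict with the cross-validated final model, then summarise the errors.
p_rf <- predict(model, Sonar)
confusionMatrix(p_rf, Sonar$Class)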

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(parsnip)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following object is masked from 'package:parsnip':
##
## fit
##
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:dplyr':
##
## where
chocolate <- read_csv("chocolate_tibble.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
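Before modelling, it can help to eyeball the column types; a one-liner (added here, not in the original) using dplyr's glimpse(), already loaded via the tidyverse:

# Compact overview: one line per column with its type and first few values.
glimpse(chocolate)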
names(chocolate) <- make.names(names(chocolate))
kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
Table 1. First 5 rows

| final_grade | review_date | cocoa_percent | company_location | bean_type | broad_bean_origin |
|------------:|------------:|--------------:|:-----------------|:----------|:------------------|
|        3.75 |        2016 |          0.63 | France           | NA        | Sao Tome          |
|        2.75 |        2015 |          0.70 | France           | NA        | Togo              |
|        3.00 |        2015 |          0.70 | France           | NA        | Togo              |
|        3.50 |        2015 |          0.70 | France           | NA        | Togo              |
|        3.50 |        2015 |          0.70 | France           | NA        | Peru              |
set.seed(3456)
# 80/20 split, stratified on the outcome by createDataPartition().
trainIndex <- createDataPartition(chocolate$final_grade, p = .8,
                                  list = FALSE)
chocolate_train <- chocolate[trainIndex, ]
chocolate_test  <- chocolate[-trainIndex, ]
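A quick check, added here, that the partition behaves as expected; roughly 80% of the 1795 rows should land in the training set:

# Sizes of the two partitions.
nrow(chocolate_train)
nrow(chocolate_test)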
spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
##
## Computational engine: rpart
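decision_tree() exposes three main tuning arguments, which the rpart engine maps onto rpart's own controls; a sketch with illustrative values:

# cost_complexity -> rpart's cp, tree_depth -> maxdepth, min_n -> minsplit.
decision_tree(cost_complexity = 0.01, tree_depth = 5, min_n = 10) %>%
  set_mode("regression") %>%
  set_engine("rpart")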
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
model2 <- rpart(final_grade ~ cocoa_percent + company_location,
                data = chocolate_train, method = "anova")
model3 <- ctree(final_grade ~ cocoa_percent + as.factor(company_location),
                data = chocolate_train)
rpart.plot(model2, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

plot(model3)

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
##
## n= 1438
##
## node), split, n, deviance, yval
##       * denotes terminal node
##
## 1) root 1438 321.2239 3.183936
##   2) cocoa_percent>=0.885   31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
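chocolate_test was split off earlier but never used; a minimal out-of-sample check for the parsnip fit above, computing RMSE by hand rather than via the yardstick package (preds is an illustrative name):

# predict() on a parsnip fit returns a tibble with a .pred column.
preds <- predict(model, new_data = chocolate_test)$.pred
sqrt(mean((chocolate_test$final_grade - preds)^2))  # test-set RMSE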
library(MASS)
data(package = "MASS")  # lists the data sets shipped with MASS
boston <- Boston
dim(boston)
## [1] 506 14
names(boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# Train on a random 300-row subset; no seed is set here, so the exact
# numbers below will vary from run to run.
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
plot(Boston.rf)
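Only 300 rows went into training, so the remaining ones can act as a test set; a short sketch (yhat is an illustrative name):

# Mean squared error on the held-out rows.
yhat <- predict(Boston.rf, newdata = Boston[-train, ])
mean((yhat - Boston$medv[-train])^2)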

importance(Boston.rf)
##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370
varImpPlot(Boston.rf)
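randomForest also ships tuneRF(), which searches over mtry using the out-of-bag error; a minimal sketch, with illustrative stepFactor and ntreeTry values (column 14 of Boston is the response, medv):

# Step mtry up and down from its default, keeping the value with the
# lowest out-of-bag error.
tuneRF(Boston[train, -14], Boston$medv[train],
       stepFactor = 1.5, ntreeTry = 300)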
