library("mlbench")
data(Sonar)
library("rpart")
m <- rpart(Class ~ ., data = Sonar,
           method = "class")
library("rpart.plot")
rpart.plot(m)

p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##    
## p    M  R
##   M 95 10
##   R 16 87
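The table above is the resubstitution confusion matrix: the tree is scored on the same data it was grown on, so the numbers are optimistic. As a quick sanity check, the overall accuracy it implies can be computed directly; a minimal sketch using the objects defined above:

# Overall accuracy implied by the confusion matrix: (95 + 87) / 208 = 0.875
# (measured on the training data, so optimistic)
mean(p == Sonar$Class)
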
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.8090731  0.6131571
##    2    extratrees  0.8136902  0.6234492
##   31    gini        0.7736954  0.5423516
##   31    extratrees  0.8285153  0.6521921
##   60    gini        0.7597299  0.5140905
##   60    extratrees  0.8157646  0.6255929
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 1.
plot(model)
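
The winning combination reported by print(model) can also be pulled straight out of the train object; a quick sketch using standard caret accessors:

model$bestTune                 # mtry, splitrule and min.node.size of the final model
max(model$results$Accuracy)    # its resampled (bootstrap) accuracy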

model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 166, 167, 167, 167, 165 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    5    gini        0.8076277  0.6098253
##    5    extratrees  0.8416579  0.6784745
##   10    gini        0.7927667  0.5799348
##   10    extratrees  0.8418848  0.6791453
##   20    gini        0.7882316  0.5718852
##   20    extratrees  0.8516355  0.6991879
##   40    gini        0.7880048  0.5716461
##   40    extratrees  0.8371229  0.6695638
##   60    gini        0.7833482  0.5613525
##   60    extratrees  0.8322448  0.6599318
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 1.
plot(model)

set.seed(42)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
plot(model)
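
Because both cross-validated fits used set.seed(42) and the same 5-fold setup, their resampling results can be compared head to head with caret::resamples(). A sketch, assuming the grid-based and tuneLength-based fits were kept in separate objects (here called model_grid and model_tl; the code above overwrites model each time):

# Hypothetical object names: model_grid (custom grid fit), model_tl (tuneLength fit)
res <- resamples(list(grid = model_grid, tuneLength = model_tl))
summary(res)    # accuracy and kappa across the 5 folds
bwplot(res)     # lattice box-and-whisker comparison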

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::select()        masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(parsnip)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following object is masked from 'package:parsnip':
## 
##     fit
## 
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## Attaching package: 'party'
## 
## The following object is masked from 'package:dplyr':
## 
##     where
chocolate <- read_csv("chocolate_tibble.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(chocolate) <- make.names(names(chocolate))

kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
Table 1. First 5 rows

 final_grade  review_date  cocoa_percent  company_location  bean_type  broad_bean_origin
        3.75         2016           0.63  France            NA         Sao Tome
        2.75         2015           0.70  France            NA         Togo
        3.00         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Peru
set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = .8,
                                  list = FALSE)
chocolate_train <- chocolate[ trainIndex,]
chocolate_test <- chocolate[-trainIndex,]
spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
## 
## Computational engine: rpart
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
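The parsnip fit can be evaluated on the held-out chocolate_test split. A minimal sketch (for regression, parsnip's predict() returns a tibble with a .pred column; preds is just an illustrative name):

preds <- predict(model, new_data = chocolate_test)
sqrt(mean((chocolate_test$final_grade - preds$.pred)^2))   # test-set RMSE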

model2 <- rpart(final_grade ~ cocoa_percent + company_location, data = chocolate_train, method = "anova")

model3 <- ctree(final_grade ~ cocoa_percent + as.factor(company_location), data = chocolate_train)
rpart.plot(model2, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

plot(model3)

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
## 
## n= 1438 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1438 321.2239 3.183936  
##   2) cocoa_percent>=0.885 31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
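
To see how parsnip's generic arguments map onto the rpart engine (tree_depth is passed through as rpart's maxdepth), translate() shows the call parsnip will build; a quick sketch:

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  translate()   # prints the rpart::rpart() fit template with maxdepth = 1
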
library(MASS)
data(Boston, package = "MASS")
boston <- Boston
dim(boston)
## [1] 506  14
names(boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
plot(Boston.rf)
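
Only the 300 rows indexed by train were used to grow the forest, so the remaining rows give an honest test set. A minimal sketch of the held-out mean squared error:

# Predict on the observations not used for training and compute test MSE
yhat <- predict(Boston.rf, newdata = Boston[-train, ])
mean((yhat - Boston$medv[-train])^2)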

importance(Boston.rf)
##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370
varImpPlot(Boston.rf)
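
The importance() output above only contains IncNodePurity because the forest was grown with the default importance = FALSE. A sketch of refitting with permutation importance as well (Boston.rf2 is just an illustrative name):

# Refit with importance = TRUE to also obtain permutation importance (%IncMSE)
Boston.rf2 <- randomForest(medv ~ ., data = Boston, subset = train,
                           importance = TRUE)
importance(Boston.rf2)    # now reports %IncMSE and IncNodePurity
varImpPlot(Boston.rf2)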