Step 1

Decision trees

library(mlbench)     # provides the Sonar data set
library(rpart)       # decision trees
library(rpart.plot)  # tree plotting
library(caret)       # train(), createDataPartition(), confusionMatrix()
data(Sonar)
m <- rpart(Class ~ ., data = Sonar, method = "class")
rpart.plot(m)
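
Beyond the plot, rpart's complexity table shows how the cross-validated error changes as splits are added, which is useful when deciding whether to prune; a minimal sketch using rpart's built-in helpers:

printcp(m)  # cp table: relative and cross-validated error at each split
plotcp(m)   # plot of cross-validated error against the complexity parameter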

p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##    
## p    M  R
##   M 95 10
##   R 16 87
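
From this table the resubstitution accuracy is (95 + 87) / 208 ≈ 0.875; a quick sketch of computing it directly (these are predictions on the training data, so the figure is optimistic):

acc <- sum(diag(table(p, Sonar$Class))) / nrow(Sonar)
acc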

Training a random forest

set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.8090731  0.6131571
##    2    extratrees  0.8136902  0.6234492
##   31    gini        0.7736954  0.5423516
##   31    extratrees  0.8285153  0.6521921
##   60    gini        0.7597299  0.5140905
##   60    extratrees  0.8157646  0.6255929
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 1.
plot(model)
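
The caret train object keeps both the winning hyperparameters and the refitted forest; a short sketch of the standard accessors for inspecting them:

model$bestTune    # mtry, splitrule and min.node.size chosen by resampling
model$finalModel  # the ranger forest refit on all 208 samples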

model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1) ## Minimum node size; default is 1 for classification
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                       number = 5,
                                       verboseIter = FALSE))
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 166, 167, 167, 167, 165 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    5    gini        0.8076277  0.6098253
##    5    extratrees  0.8416579  0.6784745
##   10    gini        0.7927667  0.5799348
##   10    extratrees  0.8418848  0.6791453
##   20    gini        0.7882316  0.5718852
##   20    extratrees  0.8516355  0.6991879
##   40    gini        0.7880048  0.5716461
##   40    extratrees  0.8371229  0.6695638
##   60    gini        0.7833482  0.5613525
##   60    extratrees  0.8322448  0.6599318
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 1.
plot(model)

set.seed(42)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
plot(model)
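
As a hedged sketch of using the tuned forest downstream, predictions go through predict() on the train object and confusionMatrix() summarises them; note these are predictions on the training data itself, so the accuracy will look better than the cross-validated estimate above:

p_rf <- predict(model, Sonar)       # class predictions from the final ranger fit
confusionMatrix(p_rf, Sonar$Class)  # accuracy, kappa, per-class statistics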

Step 2

Decision Trees

library(readr)  # read_csv()
chocolate <- read_csv("chocolate_tibble.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(chocolate) <- make.names(names(chocolate)) # normalize column names

library(knitr)       # kable()
library(kableExtra)  # kable_styling(); also re-exports the %>% pipe
kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
Table 1. First 5 rows

 final_grade  review_date  cocoa_percent  company_location  bean_type  broad_bean_origin
        3.75         2016           0.63  France            NA         Sao Tome
        2.75         2015           0.70  France            NA         Togo
        3.00         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Peru

Splitting into training and testing data

set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = 0.8,
                                  list = FALSE)
chocolate_train <- chocolate[trainIndex, ]
chocolate_test  <- chocolate[-trainIndex, ]
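
A quick sanity check on the split sizes (the exact counts depend on createDataPartition's stratified rounding):

nrow(chocolate_train)  # roughly 80% of the 1795 rows
nrow(chocolate_test)   # the remaining ~20%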

Constructing a Regression Tree

library(parsnip)  # tidymodels model specification interface

spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
## 
## Computational engine: rpart
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
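
A sketch of evaluating the fitted parsnip model on the held-out data; predict() on a parsnip fit returns a tibble with a .pred column, so the test RMSE can be computed by hand (assuming the test rows contain no categorical values that trip up rpart's handling of unseen levels):

preds <- predict(model, new_data = chocolate_test)
rmse_test <- sqrt(mean((chocolate_test$final_grade - preds$.pred)^2))
rmse_test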

model2 <- rpart(final_grade ~ cocoa_percent + company_location, data=chocolate_train, method="anova")

library(partykit)  # conditional inference trees (ctree)
model3 <- ctree(
  final_grade ~ cocoa_percent + as.factor(company_location),
  data = chocolate_train)
rpart.plot(model2, box.palette="RdBu", shadow.col="gray", nn=TRUE)

plot(model3)

Hyperparameters

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
## 
## n= 1438 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1438 321.2239 3.183936  
##   2) cocoa_percent>=0.885 31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
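
tree_depth is only one of the knobs parsnip exposes for rpart; min_n and cost_complexity can be set the same way. A sketch with illustrative, untuned values:

decision_tree(tree_depth = 4, min_n = 20, cost_complexity = 0.01) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)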

Random Forest

data(package="MASS")
boston<-Boston
dim(boston)
## [1] 506  14
names(boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
# training sample with 300 observations
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
plot(Boston.rf)

importance(Boston.rf)
##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370
varImpPlot(Boston.rf)
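
Because only 300 of the 506 rows were used for training, the remaining rows give an out-of-sample check; a minimal sketch of the test-set mean squared error (the value depends on the random training sample drawn above):

pred <- predict(Boston.rf, newdata = Boston[-train, ])
mse_test <- mean((pred - Boston$medv[-train])^2)
mse_test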