Step 1

Decision trees

library(mlbench)     # provides the Sonar data set
library(rpart)       # decision trees
library(rpart.plot)  # tree plotting
library(caret)       # train(), createDataPartition(), confusionMatrix()
data(Sonar)
m <- rpart(Class ~ ., data = Sonar, method = "class")
rpart.plot(m)
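
Beyond the plot, rpart's complexity table shows how the cross-validated error changes as splits are added, which is useful when deciding whether to prune; a minimal sketch using rpart's built-in helpers:

printcp(m)  # cp table: relative and cross-validated error at each split
plotcp(m)   # plot of cross-validated error against the complexity parameter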

p <- predict(m, Sonar, type = "class")
table(p, Sonar$Class)
##    
## p    M  R
##   M 95 10
##   R 16 87
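
From this table the resubstitution accuracy is (95 + 87) / 208 ≈ 0.875; a quick sketch of computing it directly (these are predictions on the training data, so the figure is optimistic):

acc <- sum(diag(table(p, Sonar$Class))) / nrow(Sonar)
acc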

Training a random forest

set.seed(12)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger")
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 208, 208, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.8090731  0.6131571
##    2    extratrees  0.8136902  0.6234492
##   31    gini        0.7736954  0.5423516
##   31    extratrees  0.8285153  0.6521921
##   60    gini        0.7597299  0.5140905
##   60    extratrees  0.8157646  0.6255929
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 31, splitrule = extratrees
##  and min.node.size = 1.
plot(model)
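
The caret train object keeps both the winning hyperparameters and the refitted forest; a short sketch of the standard accessors for inspecting them:

model$bestTune    # mtry, splitrule and min.node.size chosen by resampling
model$finalModel  # the ranger forest refit on all 208 samples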

model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5)
set.seed(42)
myGrid <- expand.grid(mtry = c(5, 10, 20, 40, 60),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1) ## Minimum node size; default is 1 for classification
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneGrid = myGrid,
               trControl = trainControl(method = "cv",
                                       number = 5,
                                       verboseIter = FALSE))
print(model)
## Random Forest 
## 
## 208 samples
##  60 predictor
##   2 classes: 'M', 'R' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 166, 167, 167, 167, 165 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    5    gini        0.8076277  0.6098253
##    5    extratrees  0.8416579  0.6784745
##   10    gini        0.7927667  0.5799348
##   10    extratrees  0.8418848  0.6791453
##   20    gini        0.7882316  0.5718852
##   20    extratrees  0.8516355  0.6991879
##   40    gini        0.7880048  0.5716461
##   40    extratrees  0.8371229  0.6695638
##   60    gini        0.7833482  0.5613525
##   60    extratrees  0.8322448  0.6599318
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
##  and min.node.size = 1.
plot(model)

set.seed(42)
model <- train(Class ~ .,
               data = Sonar,
               method = "ranger",
               tuneLength = 5,
               trControl = trainControl(method = "cv",
                                        number = 5,
                                        verboseIter = FALSE))
plot(model)
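
As a hedged sketch of using the tuned forest downstream, predictions go through predict() on the train object and confusionMatrix() summarises them; note these are predictions on the training data itself, so the accuracy will look better than the cross-validated estimate above:

p_rf <- predict(model, Sonar)       # class predictions from the final ranger fit
confusionMatrix(p_rf, Sonar$Class)  # accuracy, kappa, per-class statistics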

Step 2

Decision Trees

library(readr)  # read_csv()
chocolate <- read_csv("chocolate_tibble.csv")
## Rows: 1795 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): company_location, bean_type, broad_bean_origin
## dbl (3): final_grade, review_date, cocoa_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(chocolate) <- make.names(names(chocolate)) # normalize column names

library(knitr)       # kable()
library(kableExtra)  # kable_styling(); also re-exports the %>% pipe
kable(head(chocolate, 5), "html", caption = "Table 1. First 5 rows") %>%
  kable_styling("striped")
Table 1. First 5 rows

 final_grade  review_date  cocoa_percent  company_location  bean_type  broad_bean_origin
        3.75         2016           0.63  France            NA         Sao Tome
        2.75         2015           0.70  France            NA         Togo
        3.00         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Togo
        3.50         2015           0.70  France            NA         Peru

Splitting into training and testing data

set.seed(3456)
trainIndex <- createDataPartition(chocolate$final_grade, p = 0.8,
                                  list = FALSE)
chocolate_train <- chocolate[trainIndex, ]
chocolate_test  <- chocolate[-trainIndex, ]
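
A quick sanity check on the split sizes (the exact counts depend on createDataPartition's stratified rounding):

nrow(chocolate_train)  # roughly 80% of the 1795 rows
nrow(chocolate_test)   # the remaining ~20%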

Constructing a Regression Tree

library(parsnip)  # tidymodels model specification interface

spec <- decision_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart")
print(spec)
## Decision Tree Model Specification (regression)
## 
## Computational engine: rpart
model <- spec %>%
  parsnip::fit(formula = final_grade ~ ., data = chocolate_train)
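
A sketch of evaluating the fitted parsnip model on the held-out data; predict() on a parsnip fit returns a tibble with a .pred column, so the test RMSE can be computed by hand (assuming the test rows contain no categorical values that trip up rpart's handling of unseen levels):

preds <- predict(model, new_data = chocolate_test)
rmse_test <- sqrt(mean((chocolate_test$final_grade - preds$.pred)^2))
rmse_test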

model2 <- rpart(final_grade ~ cocoa_percent + company_location, data=chocolate_train, method="anova")

library(partykit)  # conditional inference trees (ctree)
model3 <- ctree(
  final_grade ~ cocoa_percent + as.factor(company_location),
  data = chocolate_train)
rpart.plot(model2, box.palette="RdBu", shadow.col="gray", nn=TRUE)

plot(model3)

Hyperparameters

decision_tree(tree_depth = 1) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)
## parsnip model object
## 
## n= 1438 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1438 321.2239 3.183936  
##   2) cocoa_percent>=0.885 31  11.5000 2.500000 *
##   3) cocoa_percent< 0.885 1407 294.9036 3.199005 *
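
tree_depth is only one of the knobs parsnip exposes for rpart; min_n and cost_complexity can be set the same way. A sketch with illustrative, untuned values:

decision_tree(tree_depth = 4, min_n = 20, cost_complexity = 0.01) %>%
  set_mode("regression") %>%
  set_engine("rpart") %>%
  parsnip::fit(formula = final_grade ~ .,
               data = chocolate_train)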

Random Forest

data(package="MASS")
boston<-Boston
dim(boston)
## [1] 506  14
names(boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
# training sample with 300 observations
train <- sample(1:nrow(Boston), 300)
Boston.rf <- randomForest(medv ~ ., data = Boston, subset = train)
plot(Boston.rf)

importance(Boston.rf)
##         IncNodePurity
## crim       1167.57012
## zn          105.68616
## indus      1382.61992
## chas         76.04384
## nox        1448.30458
## rm         6039.15927
## age         616.75173
## dis        1167.19886
## rad         186.45117
## tax         874.09457
## ptratio    1697.38179
## black       480.12419
## lstat      6329.55370
varImpPlot(Boston.rf)
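
Because only 300 of the 506 rows were used for training, the remaining rows give an out-of-sample check; a minimal sketch of the test-set mean squared error (the value depends on the random training sample drawn above):

pred <- predict(Boston.rf, newdata = Boston[-train, ])
mse_test <- mean((pred - Boston$medv[-train])^2)
mse_test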