In this lesson students will learn …
Motivation/Background:
Sources:
# Read the Pima diabetes data and treat the outcome as a factor
pima <- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/diabetes.csv",
                 header = TRUE)
pima$Outcome <- as.factor(pima$Outcome)
str(pima)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
We will use stratified splitting to create a 70/30 training/testing split to build our models.
Note: We use the same seed as in the kNN example so that the results can be compared.
library(caret)
# Split the data into training and test set
set.seed(252)
caretSamp <- createDataPartition(pima$Outcome,
                                 p = 0.7,
                                 list = FALSE)
## SPLIT TESTING AND TRAINING
trainCaret <- pima[caretSamp, ]
testCaret <- pima[-caretSamp, ]
set.seed(252)
library(rpart)
classTree <- rpart(Outcome ~ .,
                   data = trainCaret,
                   method = "class")
### PLOT TREE
library(rpart.plot)
rpart.plot(classTree)
### Plot CP
plotcp(classTree)
printcp(classTree)
##
## Classification tree:
## rpart(formula = Outcome ~ ., data = trainCaret, method = "class")
##
## Variables actually used in tree construction:
## [1] Age BMI DiabetesPedigreeFunction
## [4] Glucose Insulin Pregnancies
##
## Root node error: 188/538 = 0.34944
##
## n= 538
##
## CP nsplit rel error xerror xstd
## 1 0.329787 0 1.00000 1.00000 0.058825
## 2 0.026596 1 0.67021 0.73936 0.054006
## 3 0.017730 4 0.59043 0.70213 0.053089
## 4 0.010638 9 0.50000 0.73404 0.053879
## 5 0.010000 12 0.46809 0.75000 0.054257
## WHICH CP
minCP<-classTree$cptable[which.min(classTree$cptable[,"xerror"]),"CP"]
This finds the CP value that minimizes the cross-validated error (xerror); we then prune the tree at that value.
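As a quick check (a sketch; output not shown), we can print the chosen CP and the row of the CP table it comes from:
## CHECK THE SELECTED CP (sketch; output not shown)
minCP
classTree$cptable[which.min(classTree$cptable[, "xerror"]), ]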
library(rpart.plot)
prune_classTree <- prune(classTree, cp = minCP )
rpart.plot(prune_classTree )
## DEFAULT TREE
### PREDICT
predTree1<-predict(classTree , testCaret, type="class")
### CONFUSION MATRIX
cmTree1<-table(testCaret$Outcome, predTree1)
cmTree1
## predTree1
## 0 1
## 0 114 36
## 1 30 50
#### CORRECT RATE
mean(testCaret$Outcome==predTree1)
## [1] 0.7130435
## PRUNED TREE
### PREDICT
predTree2<-predict(prune_classTree , testCaret, type="class")
### CONFUSION MATRIX
cmTree2<-table(testCaret$Outcome, predTree2)
cmTree2
## predTree2
## 0 1
## 0 124 26
## 1 42 38
#### CORRECT RATE
mean(testCaret$Outcome==predTree2)
## [1] 0.7043478
Try a second random split.
Try a third random split.
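For these exercises, here is a minimal sketch of one way to repeat the split and refit the default tree, assuming we simply change the seed (253 is an arbitrary choice):
## SECOND SPLIT (sketch; seed 253 is an arbitrary choice)
set.seed(253)
caretSamp2 <- createDataPartition(pima$Outcome, p = 0.7, list = FALSE)
trainCaret2 <- pima[caretSamp2, ]
testCaret2 <- pima[-caretSamp2, ]
classTree2 <- rpart(Outcome ~ ., data = trainCaret2, method = "class")
predTree2split <- predict(classTree2, testCaret2, type = "class")
mean(testCaret2$Outcome == predTree2split)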
Bagging stands for "bootstrap aggregation": random bootstrap samples (drawn with replacement) of the training data are used to grow many trees.
The big idea is that averaging a set of observations reduces variance; however, you lose the interpretability of a single tree.
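To make the resampling idea concrete, here is a minimal sketch of drawing one bootstrap sample by hand and fitting a single tree to it; bagging simply repeats this many times and combines the trees' predictions by majority vote:
## ONE BOOTSTRAP SAMPLE BY HAND (sketch)
set.seed(252)
bootIdx <- sample(nrow(trainCaret), replace = TRUE)   # row indices drawn with replacement
bootTree <- rpart(Outcome ~ ., data = trainCaret[bootIdx, ], method = "class")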
### BAGGING
library(caret)
library(ipred) #includes the bagging function
library(rpart)
set.seed(252)
pimaBag <- bagging(Outcome ~ .,
                   data = trainCaret,
                   nbagg = 150,
                   coob = TRUE,
                   control = rpart.control(minsplit = 2, cp = 0))
## PREDICT
predBag<-predict(pimaBag, testCaret, type="class")
## CONFUSION MATRIX
cmBag<-table(testCaret$Outcome, predBag)
cmBag
## predBag
## 0 1
## 0 123 27
## 1 31 49
## CORRECT RATE
mean(testCaret$Outcome==predBag)
## [1] 0.7478261
library(tidyverse)
### BAG
set.seed(252)
caretBag <- train(Outcome ~ .,
                  data = trainCaret,
                  method = "treebag",
                  trControl = trainControl("cv", number = 10),
                  importance = TRUE)
predCaretBag <- caretBag %>% predict(testCaret)
# CONFUSION MATRIX
table(predCaretBag, testCaret$Outcome)
##
## predCaretBag 0 1
## 0 126 33
## 1 24 47
# CORRECT RATE
mean(predCaretBag == testCaret$Outcome)
## [1] 0.7521739
Problem with bagging: by averaging many trees we lose the easy interpretability of a single tree. Variable importance plots give back a sense of which predictors matter most.
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
vip(caretBag)
The big idea is to improve on bagging by creating decorrelated trees.
When building each decision tree, every split considers only a random subset of the predictors, \(m \approx \sqrt{p}\), and a new random subset is drawn for each split. Therefore, no single split is allowed to consider a majority of the available predictors.
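For this data there are \(p = 8\) predictors; a quick sketch of the classic default for the number considered at each split:
## PREDICTORS CONSIDERED PER SPLIT (sketch)
p <- ncol(trainCaret) - 1   # 8 predictors (Outcome excluded)
floor(sqrt(p))              # classic default for mtry in classification forests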
### RF
set.seed(252)
caretRF <- train(Outcome ~ .,
                 data = trainCaret,
                 method = "rf",
                 trControl = trainControl("cv", number = 10),
                 importance = TRUE)
# Best tuning parameter
caretRF$bestTune
## mtry
## 1 2
# Final model
caretRF$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.98%
## Confusion matrix:
## 0 1 class.error
## 0 297 53 0.1514286
## 1 76 112 0.4042553
## PREDICT
predCaretRF <- caretRF %>% predict(testCaret)
## TABLE
table(predCaretRF, testCaret$Outcome)
##
## predCaretRF 0 1
## 0 125 29
## 1 25 51
## MEAN
mean(predCaretRF == testCaret$Outcome)
## [1] 0.7652174
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Plot MeanDecreaseAccuracy
varImpPlot(caretRF$finalModel, type = 1)
# Plot MeanDecreaseGini
varImpPlot(caretRF$finalModel, type = 2)
#library(vip)
#vip(caretRF$finalModel, type = 1)
#vip(caretRF$finalModel, type = 2)
The big idea behind boosting is that trees are grown sequentially and the model learns slowly: each tree is grown using information from the previously grown trees, fitting to the residuals of the current model on the original data (no bootstrap sampling), and only a scaled-down version of each new tree is added to the fit. This tends to perform well.
Three tuning parameters: the number of trees \(B\), the shrinkage (learning-rate) parameter \(\lambda\), and the number of splits in each tree \(d\) (the interaction depth).
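In caret's xgbTree method (used below) these correspond to the nrounds, eta, and max_depth columns of the tuning grid; the remaining grid columns must also be supplied. A minimal sketch of building such a grid by hand, with arbitrary illustrative values:
## EXPLICIT TUNING GRID (sketch; values are illustrative)
boostGrid <- expand.grid(nrounds = c(50, 100),     # number of trees B
                         eta = c(0.1, 0.4),        # shrinkage lambda
                         max_depth = c(1, 2, 3),   # tree depth d
                         gamma = 0,
                         colsample_bytree = 0.6,
                         min_child_weight = 1,
                         subsample = 0.75)
# This grid could be passed to train() via tuneGrid = boostGrid;
# below we instead let caret choose its own default grid.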
library(tidyverse)
#install.packages("xgboost")
library(xgboost)
caretBoost <- train(Outcome ~ .,
                    data = trainCaret,
                    method = "xgbTree",
                    trControl = trainControl("cv", number = 10),
                    verbosity = 0)
# Best tuning parameter
caretBoost$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 58 50 1 0.4 0 0.6 1 0.75
# Make predictions on the test data
predCaretBoost <- caretBoost %>% predict(testCaret)
## TABLE
table(predCaretBoost, testCaret$Outcome)
##
## predCaretBoost 0 1
## 0 124 35
## 1 26 45
## MEAN
mean(predCaretBoost == testCaret$Outcome)
## [1] 0.7347826
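To close, a sketch that gathers the test-set correct rates computed above into one table for side-by-side comparison (no new results; it just reuses the prediction objects created earlier):
## COMPARE CORRECT RATES (sketch; reuses the prediction objects above)
data.frame(model = c("Default tree", "Pruned tree", "Bagging (ipred)",
                     "Bagging (caret)", "Random forest", "Boosting"),
           accuracy = c(mean(testCaret$Outcome == predTree1),
                        mean(testCaret$Outcome == predTree2),
                        mean(testCaret$Outcome == predBag),
                        mean(testCaret$Outcome == predCaretBag),
                        mean(testCaret$Outcome == predCaretRF),
                        mean(testCaret$Outcome == predCaretBoost)))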