install.packages("pacman")
Error in install.packages : Updating loaded packages
library("pacman")
p_load(ggplot2, dplyr, tidyr, tidyverse, ISLR, ISLR2, tibble, readr, purrr,
       stringr, forcats, lubridate, glmnet, leaps, caret, klaR, pls, mlbench,
       randomForest, tree, gbm, BART, MASS, rattle, rpart)
# Grid of class-1 proportions from 0 to 1
p <- seq(0, 1, 0.01)
# Compute each impurity measure. With two classes, p_hat_m1 = 1 - p_hat_m2,
# so each measure can be written as a function of p alone.
classification.error <- 1 - pmax(p, 1 - p)
gini.index <- 2 * p * (1 - p)
# Cross-entropy; 0 * log(0) evaluates to NaN in R, so the curve has small
# gaps at p = 0 and p = 1 (matplot simply skips those points).
entropy <- -(p * log(p) + (1 - p) * log(1 - p))
# Build the plot
matplot(p, cbind(classification.error, gini.index, entropy),
        col = c("blue", "green", "red"), pch = 16, lty = 1,
        main = "Classification Tree Measures with 2 classes",
        xlab = "Proportion of training observations",
        ylab = "Measure Values", ylim = c(0, 1.1))
legend(x = "top", pch = 16, title = "Measures",
       col = c("blue", "green", "red"),
       legend = c("Classification Error Rate", "Gini Index", "Entropy"),
       box.lty = 1)
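# Quick numeric spot-check (added for illustration): all three measures
# peak at p = 0.5, where the error rate and Gini index equal 0.5 and the
# entropy equals log(2), roughly 0.693.
c(error = 1 - pmax(0.5, 1 - 0.5), gini = 2 * 0.5 * (1 - 0.5),
  entropy = -2 * 0.5 * log(0.5))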
set.seed(997)
# Note: times = 2 returns a matrix of two resamples; indexing Carseats with
# it pools both (hence 482 training rows below, with duplicates, from a
# 400-row data set). A single 60/40 split would use times = 1.
train_index <- createDataPartition(Carseats$Sales, p = 0.6, list = FALSE, times = 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
dim(train)
[1] 482 11
tree.Carseats <- tree(Sales ~ ., data = train)
summary(tree.Carseats)
Regression tree:
tree(formula = Sales ~ ., data = train)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Age" "Income" "CompPrice"
[6] "Advertising"
Number of terminal nodes: 17
Residual mean deviance: 2.372 = 1103 / 465
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.20700 -1.14500 -0.01706 0.00000 1.05200 4.45200
plot(tree.Carseats)
text(tree.Carseats, pretty = 0, cex = .55)
pred.tree <- predict(tree.Carseats, newdata = test)
mse.tree <- mean((pred.tree - test$Sales)^2)
mse.tree
[1] 4.284189
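# Optional (added for interpretation): the RMSE expresses the error in the
# units of Sales (thousands of car seats sold); sqrt(4.28) is about 2.07.
sqrt(mse.tree)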
set.seed(997)
cv.carseats <- cv.tree(tree.Carseats, FUN = prune.tree)
cv.carseats
$size
[1] 17 16 15 14 13 12 11 9 8 7 5 4 3 2 1
$dev
[1] 1656.432 1734.245 1764.190 1774.177 1790.883 1790.883 1824.284 1831.200
[9] 2185.604 2185.604 2416.917 2412.573 2543.957 2815.409 3924.927
$k
[1] -Inf 46.46058 49.46359 50.39076 52.98134 53.58919
[7] 56.04805 64.42033 102.95100 102.97797 141.39331 154.54278
[13] 193.94041 418.38086 1118.65084
$method
[1] "deviance"
attr(,"class")
[1] "prune" "tree.sequence"
plot(cv.carseats$size , cv.carseats$dev, type = "b")
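# A short sketch (not in the original run): the size minimizing CV deviance
# can be read off programmatically. Here the minimum sits at the full tree
# (size 17); best = 5 below trades some deviance for a much simpler tree.
cv.carseats$size[which.min(cv.carseats$dev)]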
prune.carseats <- prune.tree(tree.Carseats , best = 5)
plot(prune.carseats)
text(prune.carseats , pretty = 0)
pred.tree.prune <- predict(prune.carseats, newdata = test)
# Use a distinct name so the unpruned-tree MSE above is not overwritten
mse.prune <- mean((pred.tree.prune - test$Sales)^2)
mse.prune
[1] 4.923
set.seed(997)
bag.Carseats <- randomForest(Sales ~ ., data = train, ntree = 500,
                             mtry = ncol(train) - 1, importance = TRUE)
bag.Carseats
Call:
randomForest(formula = Sales ~ ., data = train, ntree = 500, mtry = ncol(train) - 1, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 10
Mean of squared residuals: 1.245393
% Var explained: 84.67
install.packages("pacman")
Warning in install.packages :
package ‘pacman’ is in use and will not be installed
pred.carseats <- predict(bag.Carseats, newdata = test)
mse.carseats <- mean((pred.carseats - test$Sales)^2)
mse.carseats
[1] 2.255516
randomForest::importance(bag.Carseats)
%IncMSE IncNodePurity
CompPrice 55.264271 416.03406
Income 35.365201 228.65812
Advertising 44.294243 281.95318
Population 20.722038 104.81350
Price 111.812343 1093.21565
ShelveLoc 111.655615 1257.93758
Age 43.386355 307.35694
Education 27.637200 122.34371
Urban 7.459482 17.89376
US 11.484217 23.91682
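# Optionally (a sketch; varImpPlot() is part of randomForest): visualize the
# same importance table graphically.
varImpPlot(bag.Carseats, main = "Bagging: variable importance")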
set.seed(997)
rf.carseats <- randomForest(Sales ~ ., data = train, importance = TRUE)
rf.carseats
Call:
randomForest(formula = Sales ~ ., data = train, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 1.525753
% Var explained: 81.22
# predict.randomForest takes no mtry argument; mtry was fixed when the model was fit
pred.rfCarseats <- predict(rf.carseats, newdata = test)
mse.rf <- mean((pred.rfCarseats - test$Sales)^2)
mse.rf
[1] 2.329565
randomForest::importance(rf.carseats)
%IncMSE IncNodePurity
CompPrice 35.060236 351.43146
Income 28.020843 277.23553
Advertising 36.421151 313.80800
Population 21.151835 233.73415
Price 62.483767 951.51825
ShelveLoc 61.262379 990.05196
Age 36.129786 406.01624
Education 23.340310 181.89627
Urban 8.790943 33.74796
US 13.333512 55.10219
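# A minimal sketch (not from the original analysis): vary mtry and record
# the test MSE, to see how the number of variables tried at each split
# affects accuracy. With mtry = 10 this reproduces the bagging model above.
mtry.grid <- 1:10
mtry.mse <- sapply(mtry.grid, function(m) {
  set.seed(997)
  fit <- randomForest(Sales ~ ., data = train, mtry = m, ntree = 500)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})
plot(mtry.grid, mtry.mse, type = "b", xlab = "mtry", ylab = "Test MSE")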
# Separate the predictors from the response; wbart() expands factor
# predictors into dummy variables internally (hence p = 14 in the output
# below, versus 10 columns in x_train).
x_train <- train[, -which(names(train) == "Sales")]
x_test <- test[, -which(names(test) == "Sales")]
y_train <- train$Sales
set.seed(997)
bart_fit <- wbart(x.train = x_train, y.train = y_train, x.test = x_test)
*****Into main of wbart
*****Data:
data:n,p,np: 482, 14, 64
y1,yn: 1.948402, -1.611598
x1,x[n*p]: 138.000000, 1.000000
xp1,xp[np*p]: 113.000000, 1.000000
*****Number of Trees: 200
*****Number of Cut Points: 71 ... 1
*****burn and ndpost: 100, 1000
*****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.287616,3.000000,0.210871
*****sigma: 1.040455
*****w (weights): 1.000000 ... 1.000000
*****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
*****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
*****printevery: 100
*****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
MCMC
done 0 (out of 1100)
done 100 (out of 1100)
done 200 (out of 1100)
done 300 (out of 1100)
done 400 (out of 1100)
done 500 (out of 1100)
done 600 (out of 1100)
done 700 (out of 1100)
done 800 (out of 1100)
done 900 (out of 1100)
done 1000 (out of 1100)
time: 6s
check counts
trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
pred_bart <- colMeans(bart_fit$yhat.test)
mse_bart <- mean((pred_bart - test$Sales)^2)
mse_bart
[1] 1.649856
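# A summary sketch (all values reported above): collect the test MSEs so
# the methods can be compared side by side. BART gives the lowest test MSE.
results <- data.frame(
  Method  = c("Unpruned tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  TestMSE = c(mse.tree, mse.prune, mse.carseats, mse.rf, mse_bart)
)
results[order(results$TestMSE), ]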
data(OJ)
str(OJ)
'data.frame': 1070 obs. of 18 variables:
$ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
$ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
$ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
$ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
$ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
$ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
$ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
$ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
$ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
$ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
$ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
$ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
$ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
$ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
$ PctDiscMM : num 0 0.151 0 0 0 ...
$ PctDiscCH : num 0 0 0.0914 0 0 ...
$ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
$ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
set.seed(997)
train_index <- sample(1:nrow(OJ), 800)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
tree_OJ <- rpart(Purchase ~ ., data = train, method = "class",
                 control = rpart.control(minsplit = 10, cp = 0.01))
summary(tree_OJ)
Call:
rpart(formula = Purchase ~ ., data = train, method = "class",
control = rpart.control(minsplit = 10, cp = 0.01))
n= 800
CP nsplit rel error xerror xstd
1 0.47648903 0 1.0000000 1.0000000 0.04341424
2 0.03134796 1 0.5235110 0.5611285 0.03695189
3 0.02507837 3 0.4608150 0.4952978 0.03529884
4 0.01567398 4 0.4357367 0.4576803 0.03424756
5 0.01000000 5 0.4200627 0.4733542 0.03469566
Variable importance
LoyalCH PriceDiff StoreID SalePriceMM WeekofPurchase
47 8 8 6 5
PriceMM DiscMM PctDiscMM PriceCH ListPriceDiff
5 5 5 4 4
STORE SalePriceCH
1 1
Node number 1: 800 observations, complexity param=0.476489
predicted class=CH expected loss=0.39875 P(node) =1
class counts: 481 319
probabilities: 0.601 0.399
left son=2 (448 obs) right son=3 (352 obs)
Primary splits:
LoyalCH < 0.5036 to the right, improve=126.45590, (0 missing)
StoreID < 3.5 to the right, improve= 39.41901, (0 missing)
PriceDiff < 0.015 to the right, improve= 22.68647, (0 missing)
SalePriceMM < 2.04 to the right, improve= 19.96728, (0 missing)
Store7 splits as RL, improve= 17.61501, (0 missing)
Surrogate splits:
StoreID < 3.5 to the right, agree=0.639, adj=0.179, (0 split)
WeekofPurchase < 237.5 to the right, agree=0.614, adj=0.122, (0 split)
PriceCH < 1.825 to the right, agree=0.600, adj=0.091, (0 split)
PriceMM < 2.04 to the right, agree=0.596, adj=0.082, (0 split)
ListPriceDiff < 0.085 to the right, agree=0.586, adj=0.060, (0 split)
Node number 2: 448 observations, complexity param=0.02507837
predicted class=CH expected loss=0.1495536 P(node) =0.56
class counts: 381 67
probabilities: 0.850 0.150
left son=4 (420 obs) right son=5 (28 obs)
Primary splits:
PriceDiff < -0.39 to the right, improve=14.53601, (0 missing)
LoyalCH < 0.7645725 to the right, improve=13.57180, (0 missing)
DiscMM < 0.72 to the left, improve=10.57814, (0 missing)
SalePriceMM < 1.435 to the right, improve=10.57814, (0 missing)
PctDiscMM < 0.3342595 to the left, improve=10.57814, (0 missing)
Surrogate splits:
DiscMM < 0.72 to the left, agree=0.971, adj=0.536, (0 split)
SalePriceMM < 1.435 to the right, agree=0.971, adj=0.536, (0 split)
PctDiscMM < 0.3342595 to the left, agree=0.971, adj=0.536, (0 split)
SalePriceCH < 2.075 to the left, agree=0.944, adj=0.107, (0 split)
Node number 3: 352 observations, complexity param=0.03134796
predicted class=MM expected loss=0.2840909 P(node) =0.44
class counts: 100 252
probabilities: 0.284 0.716
left son=6 (178 obs) right son=7 (174 obs)
Primary splits:
LoyalCH < 0.2761415 to the right, improve=17.104590, (0 missing)
STORE < 1.5 to the left, improve= 6.014799, (0 missing)
StoreID < 3.5 to the right, improve= 5.962806, (0 missing)
PriceDiff < 0.215 to the right, improve= 5.514752, (0 missing)
Store7 splits as RL, improve= 4.357584, (0 missing)
Surrogate splits:
STORE < 1.5 to the left, agree=0.634, adj=0.259, (0 split)
StoreID < 3.5 to the right, agree=0.599, adj=0.190, (0 split)
PriceCH < 1.875 to the left, agree=0.585, adj=0.161, (0 split)
SalePriceCH < 1.775 to the left, agree=0.585, adj=0.161, (0 split)
SalePriceMM < 2.04 to the left, agree=0.577, adj=0.144, (0 split)
Node number 4: 420 observations
predicted class=CH expected loss=0.1166667 P(node) =0.525
class counts: 371 49
probabilities: 0.883 0.117
Node number 5: 28 observations, complexity param=0.01567398
predicted class=MM expected loss=0.3571429 P(node) =0.035
class counts: 10 18
probabilities: 0.357 0.643
left son=10 (5 obs) right son=11 (23 obs)
Primary splits:
LoyalCH < 0.9748025 to the right, improve=5.031056, (0 missing)
PriceCH < 2.04 to the right, improve=2.777143, (0 missing)
DiscMM < 0.47 to the left, improve=2.777143, (0 missing)
SalePriceMM < 1.64 to the right, improve=2.777143, (0 missing)
SalePriceCH < 2.04 to the right, improve=2.777143, (0 missing)
Node number 6: 178 observations, complexity param=0.03134796
predicted class=MM expected loss=0.4382022 P(node) =0.2225
class counts: 78 100
probabilities: 0.438 0.562
left son=12 (102 obs) right son=13 (76 obs)
Primary splits:
PriceDiff < 0.05 to the right, improve=12.206500, (0 missing)
SalePriceMM < 2.04 to the right, improve=11.423010, (0 missing)
WeekofPurchase < 230.5 to the right, improve= 5.154373, (0 missing)
ListPriceDiff < 0.035 to the right, improve= 4.904148, (0 missing)
PriceMM < 2.04 to the right, improve= 4.791957, (0 missing)
Surrogate splits:
SalePriceMM < 1.94 to the right, agree=0.938, adj=0.855, (0 split)
DiscMM < 0.08 to the left, agree=0.826, adj=0.592, (0 split)
PctDiscMM < 0.038887 to the left, agree=0.826, adj=0.592, (0 split)
PriceMM < 2.04 to the right, agree=0.747, adj=0.408, (0 split)
ListPriceDiff < 0.135 to the right, agree=0.747, adj=0.408, (0 split)
Node number 7: 174 observations
predicted class=MM expected loss=0.1264368 P(node) =0.2175
class counts: 22 152
probabilities: 0.126 0.874
Node number 10: 5 observations
predicted class=CH expected loss=0 P(node) =0.00625
class counts: 5 0
probabilities: 1.000 0.000
Node number 11: 23 observations
predicted class=MM expected loss=0.2173913 P(node) =0.02875
class counts: 5 18
probabilities: 0.217 0.783
Node number 12: 102 observations
predicted class=CH expected loss=0.4019608 P(node) =0.1275
class counts: 61 41
probabilities: 0.598 0.402
Node number 13: 76 observations
predicted class=MM expected loss=0.2236842 P(node) =0.095
class counts: 17 59
probabilities: 0.224 0.776
fancyRpartPlot(tree_OJ)
set.seed(997)
tree.OJ <- tree(Purchase ~., data = train)
summary(tree.OJ)
Classification tree:
tree(formula = Purchase ~ ., data = train)
Variables actually used in tree construction:
[1] "LoyalCH" "SalePriceMM" "PriceDiff"
Number of terminal nodes: 8
Residual mean deviance: 0.7751 = 613.9 / 792
Misclassification error rate: 0.1675 = 134 / 800
The classification tree was grown twice: once with rpart() and once with tree(). Both models identify brand loyalty (LoyalCH) as by far the most influential predictor, followed by price variables (PriceDiff, SalePriceMM). Store identifiers also carry comparable weight in the rpart model, which points toward customer demographics and in-store display effects and would need further investigation.
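# A quick check behind the statement above (variable.importance is stored
# on fitted rpart objects):
sort(tree_OJ$variable.importance, decreasing = TRUE)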
tree.OJ
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 800 1076.000 CH ( 0.60125 0.39875 )
2) LoyalCH < 0.574494 383 473.100 MM ( 0.30809 0.69191 )
4) LoyalCH < 0.0616725 76 10.650 MM ( 0.01316 0.98684 ) *
5) LoyalCH > 0.0616725 307 408.100 MM ( 0.38111 0.61889 )
10) SalePriceMM < 2.04 174 203.000 MM ( 0.27011 0.72989 ) *
11) SalePriceMM > 2.04 133 184.000 CH ( 0.52632 0.47368 )
22) LoyalCH < 0.277977 45 52.190 MM ( 0.26667 0.73333 ) *
23) LoyalCH > 0.277977 88 112.900 CH ( 0.65909 0.34091 ) *
3) LoyalCH > 0.574494 417 321.400 CH ( 0.87050 0.12950 )
6) LoyalCH < 0.764572 161 186.900 CH ( 0.73292 0.26708 )
12) PriceDiff < -0.065 38 50.020 MM ( 0.36842 0.63158 ) *
13) PriceDiff > -0.065 123 105.900 CH ( 0.84553 0.15447 )
26) PriceDiff < 0.31 79 84.790 CH ( 0.77215 0.22785 ) *
27) PriceDiff > 0.31 44 9.545 CH ( 0.97727 0.02273 ) *
7) LoyalCH > 0.764572 256 90.760 CH ( 0.95703 0.04297 ) *
plot(tree.OJ)
text(tree.OJ, pretty = 3, cex = 0.7)
# Use predict() to obtain class predictions on the test data
tree.preds <- predict(tree.OJ, test, type = "class")
# Build a confusion matrix
confusion <- table(tree.preds, test$Purchase)
confusion
tree.preds CH MM
CH 147 28
MM 25 70
misclassification_rate <- 1 - sum(diag(confusion)) / sum(confusion)
print(misclassification_rate)
[1] 0.1962963
accuracy_rate <- sum(diag(confusion)) / sum(confusion)
print(accuracy_rate)
[1] 0.8037037
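# Optionally (a sketch using caret, loaded above): confusionMatrix() reports
# accuracy, sensitivity, and specificity in a single call.
caret::confusionMatrix(tree.preds, test$Purchase)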
set.seed(997)
cv.oj <- cv.tree(tree.OJ, FUN = prune.misclass)
cv.oj
$size
[1] 8 7 5 2 1
$dev
[1] 158 158 173 179 319
$k
[1] -Inf 0.000000 5.000000 9.333333 147.000000
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
plot(cv.oj$size, cv.oj$dev, type = "b",
xlab = "Tree Size", ylab = "Cross-Validated Error",
main = "Optimal Tree Size")
optimal_size <- cv.oj$size[which.min(cv.oj$dev)]
cat("Optimal Tree Size:", optimal_size, "\n")
Optimal Tree Size: 8
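# Sizes 8 and 7 tie at the minimum CV error (dev = 158); which.min() returns
# the first. A sketch that prefers the smaller of the tied trees:
min(cv.oj$size[cv.oj$dev == min(cv.oj$dev)])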
prune.oj7 <- prune.misclass(tree.OJ, best = 7)
plot(prune.oj7)
text(prune.oj7, pretty = 0)
# best = 8 is the full tree, so this "pruned" fit is identical to tree.OJ
prune.oj8 <- prune.misclass(tree.OJ, best = 8)
plot(prune.oj8)
text(prune.oj8, pretty = 0)
prune.oj5 <- prune.misclass(tree.OJ, best = 5)
plot(prune.oj5)
text(prune.oj5, pretty = 0)
summary(prune.oj8)
Classification tree:
tree(formula = Purchase ~ ., data = train)
Variables actually used in tree construction:
[1] "LoyalCH" "SalePriceMM" "PriceDiff"
Number of terminal nodes: 8
Residual mean deviance: 0.7751 = 613.9 / 792
Misclassification error rate: 0.1675 = 134 / 800
summary(prune.oj5)
Classification tree:
snip.tree(tree = tree.OJ, nodes = 3L)
Variables actually used in tree construction:
[1] "LoyalCH" "SalePriceMM"
Number of terminal nodes: 5
Residual mean deviance: 0.8808 = 700.2 / 795
Misclassification error rate: 0.18 = 144 / 800
prune.oj.preds <- predict(prune.oj5, newdata = test, type = "class")
confusion2 <- table(prune.oj.preds, test$Purchase)
confusion2
prune.oj.preds CH MM
CH 153 32
MM 19 66
misclassification_rate2 <- 1 - sum(diag(confusion2)) / sum(confusion2)
print(misclassification_rate2)
[1] 0.1888889
print(misclassification_rate)
[1] 0.1962963
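# The pruned 5-node tree achieves a slightly lower test error (0.189 vs.
# 0.196) than the unpruned 8-node tree despite a higher training error
# (0.180 vs. 0.168), consistent with pruning reducing variance.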