Use the datasets from your HW 9. Go to https://www.kaggle.com/datasets and identify two datasets, one with a quantitative response and the other with a qualitative response. If you can find one dataset with both types of responses, that will work as well. Your objective is to predict these responses using ensemble tree-based methods.
setwd("C:/Users/Sam/Documents/MATH_624/Module_13")
library(tree)
library(tidyverse)
library(randomForest)
library(gbm)
library(C50)
set.seed(1)
trans <- read.csv("Transactions.csv", stringsAsFactors = FALSE, fileEncoding = "UTF-8-BOM")  # UTF-8-BOM keeps the first column name from being read as "ï..transaction_id"
trans <- trans[1:13]
trans$order_status <- as.factor(trans$order_status)
trans$brand <- as.factor(trans$brand)
trans$online_order <- as.factor(trans$online_order)
trans$product_line <- as.factor(trans$product_line)
trans$product_class <- as.factor(trans$product_class)
trans$product_size <- as.factor(trans$product_size)
trans$standard_cost <- gsub("\\$", "", as.character(trans$standard_cost))
trans$standard_cost <- gsub(",", "", as.character(trans$standard_cost))
trans$standard_cost <- as.numeric(trans$standard_cost)
trans <- subset(trans, select = -c(transaction_date))
# Drop rows with blank factor values, then drop the now-unused factor levels
trans <- trans %>%
  filter(online_order != "",
         order_status != "",
         brand != "",
         product_line != "",
         product_class != "",
         product_size != "") %>%
  droplevels()
trans2 <- filter(trans, product_id !=0)
trans2 = na.omit(trans2)
trans3 = trans2
l_list_price = log(trans3$list_price)
trans3 = data.frame(trans3, l_list_price)
trans3= subset(trans3, select = -list_price)
train = sample(18288,9144)
trainr.dat = trans3[train,]
testr.dat = trans3[-train,]
tree.transr = tree(l_list_price ~ ., data = trainr.dat)
cv.transr = cv.tree(tree.transr)
plot(cv.transr$size, cv.transr$dev, type = 'b')
prune.transr = prune.tree(tree.transr, best = 6)
yhatr = predict(prune.transr, newdata = testr.dat)
mean((yhatr - testr.dat$l_list_price)^2)
## [1] 0.2892857
set.seed(1)
bag.transr = randomForest(l_list_price ~ ., data = trainr.dat, mtry = 11, importance = TRUE)  # mtry = number of predictors (11), i.e. bagging
bag.transr
##
## Call:
## randomForest(formula = l_list_price ~ ., data = trainr.dat, mtry = 11, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 11
##
## Mean of squared residuals: 1.374081e-05
## % Var explained: 100
yhatr.bag = predict(bag.transr, newdata = testr.dat)
mean((yhatr.bag - testr.dat$l_list_price)^2)
## [1] 3.359279e-06
importance(bag.transr)
## %IncMSE IncNodePurity
## transaction_id -2.8244257 5.386217e-02
## product_id 94.7530360 8.625917e+02
## customer_id -2.8476124 5.870063e-02
## online_order -0.1367930 2.988267e-03
## order_status -0.4642734 4.827332e-04
## brand 58.9696322 4.987925e+02
## product_line 19.7880262 6.608854e+01
## product_class 21.1357054 8.339317e+01
## product_size 49.2106916 4.020066e+02
## standard_cost 397.3422577 4.417199e+03
## product_first_sold_date 15.5163616 2.056048e+01
varImpPlot(bag.transr)
Standard_cost appears at the top of the variable importance plot, indicating it is the most important feature for the response l_list_price. Two variable importance measures are reported: %IncMSE, the mean decrease in prediction accuracy (increase in MSE) when the variable is permuted, and IncNodePurity, the total decrease in node impurity from splits on that variable.
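As a quick cross-check of the plot, the importance matrix can be sorted numerically. A minimal sketch using the bag.transr fit above (for a regression forest, importance() returns the columns "%IncMSE" and "IncNodePurity"):
# Sort the permutation-based importance (%IncMSE) in decreasing order
imp.bag <- importance(bag.transr)
imp.bag[order(imp.bag[, "%IncMSE"], decreasing = TRUE), ]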
set.seed(1)
rf.transr = randomForest(l_list_price ~ ., data = trainr.dat, mtry = 6, importance = TRUE)
rf.transr
##
## Call:
## randomForest(formula = l_list_price ~ ., data = trainr.dat, mtry = 6, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 9.316129e-05
## % Var explained: 99.99
yhatr.rf = predict(rf.transr, newdata = testr.dat)
mean((yhatr.rf - testr.dat$l_list_price)^2)
## [1] 5.398718e-05
importance(rf.transr)
## %IncMSE IncNodePurity
## transaction_id 1.1441862 2.972905e+00
## product_id 47.9937983 9.891033e+02
## customer_id 0.2076332 3.035679e+00
## online_order 0.2140625 1.677771e-01
## order_status -1.0014045 1.442016e-02
## brand 33.0146673 5.855167e+02
## product_line 18.5550722 1.682456e+02
## product_class 26.7636413 2.719050e+02
## product_size 29.6581888 3.975105e+02
## standard_cost 143.2796909 3.796056e+03
## product_first_sold_date 14.0052313 1.442398e+02
varImpPlot(rf.transr)
As with bagging, standard_cost appears at the top of the variable importance plot and is the most important feature for the response l_list_price; the same two importance measures (%IncMSE and IncNodePurity) are reported.
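The value mtry = 6 is one choice for the number of variables tried at each split; it could also be tuned by comparing test MSE over a small grid. A minimal sketch, assuming trainr.dat and testr.dat as built above (refitting several forests, so it may be slow):
# Compare test MSE for a few candidate values of mtry
set.seed(1)
mtry.grid <- c(3, 6, 9, 11)
mtry.mse <- sapply(mtry.grid, function(m) {
  fit <- randomForest(l_list_price ~ ., data = trainr.dat, mtry = m, ntree = 200)
  mean((predict(fit, newdata = testr.dat) - testr.dat$l_list_price)^2)
})
data.frame(mtry = mtry.grid, test.mse = mtry.mse)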
set.seed(1)
boost.transr = gbm(l_list_price ~ ., data = trainr.dat, distribution = "gaussian", n.trees = 500, interaction.depth = 4)
summary(boost.transr)
## var rel.inf
## standard_cost standard_cost 68.408663285
## product_id product_id 15.250380696
## product_size product_size 5.187241340
## brand brand 5.170193574
## product_line product_line 4.131984897
## product_class product_class 1.412360389
## product_first_sold_date product_first_sold_date 0.433123875
## customer_id customer_id 0.003939817
## transaction_id transaction_id 0.002112128
## online_order online_order 0.000000000
## order_status order_status 0.000000000
yhatr.boost = predict(boost.transr, newdata = testr.dat, n.trees = 500)
mean((yhatr.boost - testr.dat$l_list_price)^2)
## [1] 0.001325263
Boosting identifies the same features (standard_cost and product_id) as the most important for the response l_list_price. On test MSE, boosting (0.00133) performs far better than the single pruned tree (0.289), but for this data it does not beat bagging (3.36e-06) or the random forest (5.40e-05).
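Collecting the four test MSEs in one table makes the comparison explicit. A minimal sketch, assuming the predictions computed above are still in the workspace:
# Assemble test MSEs for the regression models fit above
data.frame(
  method   = c("pruned tree", "bagging", "random forest", "boosting"),
  test.mse = c(mean((yhatr       - testr.dat$l_list_price)^2),
               mean((yhatr.bag   - testr.dat$l_list_price)^2),
               mean((yhatr.rf    - testr.dat$l_list_price)^2),
               mean((yhatr.boost - testr.dat$l_list_price)^2))
)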
train = sample(18288,9144)
trainc.dat = trans2[train,]
testc.dat = trans2[-train,]
tree.transc = tree(brand ~ ., trainc.dat)
cv.transc = cv.tree(tree.transc, FUN = prune.misclass)
plot(cv.transc$size, cv.transc$dev, type = "b")
prune.transc = prune.misclass(tree.transc, best = 5)
pruned.tree.pred = predict(prune.transc, testc.dat, type = "class")
table(pruned.tree.pred, (testc.dat$brand))
##
## pruned.tree.pred Giant Bicycles Norco Bicycles OHM Cycles Solex Trek Bicycles
## Giant Bicycles 1387 674 657 848 471
## Norco Bicycles 0 230 0 33 0
## OHM Cycles 0 0 578 497 0
## Solex 0 341 0 569 595
## Trek Bicycles 0 0 117 0 259
## WeareA2B 157 64 0 0 72
##
## pruned.tree.pred WeareA2B
## Giant Bicycles 477
## Norco Bicycles 180
## OHM Cycles 432
## Solex 92
## Trek Bicycles 0
## WeareA2B 414
1-mean(pruned.tree.pred == as.factor(testc.dat$brand))
## [1] 0.6241251
set.seed(1)
bag.transc = randomForest(brand ~ ., data = trans2, mtry = 8, importance = TRUE)
bag.transc
##
## Call:
## randomForest(formula = brand ~ ., data = trans2, mtry = 8, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Giant Bicycles Norco Bicycles OHM Cycles Solex Trek Bicycles
## Giant Bicycles 3143 0 0 0 0
## Norco Bicycles 0 2625 0 0 0
## OHM Cycles 0 0 2753 0 0
## Solex 0 0 0 3896 0
## Trek Bicycles 0 0 0 0 2719
## WeareA2B 0 0 0 0 0
## WeareA2B class.error
## Giant Bicycles 0 0
## Norco Bicycles 0 0
## OHM Cycles 0 0
## Solex 0 0
## Trek Bicycles 0 0
## WeareA2B 3152 0
varImpPlot(bag.transc)
Standard_cost and list_price appear at the top of the variable importance plots, indicating they are the most important features for the response brand. Two variable importance measures are reported: the mean decrease in prediction accuracy when the variable is permuted (MeanDecreaseAccuracy) and the mean decrease in node impurity as measured by the Gini index (MeanDecreaseGini).
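The numbers behind the plot can also be printed and sorted. A minimal sketch using the bag.transc fit above (for a classification forest, importance() includes the columns "MeanDecreaseAccuracy" and "MeanDecreaseGini"):
# Sort classification importance by mean decrease in Gini impurity
imp.c <- importance(bag.transc)
imp.c[order(imp.c[, "MeanDecreaseGini"], decreasing = TRUE),
      c("MeanDecreaseAccuracy", "MeanDecreaseGini")]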
set.seed(1)
rf.transc = randomForest(brand ~ ., data = trans2, mtry = 3, importance = TRUE)
rf.transc
##
## Call:
## randomForest(formula = brand ~ ., data = trans2, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Giant Bicycles Norco Bicycles OHM Cycles Solex Trek Bicycles
## Giant Bicycles 3143 0 0 0 0
## Norco Bicycles 0 2625 0 0 0
## OHM Cycles 0 0 2753 0 0
## Solex 0 0 0 3896 0
## Trek Bicycles 0 0 0 0 2719
## WeareA2B 0 0 0 0 0
## WeareA2B class.error
## Giant Bicycles 0 0
## Norco Bicycles 0 0
## OHM Cycles 0 0
## Solex 0 0
## Trek Bicycles 0 0
## WeareA2B 3152 0
varImpPlot(rf.transc)
Standard_cost and list_price again appear at the top of the variable importance plots, indicating they are the most important features for the response brand, using the same two measures (mean decrease in accuracy and mean decrease in Gini impurity).
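The bagged and random forest classifiers above were fit on all of trans2 and judged only by OOB error. To put them on the same footing as the single tree and C5.0, the forest can be refit on trainc.dat and scored on testc.dat. A minimal sketch (mtry = 3 mirrors the fit above):
# Refit on the training split and compute the test misclassification rate
set.seed(1)
rf.transc.tr <- randomForest(brand ~ ., data = trainc.dat, mtry = 3, importance = TRUE)
rf.pred <- predict(rf.transc.tr, newdata = testc.dat)
mean(rf.pred != testc.dat$brand)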
trainc.dat2 <- subset(trainc.dat, select = -c(brand))
testc.dat2 <- subset(testc.dat, select = -c(brand))
boost.transc <- C5.0(trainc.dat2, trainc.dat$brand, trials = 5)
pred <- predict(boost.transc, testc.dat2)
table(pred, testc.dat$brand)
##
## pred Giant Bicycles Norco Bicycles OHM Cycles Solex Trek Bicycles
## Giant Bicycles 1544 0 0 0 0
## Norco Bicycles 0 1309 0 0 0
## OHM Cycles 0 0 1352 0 0
## Solex 0 0 0 1947 0
## Trek Bicycles 0 0 0 0 1397
## WeareA2B 0 0 0 0 0
##
## pred WeareA2B
## Giant Bicycles 0
## Norco Bicycles 0
## OHM Cycles 0
## Solex 0
## Trek Bicycles 0
## WeareA2B 1595
1-mean(pred == testc.dat$brand)
## [1] 0
For this data, boosting with C5.0 performs as well as bagging and random forests, all achieving an error rate of zero, while the single pruned tree had an error rate of 0.6241. In this instance any of the ensemble methods would be preferred over the single tree. With different data the results could differ, and one method might emerge as clearly superior rather than ending in a three-way tie.
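As with the regression models, the classification test error rates can be gathered into one table. A minimal sketch, assuming pruned.tree.pred and pred from above are still in the workspace:
# Test error rates for the single pruned tree and the boosted C5.0 model
data.frame(
  method     = c("pruned tree", "C5.0 boosting"),
  test.error = c(mean(pruned.tree.pred != testc.dat$brand),
                 mean(pred != testc.dat$brand))
)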