Go to https://www.kaggle.com/datasets and identify two datasets, one with a quantitative response and the other with a qualitative response. If you can find one dataset having both types of responses, that will work as well.

Your objective is to predict these responses using tree-based methods.

# ---- Data pre-processing ----
# NOTE(review): this directory is specific to the original author's machine;
# change `data_dir` before running the script anywhere else.
data_dir <- "C:/Users/Sam/Documents/MATH_624/Module_12"
setwd(data_dir)
library(tree)
library(tidyverse)
set.seed(3)

# Read the raw transactions and keep only the first 13 columns.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
trans <- read.csv(
  file.path(data_dir, "Transactions.csv"),
  stringsAsFactors = FALSE
)
trans <- trans[1:13]

# Store the categorical columns as factors so that lm() and tree() treat them
# as qualitative variables.
factor_cols <- c(
  "order_status", "brand", "product_line", "product_class", "product_size"
)
trans[factor_cols] <- lapply(trans[factor_cols], as.factor)

# Strip "$" and "," from standard_cost and convert it to numeric
# (presumably the raw values are currency-formatted text -- verify in the CSV).
trans$standard_cost <- as.numeric(
  gsub("[$,]", "", as.character(trans$standard_cost))
)

# Drop the transaction date, remove rows whose product_id is 0, and discard
# any row that still contains a missing value.
trans <- subset(trans, select = -c(transaction_date))
trans2 <- dplyr::filter(trans, product_id != 0)
trans2 <- na.omit(trans2)

# trans2 feeds the classification tree; trans3 is the copy that is modified
# for the regression problem.
trans3 <- trans2

Do the following for this homework assignment:

1. (15 pts) Construct an optimal regression tree to predict the quantitative response. Summarize the tree-building steps relating to your dataset. Discuss the performance of the regression tree compared to that of a multiple linear regression model. Point out how you are addressing the over-fitting issue. Attach data pre-processing steps as an appendix. You can add the steps/results contributing to your discussion in the main report.

# Model the response on the log scale: append log(list_price) as the last
# column and then remove the untransformed list_price column.
l_list_price <- log(trans3$list_price)
trans3 <- data.frame(trans3, l_list_price)
keep_cols <- setdiff(names(trans3), "list_price")
trans3 <- trans3[, keep_cols, drop = FALSE]

# Split the rows 50/50 into a training set and a test set. The sizes are
# derived from the data instead of being hard-coded (previously 18288 / 9144),
# so the sampled indices stay valid if pre-processing ever yields a different
# number of rows.
n.obs <- nrow(trans3)
train <- sample(n.obs, n.obs %/% 2)
trainr.dat <- trans3[train, ]
testr.dat <- trans3[-train, ]
# Baseline model: multiple linear regression of log(list_price) on every other
# column of the training data.
Reg.mod.1 <- lm(l_list_price ~ ., data = trainr.dat)

# Test MSE = mean of the squared prediction errors. The square has to be taken
# inside mean(); squaring the mean residual instead gives a number close to
# zero that cannot be compared with the tree's test MSE.
lm.pred <- predict(Reg.mod.1, newdata = testr.dat, type = "response")
test.error <- mean((testr.dat$l_list_price - lm.pred)^2)
test.error
## NOTE: re-run to refresh this output. The value recorded previously,
## [1] 2.121333e-06, was the squared mean residual, not the test MSE.
# Grow the unpruned regression tree for log(list_price) on the training rows
# and print its summary (variables used, leaf count, residual deviance).
tree.transr <- tree(formula = l_list_price ~ ., data = trainr.dat)
summary(tree.transr)
## 
## Regression tree:
## tree(formula = l_list_price ~ ., data = trainr.dat)
## Variables actually used in tree construction:
## [1] "standard_cost" "product_id"    "brand"         "product_size" 
## Number of terminal nodes:  15 
## Residual mean deviance:  0.09775 = 892.3 / 9129 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.170000 -0.168400  0.006286  0.000000  0.176500  1.199000
# Draw the unpruned regression tree, then label its splits and leaves;
# pretty = 0 is passed so factor level names are printed in full.
plot(tree.transr)
text(tree.transr, pretty = 0)

# Cross-validate the cost-complexity pruning sequence of the regression tree
# (cv.tree defaults) to guard against over-fitting.
cv.transr <- cv.tree(tree.transr)
names(cv.transr)
## [1] "size"   "dev"    "k"      "method"

# Plot CV deviance against tree size and against the cost-complexity
# parameter k side by side. The previous graphics settings are restored
# afterwards so later plots are not squeezed into a half-width panel.
old.par <- par(mfrow = c(1, 2))
plot(cv.transr$size, cv.transr$dev, type = "b")
plot(cv.transr$k, cv.transr$dev, type = "b")
par(old.par)

cv.transr
## $size
##  [1] 15 14 13 12 11  8  7  5  4  3  2  1
## 
## $dev
##  [1]  895.2851  987.2533 1065.2683 1334.8920 1334.8920 1923.6330 2208.0706
##  [8] 3694.2588 3807.8291 3807.8291 4661.0320 6149.0506
## 
## $k
##  [1]       -Inf   77.18865   92.11283  131.41132  139.54705  196.48120
##  [7]  284.29819  381.04753  417.12651  420.51432  853.11597 1488.91126
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# Prune to the size that minimises the cross-validated deviance instead of a
# hard-coded size. The earlier `best = 6` is not in the CV size sequence, and
# the recorded CV deviance is lowest for the largest tree. If several sizes
# tie for the minimum, the smallest of them is used.
cv.min.dev <- min(cv.transr$dev)
best.size.r <- min(cv.transr$size[cv.transr$dev == cv.min.dev])
prune.transr <- prune.tree(tree.transr, best = best.size.r)

# A tree with a single node has no splits to draw, so only plot real trees.
if (best.size.r > 1) {
  plot(prune.transr)
  text(prune.transr, pretty = 0)
}

# Test MSE of the selected tree on the held-out rows.
yhat <- predict(prune.transr, newdata = testr.dat)
mean((yhat - testr.dat$l_list_price)^2)
## NOTE: re-run to refresh this output. The value recorded previously,
## [1] 0.2510381, was obtained with the hard-coded best = 6.

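The reported test errors do not support the regression tree outperforming the multiple regression model on this data set: the pruned tree's test MSE is 0.251, whereas the value reported for the linear model is 2.12e-06. The linear-model figure should be verified as a mean of squared residuals (rather than a squared mean residual) before a final conclusion is drawn.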

2. (15 pts) Construct an optimal classification tree to predict the qualitative response. Summarize the tree building steps relating to your dataset. Discuss the performance of the classification tree compared to that of a non-tree based classification method suitable for your response. Attach data preprocessing steps as an appendix. You can add the steps/results contributing to your discussion in the main report.

# Draw a fresh 50/50 train/test split for the classification problem (this
# overwrites the `train` index vector used for the regression split). The
# sizes come from the data rather than the hard-coded 18288 / 9144, so the
# indices stay valid if the number of rows changes.
n.obs.c <- nrow(trans2)
train <- sample(n.obs.c, n.obs.c %/% 2)
trainc.dat <- trans2[train, ]
testc.dat <- trans2[-train, ]
# Grow the unpruned classification tree for brand on the training rows.
tree.transc <- tree(formula = as.factor(brand) ~ ., data = trainc.dat)

# Predict the brand of every test row and cross-tabulate the predictions
# (rows) against the observed brands (columns).
tree.pred <- predict(tree.transc, newdata = testc.dat, type = "class")
table(tree.pred, as.factor(testc.dat$brand))
##                 
## tree.pred             Giant Bicycles Norco Bicycles OHM Cycles Solex
##                     0              0              0          0     0
##   Giant Bicycles    0           1395              0        162     0
##   Norco Bicycles    0              0            871          0   233
##   OHM Cycles        0              0              0       1101     0
##   Solex             0            179            342          0  1524
##   Trek Bicycles     0              0              0          0    20
##   WeareA2B          0              0             97        100   147
##                 
## tree.pred        Trek Bicycles WeareA2B
##                              0        0
##   Giant Bicycles            70        0
##   Norco Bicycles             0      265
##   OHM Cycles                 0       59
##   Solex                    301        0
##   Trek Bicycles           1003        0
##   WeareA2B                   0     1275
# Test misclassification rate of the unpruned tree: one minus the share of
# test rows whose predicted brand equals the observed brand.
tree.accuracy <- mean(tree.pred == as.factor(testc.dat$brand))
1 - tree.accuracy
## [1] 0.2159886
# Cross-validate the pruning sequence of the classification tree, using the
# misclassification count (prune.misclass) as the pruning criterion.
cv.transc <- cv.tree(tree.transc, FUN = prune.misclass)
names(cv.transc)
## [1] "size"   "dev"    "k"      "method"

# Plot the CV error against tree size and against the cost-complexity
# parameter k side by side. The previous graphics settings are restored
# afterwards so later plots are not squeezed into a half-width panel.
old.par <- par(mfrow = c(1, 2))
plot(cv.transc$size, cv.transc$dev, type = "b")
plot(cv.transc$k, cv.transc$dev, type = "b")
par(old.par)

# Prune to the size with the lowest cross-validated error instead of the
# hard-coded best = 4, which raised the recorded test error from about 0.22
# (unpruned) to about 0.67. If several sizes tie for the minimum, the
# smallest of them is used.
# NOTE: the prediction table and error rate recorded after this step were
# produced with best = 4 and must be regenerated.
cv.min.err <- min(cv.transc$dev)
best.size.c <- min(cv.transc$size[cv.transc$dev == cv.min.err])
prune.transc <- prune.misclass(tree.transc, best = best.size.c)

# A tree with a single node has no splits to draw, so only plot real trees.
if (best.size.c > 1) {
  plot(prune.transc)
  text(prune.transc, pretty = 0)
}

# Predict test-set brands with the pruned tree and cross-tabulate the
# predictions (rows) against the observed brands (columns).
pruned.tree.predc <- predict(
  prune.transc,
  newdata = testc.dat,
  type = "class"
)
table(pruned.tree.predc, as.factor(testc.dat$brand))
##                  
## pruned.tree.predc      Giant Bicycles Norco Bicycles OHM Cycles Solex
##                      0              0              0          0     0
##    Giant Bicycles    0           1398            680        656   840
##    Norco Bicycles    0              0              0          0     0
##    OHM Cycles        0              0              0        498   365
##    Solex             0              0              0          0     0
##    Trek Bicycles     0            176            630        109   623
##    WeareA2B          0              0              0        100    96
##                  
## pruned.tree.predc Trek Bicycles WeareA2B
##                               0        0
##    Giant Bicycles           482      495
##    Norco Bicycles             0        0
##    OHM Cycles                 0      145
##    Solex                      0        0
##    Trek Bicycles            892      693
##    WeareA2B                   0      266
# Test misclassification rate of the pruned tree: one minus the share of test
# rows whose predicted brand equals the observed brand.
pruned.accuracy <- mean(pruned.tree.predc == as.factor(testc.dat$brand))
1 - pruned.accuracy
## [1] 0.6660105

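The reported results do not show the pruned tree performing better: the full (unpruned) classification tree has a test misclassification rate of about 21.6%, whereas the tree pruned with best = 4 has a rate of about 66.6%, so the full tree performs much better on the test set.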