Go to https://www.kaggle.com/datasets and identify two datasets, one with a quantitative response and the other with a qualitative response. A single dataset containing both types of responses will work as well.
Your objective is to predict these responses using tree-based methods.
setwd("C:/Users/Sam/Documents/MATH_624/Module_12")
library(tree)
library(tidyverse)
set.seed(3)
trans <- read.csv("C:/Users/Sam/Documents/MATH_624/Module_12/Transactions.csv", stringsAsFactors = F)
trans <- trans[1:13]
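Before converting anything, it helps to confirm which columns are quantitative and which are qualitative; a quick structural check (output omitted):

str(trans)            # column types at a glance
sapply(trans, class)  # numeric columns are candidates for the quantitative response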
# Convert the categorical columns to factors
trans$order_status <- as.factor(trans$order_status)
trans$brand <- as.factor(trans$brand)
trans$product_line <- as.factor(trans$product_line)
trans$product_class <- as.factor(trans$product_class)
trans$product_size <- as.factor(trans$product_size)
# standard_cost contains dollar signs and thousands separators; strip both
# and convert to numeric
trans$standard_cost <- gsub("\\$", "", as.character(trans$standard_cost))
trans$standard_cost <- gsub(",", "", as.character(trans$standard_cost))
trans$standard_cost <- as.numeric(trans$standard_cost)
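Since readr is loaded as part of the tidyverse, the same cleanup could be done in one step; a minimal alternative, left commented so it does not rerun:

# equivalent one-liner: parse_number() strips currency symbols and grouping marks
# trans$standard_cost <- readr::parse_number(as.character(trans$standard_cost))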
# Drop the raw transaction date, rows with product_id == 0, and remaining NAs
trans <- subset(trans, select = -c(transaction_date))
trans2 <- filter(trans, product_id != 0)
trans2 <- na.omit(trans2)
# Use log(list_price) as the quantitative response and drop the raw column
trans3 <- trans2
l_list_price <- log(trans3$list_price)
trans3 <- data.frame(trans3, l_list_price)
trans3 <- subset(trans3, select = -list_price)
# 50/50 train/test split; the cleaned data has 18288 rows
train <- sample(18288, 9144)
trainr.dat <- trans3[train, ]
testr.dat <- trans3[-train, ]
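Hardcoding the row count is fragile if the cleaning steps above change; a sketch of an equivalent split that derives the size from the data, left commented so the split above is not resampled:

# n <- nrow(trans3)          # 18288 after the cleaning above
# train <- sample(n, n %/% 2)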
# Baseline model: multiple linear regression on all predictors
Reg.mod.1 <- lm(l_list_price ~ ., data = trainr.dat)
# Test MSE: square the residuals before averaging (note the parenthesization)
test.error <- mean((testr.dat$l_list_price - predict(Reg.mod.1, newdata = testr.dat))^2)
test.error
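Because the response is log(list_price), the square root of the test MSE can be read as a typical multiplicative error on the original price scale; a quick check:

# exp(RMSE) is roughly the typical multiplicative prediction error for list_price
exp(sqrt(test.error))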
# Fit an unpruned regression tree to the training data
tree.transr <- tree(l_list_price ~ ., data = trainr.dat)
summary(tree.transr)
##
## Regression tree:
## tree(formula = l_list_price ~ ., data = trainr.dat)
## Variables actually used in tree construction:
## [1] "standard_cost" "product_id" "brand" "product_size"
## Number of terminal nodes: 15
## Residual mean deviance: 0.09775 = 892.3 / 9129
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.170000 -0.168400 0.006286 0.000000 0.176500 1.199000
plot(tree.transr)
text(tree.transr, pretty = 0)
# Cost-complexity cross-validation to choose the tree size
cv.transr <- cv.tree(tree.transr)
names(cv.transr)
## [1] "size" "dev" "k" "method"
par(mfrow = c(1,2))
plot(cv.transr$size, cv.transr$dev, type = "b")
plot(cv.transr$k, cv.transr$dev, type = "b")
cv.transr
## $size
## [1] 15 14 13 12 11 8 7 5 4 3 2 1
##
## $dev
## [1] 895.2851 987.2533 1065.2683 1334.8920 1334.8920 1923.6330 2208.0706
## [8] 3694.2588 3807.8291 3807.8291 4661.0320 6149.0506
##
## $k
## [1] -Inf 77.18865 92.11283 131.41132 139.54705 196.48120
## [7] 284.29819 381.04753 417.12651 420.51432 853.11597 1488.91126
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Prune to 6 terminal nodes for interpretability
prune.transr <- prune.tree(tree.transr, best = 6)
plot(prune.transr)
text(prune.transr, pretty = 0)
# Test MSE of the pruned regression tree
yhat <- predict(prune.transr, newdata = testr.dat)
mean((yhat - testr.dat$l_list_price)^2)
## [1] 0.2510381
The regression tree performed better than the multiple linear regression model in terms of the test prediction error for this data set.
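Since cross-validation favored the unpruned 15-node tree, its test MSE is also worth checking against the pruned tree's 0.251; a quick check (yhat.full is a name introduced here, value not shown):

# test MSE of the full, unpruned regression tree
yhat.full <- predict(tree.transr, newdata = testr.dat)
mean((yhat.full - testr.dat$l_list_price)^2)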
# Fresh 50/50 split for the classification task; the response is brand
train <- sample(18288, 9144)
trainc.dat <- trans2[train, ]
testc.dat <- trans2[-train, ]
# brand was already converted to a factor above, so it can be used directly
tree.transc <- tree(brand ~ ., data = trainc.dat)
tree.pred <- predict(tree.transc, testc.dat, type = "class")
table(tree.pred, testc.dat$brand)
##
## tree.pred Giant Bicycles Norco Bicycles OHM Cycles Solex
## 0 0 0 0 0
## Giant Bicycles 0 1395 0 162 0
## Norco Bicycles 0 0 871 0 233
## OHM Cycles 0 0 0 1101 0
## Solex 0 179 342 0 1524
## Trek Bicycles 0 0 0 0 20
## WeareA2B 0 0 97 100 147
##
## tree.pred Trek Bicycles WeareA2B
## 0 0
## Giant Bicycles 70 0
## Norco Bicycles 0 265
## OHM Cycles 0 59
## Solex 301 0
## Trek Bicycles 1003 0
## WeareA2B 0 1275
# Test misclassification error of the full classification tree
1 - mean(tree.pred == testc.dat$brand)
## [1] 0.2159886
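For context, a sketch of the naive baseline that always predicts the training set's most frequent brand (majority is a name introduced here; its error should be well above 0.216):

# misclassification error of a majority-class baseline on the test set
majority <- names(which.max(table(trainc.dat$brand)))
1 - mean(testc.dat$brand == majority)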
# Cross-validate, pruning on misclassification rate rather than deviance
cv.transc <- cv.tree(tree.transc, FUN = prune.misclass)
names(cv.transc)
## [1] "size" "dev" "k" "method"
par(mfrow = c(1,2))
plot(cv.transc$size, cv.transc$dev, type="b")
plot(cv.transc$k, cv.transc$dev, type="b")
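As on the regression side, the CV-selected size can be read off programmatically rather than hardcoded; best.size.c and prune.transc.cv are names introduced for this sketch:

# prune at the size with the fewest cross-validated misclassifications
best.size.c <- cv.transc$size[which.min(cv.transc$dev)]
prune.transc.cv <- prune.misclass(tree.transc, best = best.size.c)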
# Prune the classification tree to 4 terminal nodes
prune.transc <- prune.misclass(tree.transc, best = 4)
plot(prune.transc)
text(prune.transc, pretty = 0)
pruned.tree.predc <- predict(prune.transc, testc.dat, type = "class")
table(pruned.tree.predc, testc.dat$brand)
##
## pruned.tree.predc Giant Bicycles Norco Bicycles OHM Cycles Solex
## 0 0 0 0 0
## Giant Bicycles 0 1398 680 656 840
## Norco Bicycles 0 0 0 0 0
## OHM Cycles 0 0 0 498 365
## Solex 0 0 0 0 0
## Trek Bicycles 0 176 630 109 623
## WeareA2B 0 0 0 100 96
##
## pruned.tree.predc Trek Bicycles WeareA2B
## 0 0
## Giant Bicycles 482 495
## Norco Bicycles 0 0
## OHM Cycles 0 145
## Solex 0 0
## Trek Bicycles 892 693
## WeareA2B 0 266
1 - mean(pruned.tree.predc == testc.dat$brand)
## [1] 0.6660105
The pruned tree actually performs much worse than the full classification tree (test error 0.666 versus 0.216); forcing the tree down to four terminal nodes discards too much of the structure the full tree uses to separate the brands.
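A natural follow-up, using prune.transc.cv from the sketch above, is to check whether pruning at the CV-selected size recovers the full tree's accuracy (result not shown):

# test error of the tree pruned at the CV-selected size
pred.cv <- predict(prune.transc.cv, testc.dat, type = "class")
1 - mean(pred.cv == testc.dat$brand)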