Libraries to load
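
The functions used in this report come from the following packages (createDataPartition from caret, tree, rpart and rpart.plot, randomForest, gbm, xgboost, and plot_ly from plotly); they can all be loaded here so the later chunks run as shown.

library(caret)          # createDataPartition()
library(tree)           # tree()
library(rpart)          # rpart()  (also loaded again in Section 2.1)
library(rpart.plot)     # rpart.plot()
library(randomForest)   # randomForest(), varImpPlot()
library(gbm)            # gbm()
library(xgboost)        # xgboost(), xgb.DMatrix()
library(plotly)         # plot_ly()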

1 Load Data

wine <- read.csv("winequality-data.csv", header = TRUE)
wine <- wine[, -13]  # drop the 13th column, which is not used in the analysis
head(wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           6.7             0.15        0.38            1.7     0.037
## 2           6.7             0.44        0.31            1.9     0.030
## 3           6.1             0.17        0.21            1.9     0.090
## 4           6.6             0.39        0.22            4.0     0.038
## 5           6.8             0.32        0.34            6.0     0.050
## 6           8.3             0.28        0.27           17.5     0.045
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  20                   84 0.99046 3.09      0.53    11.4
## 2                  41                  104 0.99000 3.29      0.62    12.6
## 3                  44                  130 0.99255 3.07      0.41     9.7
## 4                  17                   98 0.99018 3.25      0.53    13.0
## 5                   5                  129 0.99530 3.19      0.40     9.1
## 6                  48                  253 1.00014 3.02      0.56     9.1
##   quality
## 1       6
## 2       7
## 3       5
## 4       7
## 5       5
## 6       6

2 Tree Implementation

Split the dataset into 50% training and 50% testing.

set.seed(500316995)

X <- wine[, -12]
y <- wine$quality
inTrain <- createDataPartition(y, p = 0.5)[[1]]

train <- wine[inTrain, ]
test <- wine[-inTrain, ]

X_train <- X[inTrain, ]
X_test <- X[-inTrain, ]
y_train <- y[inTrain]
y_test <- y[-inTrain]

2.1 Build decision tree model

Method 1: the tree package

set.seed(500316995)
ct <- tree(train$quality ~ ., data = train)

summary(ct)
## 
## Regression tree:
## tree(formula = train$quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol"             "volatile.acidity"    "free.sulfur.dioxide"
## [4] "residual.sugar"     
## Number of terminal nodes:  9 
## Residual mean deviance:  0.5516 = 1076 / 1950 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.60400 -0.41190 -0.08046  0.00000  0.58810  2.58800
plot(ct)
text(ct)

Method 2: the rpart package

library(rpart)
library(rpart.plot)

# Create a decision tree model
decision_tree <- rpart(quality ~ ., data = train)
# Visualize the decision tree with rpart.plot
rpart.plot(decision_tree, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

2.2 Assess decision tree model

# check the RSS of the prediction
tree.pred <- predict(ct, newdata = X_test)

plot(tree.pred, y_test)
abline(0, 1)

tree.RSS <- sum((tree.pred - y_test)^2)
tree.RSS
## [1] 1120.156
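
The RSS scales with the number of test observations, so it can be useful to convert it to a test MSE (and RMSE) for a rough comparison with the out-of-bag mean squared residuals reported by randomForest below. A minimal sketch using the objects already defined:

tree.MSE <- tree.RSS / length(y_test)   # test mean squared error
tree.RMSE <- sqrt(tree.MSE)             # typical prediction error in quality points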

3 Random Forest Implementation

3.1 Fit Random Forest

set.seed(500316995)
wine.forest <- randomForest(quality ~ ., data = train)
print(wine.forest)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.434991
##                     % Var explained: 45.33

forest.pred <- predict(wine.forest, newdata = X_test)

wine.forest$importance
##                      IncNodePurity
## fixed.acidity             96.40511
## volatile.acidity         155.54259
## citric.acid              100.39176
## residual.sugar           115.58033
## chlorides                112.31300
## free.sulfur.dioxide      159.30746
## total.sulfur.dioxide     115.13898
## density                  172.21832
## pH                       101.30907
## sulphates                 92.40652
## alcohol                  256.44731
varImpPlot(wine.forest)

3.2 Assess random forest

plot(forest.pred, y_test)
abline(0, 1)

forest.RSS <- sum((forest.pred - y_test)^2)
forest.RSS
## [1] 829.4077

Compared with the single decision tree, the random forest has a lower test RSS (829 vs. 1120).
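
A small summary table of the RSS values computed above makes the comparison explicit:

data.frame(model = c("decision tree", "random forest"),
           RSS   = c(tree.RSS, forest.RSS),
           MSE   = c(tree.RSS, forest.RSS) / length(y_test))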

3.3 Calibration

To fine-tune ntree:

set.seed(500316995)
RSS.test <- c()
ntreesVal = c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900)

for (i in ntreesVal) {
    rf.model <- randomForest(quality ~ ., data = train, ntree = i)
    rf.test <- predict(rf.model, newdata = X_test)
    RSS.rf <- sum((rf.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.rf)
}

data <- data.frame(ntreesVal, RSS.test)

fig <- plot_ly(data, x = ~ntreesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'ntree' In Random Forest", xaxis = list(title = "ntree"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable ntree value is around 500-600; adding more trees beyond that does not improve the test RSS.
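
Because randomForest records the out-of-bag error after every tree, an alternative sketch that avoids refitting a forest for each candidate ntree is to grow one large forest and inspect its OOB error curve (note this uses OOB error rather than the held-out test RSS, so the numbers are not directly comparable):

set.seed(500316995)
rf.big <- randomForest(quality ~ ., data = train, ntree = 1000)
plot(rf.big)   # OOB MSE as a function of the number of trees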

To fine-tune mtry:

set.seed(500316995)
RSS.test <- c()
mtryVal = seq(4, 9, by = 1)

for (i in mtryVal) {
    rf.model <- randomForest(quality ~ ., data = train, ntree = 500, mtry = i)
    rf.test <- predict(rf.model, newdata = X_test)
    RSS.rf <- sum((rf.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.rf)
}

data <- data.frame(mtryVal, RSS.test)

fig <- plot_ly(data, x = ~mtryVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'mtry' In Random Forest", xaxis = list(title = "mtry"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable mtry value is around 5-6; trying more variables at each split decreases performance.
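
The randomForest package also provides tuneRF, which searches over mtry using the OOB error rather than a held-out test set; a sketch with illustrative control values:

set.seed(500316995)
tuneRF(X_train, y_train,
       mtryStart  = 3,      # floor(p/3) is the regression default
       ntreeTry   = 500,
       stepFactor = 1.5,    # grow/shrink mtry by this factor at each step
       improve    = 0.01)   # minimum relative OOB improvement to keep searching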

4 Optional Boosting

4.1 gbm fitting

set.seed(500316995)
wine.boost <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = 5000)
summary(wine.boost)

##                                       var   rel.inf
## free.sulfur.dioxide   free.sulfur.dioxide 12.976804
## alcohol                           alcohol 12.133237
## pH                                     pH 10.851045
## total.sulfur.dioxide total.sulfur.dioxide  9.701659
## volatile.acidity         volatile.acidity  9.077407
## sulphates                       sulphates  8.605636
## fixed.acidity               fixed.acidity  8.537015
## density                           density  8.386870
## residual.sugar             residual.sugar  7.935762
## chlorides                       chlorides  7.486914
## citric.acid                   citric.acid  4.307650
boost.pred <- predict(wine.boost, newdata = X_test)

plot(boost.pred, y_test)
abline(0, 1)

gbm.RSS <- sum((boost.pred - y_test)^2)
gbm.RSS
## [1] 1060.302

The RSS on the test set is 1060, higher than that of the random forest (829) but lower than that of the single decision tree (1120).

To fine-tune n.trees:

set.seed(500316995)
RSS.test <- c()
n.treesVal = seq(100, 1000, by = 100)

for (i in n.treesVal) {
    gbm.model <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = i)
    gbm.test <- predict(gbm.model, newdata = X_test)
    RSS.gbm <- sum((gbm.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.gbm)
}

data <- data.frame(n.treesVal, RSS.test)

fig <- plot_ly(data, x = ~n.treesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'n.trees' In a Boosted Tree", xaxis = list(title = "n.trees"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable n.trees value is around 400; adding more trees beyond that degrades the test performance.
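
gbm can also select the number of trees internally: fitting with cv.folds > 1 and calling gbm.perf(method = "cv") returns the iteration with the lowest cross-validated error, which can then be passed to predict. A sketch under the same settings as above:

set.seed(500316995)
wine.boost.cv <- gbm(quality ~ ., data = train, distribution = "gaussian",
    n.trees = 1000, cv.folds = 5)
best.iter <- gbm.perf(wine.boost.cv, method = "cv")   # CV-optimal number of trees
boost.cv.pred <- predict(wine.boost.cv, newdata = X_test, n.trees = best.iter)
sum((boost.cv.pred - y_test)^2)   # test RSS at the CV-chosen number of trees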

4.2 xgboost fitting

set.seed(500316995)
xgb_train = xgb.DMatrix(data = as.matrix(X_train), label = y_train)
xgb_test = xgb.DMatrix(data = as.matrix(X_test), label = y_test)
wine.xgb <- xgboost(xgb_train, nrounds = 300, max_depth = 2, eta = 0.01, verbose = 0)
xgb.pred <- predict(wine.xgb, newdata = xgb_test)

plot(xgb.pred, y_test)
abline(0, 1)

xgb.RSS <- sum((xgb.pred - y_test)^2)
xgb.RSS
## [1] 1193.055

The RSS on the test set is 1193, the highest of the models fitted so far.

To fine-tune nrounds:

set.seed(500316995)
RSS.test <- c()
nroundsVal = c(300, 500, 1000, 1500)

for (i in nroundsVal) {
    xgb.model <- xgboost(xgb_train, nrounds = i, max_depth = 2, eta = 0.01, verbose = 0)
    xgb.test <- predict(xgb.model, newdata = xgb_test)
    RSS.xgb <- sum((xgb.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.xgb)
}

data <- data.frame(nroundsVal, RSS.test)

fig <- plot_ly(data, x = ~nroundsVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'nrounds' In Xgboost", xaxis = list(title = "nrounds"), 
    yaxis = list(title = "RSS on test set"))

fig

To fine-tune eta:

set.seed(500316995)
RSS.test <- c()
etaVal = c(0.01, 0.02, 0.03)

for (i in etaVal) {
    xgb.model <- xgboost(xgb_train, nrounds = 1500, max_depth = 2, eta = i, verbose = 0)
    xgb.test <- predict(xgb.model, newdata = xgb_test)
    RSS.xgb <- sum((xgb.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.xgb)
}

data <- data.frame(etaVal, RSS.test)

fig <- plot_ly(data, x = ~etaVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'eta' In Xgboost", xaxis = list(title = "eta"), 
    yaxis = list(title = "RSS on test set"))

fig
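
Instead of looping over nrounds manually, xgb.cv can cross-validate on the training DMatrix with early stopping, so boosting halts once the validation RMSE has not improved for a set number of rounds at a given eta. A sketch (fold count and stopping patience are illustrative):

set.seed(500316995)
cv <- xgb.cv(params = list(max_depth = 2, eta = 0.01),
    data = xgb_train, nrounds = 2000, nfold = 5,
    early_stopping_rounds = 50, verbose = 0)
cv$best_iteration   # nrounds suggested by cross-validation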

Appendix: Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 9
Student Name: Yujun Yao (June Yao)
SID: 500316995
Email: