Libraries to load
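
The functions used in this report come from the following packages (createDataPartition from caret, tree, rpart and rpart.plot, randomForest, gbm, xgboost, and plot_ly from plotly); they can all be loaded here so the later chunks run as shown.

library(caret)          # createDataPartition()
library(tree)           # tree()
library(rpart)          # rpart()  (also loaded again in Section 2.1)
library(rpart.plot)     # rpart.plot()
library(randomForest)   # randomForest(), varImpPlot()
library(gbm)            # gbm()
library(xgboost)        # xgboost(), xgb.DMatrix()
library(plotly)         # plot_ly()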

1 Load Data

wine <- read.csv("winequality-data.csv", header = TRUE)
wine <- wine[, -13]  # drop the 13th column, which is not used in the analysis
head(wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           6.7             0.15        0.38            1.7     0.037
## 2           6.7             0.44        0.31            1.9     0.030
## 3           6.1             0.17        0.21            1.9     0.090
## 4           6.6             0.39        0.22            4.0     0.038
## 5           6.8             0.32        0.34            6.0     0.050
## 6           8.3             0.28        0.27           17.5     0.045
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  20                   84 0.99046 3.09      0.53    11.4
## 2                  41                  104 0.99000 3.29      0.62    12.6
## 3                  44                  130 0.99255 3.07      0.41     9.7
## 4                  17                   98 0.99018 3.25      0.53    13.0
## 5                   5                  129 0.99530 3.19      0.40     9.1
## 6                  48                  253 1.00014 3.02      0.56     9.1
##   quality
## 1       6
## 2       7
## 3       5
## 4       7
## 5       5
## 6       6

2 Tree Implementation

Split the dataset into 50% training and 50% testing.

set.seed(500316995)

X <- wine[, -12]
y <- wine$quality
inTrain <- createDataPartition(y, p = 0.5)[[1]]

train <- wine[inTrain, ]
test <- wine[-inTrain, ]

X_train <- X[inTrain, ]
X_test <- X[-inTrain, ]
y_train <- y[inTrain]
y_test <- y[-inTrain]

2.1 Build decision tree model

Method 1: the tree package

set.seed(500316995)
ct <- tree(train$quality ~ ., data = train)

summary(ct)
## 
## Regression tree:
## tree(formula = train$quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol"             "volatile.acidity"    "free.sulfur.dioxide"
## [4] "residual.sugar"     
## Number of terminal nodes:  9 
## Residual mean deviance:  0.5516 = 1076 / 1950 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.60400 -0.41190 -0.08046  0.00000  0.58810  2.58800
plot(ct)
text(ct)

Method 2: the rpart package

library(rpart)
library(rpart.plot)

# Create a decision tree model
decision_tree <- rpart(quality ~ ., data = train)
# Visualize the decision tree with rpart.plot
rpart.plot(decision_tree, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)

2.2 Assess decision tree model

# check the RSS of the prediction
tree.pred <- predict(ct, newdata = X_test)

plot(tree.pred, y_test)
abline(0, 1)

tree.RSS <- sum((tree.pred - y_test)^2)
tree.RSS
## [1] 1120.156
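
The RSS scales with the number of test observations, so it can be useful to convert it to a test MSE (and RMSE) for a rough comparison with the out-of-bag mean squared residuals reported by randomForest below. A minimal sketch using the objects already defined:

tree.MSE <- tree.RSS / length(y_test)   # test mean squared error
tree.RMSE <- sqrt(tree.MSE)             # typical prediction error in quality points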

3 Random Forest Implementation

3.1 Fit Random Forest

set.seed(500316995)
wine.forest <- randomForest(quality ~ ., data = train)
print(wine.forest)
## 
## Call:
##  randomForest(formula = quality ~ ., data = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.434991
##                     % Var explained: 45.33

forest.pred <- predict(wine.forest, newdata = X_test)

wine.forest$importance
##                      IncNodePurity
## fixed.acidity             96.40511
## volatile.acidity         155.54259
## citric.acid              100.39176
## residual.sugar           115.58033
## chlorides                112.31300
## free.sulfur.dioxide      159.30746
## total.sulfur.dioxide     115.13898
## density                  172.21832
## pH                       101.30907
## sulphates                 92.40652
## alcohol                  256.44731
varImpPlot(wine.forest)

3.2 Assess random forest

plot(forest.pred, y_test)
abline(0, 1)

forest.RSS <- sum((forest.pred - y_test)^2)
forest.RSS
## [1] 829.4077

Compared with the single decision tree, the random forest has a lower test RSS (829 vs. 1120).
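
A small summary table of the RSS values computed above makes the comparison explicit:

data.frame(model = c("decision tree", "random forest"),
           RSS   = c(tree.RSS, forest.RSS),
           MSE   = c(tree.RSS, forest.RSS) / length(y_test))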

3.3 Calibration

To fine-tune ntree:

set.seed(500316995)
RSS.test <- c()
ntreesVal = c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900)

for (i in ntreesVal) {
    rf.model <- randomForest(quality ~ ., data = train, ntree = i)
    rf.test <- predict(rf.model, newdata = X_test)
    RSS.rf <- sum((rf.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.rf)
}

data <- data.frame(ntreesVal, RSS.test)

fig <- plot_ly(data, x = ~ntreesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'ntree' In Random Forest", xaxis = list(title = "ntree"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable ntree value is around 500-600; adding more trees beyond that does not improve the test RSS.
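
Because randomForest records the out-of-bag error after every tree, an alternative sketch that avoids refitting a forest for each candidate ntree is to grow one large forest and inspect its OOB error curve (note this uses OOB error rather than the held-out test RSS, so the numbers are not directly comparable):

set.seed(500316995)
rf.big <- randomForest(quality ~ ., data = train, ntree = 1000)
plot(rf.big)   # OOB MSE as a function of the number of trees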

To fine-tune mtry:

set.seed(500316995)
RSS.test <- c()
mtryVal = seq(4, 9, by = 1)

for (i in mtryVal) {
    rf.model <- randomForest(quality ~ ., data = train, ntree = 500, mtry = i)
    rf.test <- predict(rf.model, newdata = X_test)
    RSS.rf <- sum((rf.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.rf)
}

data <- data.frame(mtryVal, RSS.test)

fig <- plot_ly(data, x = ~mtryVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'mtry' In Random Forest", xaxis = list(title = "mtry"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable mtry value is around 5-6; trying more variables at each split decreases performance.
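
The randomForest package also provides tuneRF, which searches over mtry using the OOB error rather than a held-out test set; a sketch with illustrative control values:

set.seed(500316995)
tuneRF(X_train, y_train,
       mtryStart  = 3,      # floor(p/3) is the regression default
       ntreeTry   = 500,
       stepFactor = 1.5,    # grow/shrink mtry by this factor at each step
       improve    = 0.01)   # minimum relative OOB improvement to keep searching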

4 Optional Boosting

4.1 gbm fitting

set.seed(500316995)
wine.boost <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = 5000)
summary(wine.boost)

##                                       var   rel.inf
## free.sulfur.dioxide   free.sulfur.dioxide 12.976804
## alcohol                           alcohol 12.133237
## pH                                     pH 10.851045
## total.sulfur.dioxide total.sulfur.dioxide  9.701659
## volatile.acidity         volatile.acidity  9.077407
## sulphates                       sulphates  8.605636
## fixed.acidity               fixed.acidity  8.537015
## density                           density  8.386870
## residual.sugar             residual.sugar  7.935762
## chlorides                       chlorides  7.486914
## citric.acid                   citric.acid  4.307650
boost.pred <- predict(wine.boost, newdata = X_test)

plot(boost.pred, y_test)
abline(0, 1)

gbm.RSS <- sum((boost.pred - y_test)^2)
gbm.RSS
## [1] 1060.302

The RSS on the test set is 1060, higher than that of the random forest (829) but lower than that of the single decision tree (1120).

To fine-tune n.trees:

set.seed(500316995)
RSS.test <- c()
n.treesVal = seq(100, 1000, by = 100)

for (i in n.treesVal) {
    gbm.model <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = i)
    gbm.test <- predict(gbm.model, newdata = X_test)
    RSS.gbm <- sum((gbm.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.gbm)
}

data <- data.frame(n.treesVal, RSS.test)

fig <- plot_ly(data, x = ~n.treesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'n.trees' In a Boosted Tree", xaxis = list(title = "n.trees"), 
    yaxis = list(title = "RSS on test set"))

fig

We find that a suitable n.trees value is around 400; adding more trees beyond that degrades the test performance.
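
gbm can also select the number of trees internally: fitting with cv.folds > 1 and calling gbm.perf(method = "cv") returns the iteration with the lowest cross-validated error, which can then be passed to predict. A sketch under the same settings as above:

set.seed(500316995)
wine.boost.cv <- gbm(quality ~ ., data = train, distribution = "gaussian",
    n.trees = 1000, cv.folds = 5)
best.iter <- gbm.perf(wine.boost.cv, method = "cv")   # CV-optimal number of trees
boost.cv.pred <- predict(wine.boost.cv, newdata = X_test, n.trees = best.iter)
sum((boost.cv.pred - y_test)^2)   # test RSS at the CV-chosen number of trees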

4.2 xgboost fitting

set.seed(500316995)
xgb_train = xgb.DMatrix(data = as.matrix(X_train), label = y_train)
xgb_test = xgb.DMatrix(data = as.matrix(X_test), label = y_test)
wine.xgb <- xgboost(xgb_train, nrounds = 300, max_depth = 2, eta = 0.01, verbose = 0)
xgb.pred <- predict(wine.xgb, newdata = xgb_test)

plot(xgb.pred, y_test)
abline(0, 1)

xgb.RSS <- sum((xgb.pred - y_test)^2)
xgb.RSS
## [1] 1193.055

The RSS on the test set is 1193, the highest of the models fitted so far.

To fine-tune nrounds:

set.seed(500316995)
RSS.test <- c()
nroundsVal = c(300, 500, 1000, 1500)

for (i in nroundsVal) {
    xgb.model <- xgboost(xgb_train, nrounds = i, max_depth = 2, eta = 0.01, verbose = 0)
    xgb.test <- predict(xgb.model, newdata = xgb_test)
    RSS.xgb <- sum((xgb.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.xgb)
}

data <- data.frame(nroundsVal, RSS.test)

fig <- plot_ly(data, x = ~nroundsVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'nrounds' In Xgboost", xaxis = list(title = "nrounds"), 
    yaxis = list(title = "RSS on test set"))

fig

To fine-tune eta:

set.seed(500316995)
RSS.test <- c()
etaVal = c(0.01, 0.02, 0.03)

for (i in etaVal) {
    xgb.model <- xgboost(xgb_train, nrounds = 1500, max_depth = 2, eta = i, verbose = 0)
    xgb.test <- predict(xgb.model, newdata = xgb_test)
    RSS.xgb <- sum((xgb.test - y_test)^2)
    RSS.test <- c(RSS.test, RSS.xgb)
}

data <- data.frame(etaVal, RSS.test)

fig <- plot_ly(data, x = ~etaVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'eta' In Xgboost", xaxis = list(title = "eta"), 
    yaxis = list(title = "RSS on test set"))

fig
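
Instead of looping over nrounds manually, xgb.cv can cross-validate on the training DMatrix with early stopping, so boosting halts once the validation RMSE has not improved for a set number of rounds at a given eta. A sketch (fold count and stopping patience are illustrative):

set.seed(500316995)
cv <- xgb.cv(params = list(max_depth = 2, eta = 0.01),
    data = xgb_train, nrounds = 2000, nfold = 5,
    early_stopping_rounds = 50, verbose = 0)
cv$best_iteration   # nrounds suggested by cross-validation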

Appendix: Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 9
Student Name: Yujun Yao (June Yao)
SID: 500316995
Email: