wine <- read.csv("winequality-data.csv", header = TRUE)
wine <- wine[, -13]  # drop the 13th column, which is not used for modelling
head(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 6.7 0.15 0.38 1.7 0.037
## 2 6.7 0.44 0.31 1.9 0.030
## 3 6.1 0.17 0.21 1.9 0.090
## 4 6.6 0.39 0.22 4.0 0.038
## 5 6.8 0.32 0.34 6.0 0.050
## 6 8.3 0.28 0.27 17.5 0.045
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 20 84 0.99046 3.09 0.53 11.4
## 2 41 104 0.99000 3.29 0.62 12.6
## 3 44 130 0.99255 3.07 0.41 9.7
## 4 17 98 0.99018 3.25 0.53 13.0
## 5 5 129 0.99530 3.19 0.40 9.1
## 6 48 253 1.00014 3.02 0.56 9.1
## quality
## 1 6
## 2 7
## 3 5
## 4 7
## 5 5
## 6 6
Split the dataset into 50% training and 50% testing.
library(caret)
set.seed(500316995)
X <- wine[, -12]
y <- wine$quality
inTrain <- createDataPartition(y, p = 0.5)[[1]]
train <- wine[inTrain, ]
test <- wine[-inTrain, ]
X_train <- X[inTrain, ]
X_test <- X[-inTrain, ]
y_train <- y[inTrain]
y_test <- y[-inTrain]
Method 1: fit a regression tree with the tree package.
library(tree)
set.seed(500316995)
ct <- tree(quality ~ ., data = train)
summary(ct)
##
## Regression tree:
## tree(formula = quality ~ ., data = train)
## Variables actually used in tree construction:
## [1] "alcohol" "volatile.acidity" "free.sulfur.dioxide"
## [4] "residual.sugar"
## Number of terminal nodes: 9
## Residual mean deviance: 0.5516 = 1076 / 1950
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.60400 -0.41190 -0.08046 0.00000 0.58810 2.58800
plot(ct)
text(ct)
Method 2: fit and visualise the tree with rpart and rpart.plot.
library(rpart)
library(rpart.plot)
# Create a decision tree model
decision_tree <- rpart(quality ~ ., data = train)
# Visualize the decision tree with rpart.plot
rpart.plot(decision_tree, box.palette = "RdBu", shadow.col = "gray", nn = TRUE)
Random forest fitting:
library(randomForest)
set.seed(500316995)
wine.forest <- randomForest(quality ~ ., data = train)
print(wine.forest)
##
## Call:
## randomForest(formula = quality ~ ., data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.434991
## % Var explained: 45.33
forest.pred <- predict(wine.forest, newdata = X_test)
wine.forest$importance
## IncNodePurity
## fixed.acidity 96.40511
## volatile.acidity 155.54259
## citric.acid 100.39176
## residual.sugar 115.58033
## chlorides 112.31300
## free.sulfur.dioxide 159.30746
## total.sulfur.dioxide 115.13898
## density 172.21832
## pH 101.30907
## sulphates 92.40652
## alcohol 256.44731
varImpPlot(wine.forest)
Compared with the single decision tree, the random forest has a lower RSS on the test set (1120 vs. 829).
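These RSS values are not computed in the chunks shown above; a minimal sketch of how they can be obtained from the two fitted models (ct and wine.forest) is:
# Test-set RSS for the single regression tree and for the random forest;
# forest.pred was already computed above with predict(wine.forest, newdata = X_test).
ct.pred <- predict(ct, newdata = X_test)
sum((ct.pred - y_test)^2)      # single regression tree (quoted above as 1120)
sum((forest.pred - y_test)^2)  # random forest (quoted above as 829)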
To fine-tune ntree:
set.seed(500316995)
RSS.test <- c()
ntreesVal = c(50, 100, 200, 300, 400, 500, 600, 700, 800, 900)
for (i in ntreesVal) {
rf.model <- randomForest(quality ~ ., data = train, ntree = i)
rf.test <- predict(rf.model, newdata = X_test)
RSS.rf <- sum((rf.test - y_test)^2)
RSS.test <- c(RSS.test, RSS.rf)
}
data <- data.frame(ntreesVal, RSS.test)
library(plotly)
fig <- plot_ly(data, x = ~ntreesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'ntree' In Random Forest", xaxis = list(title = "ntree"),
yaxis = list(title = "RSS on test set"))
fig
The optimal ntree value is around 500-600; adding more trees does not improve performance.
To fine-tune mtry:
set.seed(500316995)
RSS.test <- c()
mtryVal = seq(4, 9, by = 1)
for (i in mtryVal) {
rf.model <- randomForest(quality ~ ., data = train, ntree = 500, mtry = i)
rf.test <- predict(rf.model, newdata = X_test)
RSS.rf <- sum((rf.test - y_test)^2)
RSS.test <- c(RSS.test, RSS.rf)
}
data <- data.frame(mtryVal, RSS.test)
fig <- plot_ly(data, x = ~mtryVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'mtry' In Random Forest", xaxis = list(title = "mtry"),
yaxis = list(title = "RSS on test set"))
fig
The optimal mtry value is around 5-6; allowing more variables per split decreases test performance.
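As a follow-up (not in the original chunks), the forest can be refit at the tuned settings; ntree = 500 and mtry = 5 are assumptions read off the two curves above.
# Refit the random forest with the (assumed) tuned hyperparameters and
# recompute the test-set RSS.
set.seed(500316995)
rf.tuned <- randomForest(quality ~ ., data = train, ntree = 500, mtry = 5)
rf.tuned.pred <- predict(rf.tuned, newdata = X_test)
sum((rf.tuned.pred - y_test)^2)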
GBM fitting:
library(gbm)
set.seed(500316995)
wine.boost <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = 5000)
summary(wine.boost)
## var rel.inf
## free.sulfur.dioxide free.sulfur.dioxide 12.976804
## alcohol alcohol 12.133237
## pH pH 10.851045
## total.sulfur.dioxide total.sulfur.dioxide 9.701659
## volatile.acidity volatile.acidity 9.077407
## sulphates sulphates 8.605636
## fixed.acidity fixed.acidity 8.537015
## density density 8.386870
## residual.sugar residual.sugar 7.935762
## chlorides chlorides 7.486914
## citric.acid citric.acid 4.307650
The RSS on the test-set predictions is 1060, higher than that of the random forest.
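The quoted figure can be reproduced from wine.boost with the usual RSS calculation; a minimal sketch, using all 5000 trees for prediction:
# Predict on the test set with the full 5000-tree boosted model and compute the RSS.
boost.pred <- predict(wine.boost, newdata = X_test, n.trees = 5000)
sum((boost.pred - y_test)^2)  # quoted above as 1060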
To fine-tune n.trees:
set.seed(500316995)
RSS.test <- c()
n.treesVal = seq(100, 1000, by = 100)
for (i in n.treesVal) {
gbm.model <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = i)
gbm.test <- predict(gbm.model, newdata = X_test, n.trees = i)
RSS.gbm <- sum((gbm.test - y_test)^2)
RSS.test <- c(RSS.test, RSS.gbm)
}
data <- data.frame(n.treesVal, RSS.test)
fig <- plot_ly(data, x = ~n.treesVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'n.trees' In a Boosted Tree", xaxis = list(title = "n.trees"),
yaxis = list(title = "RSS on test set"))
fig
The optimal n.trees value is around 400; adding more trees beyond that decreases test performance.
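To make use of this, the boosted model can be refit at roughly that size (n.trees = 400 is an assumption read off the curve) and checked on the test set:
# Refit the GBM at the (assumed) tuned number of trees and recompute the test-set RSS.
set.seed(500316995)
gbm.tuned <- gbm(quality ~ ., data = train, distribution = "gaussian", n.trees = 400)
gbm.tuned.pred <- predict(gbm.tuned, newdata = X_test, n.trees = 400)
sum((gbm.tuned.pred - y_test)^2)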
XGBoost fitting:
library(xgboost)
set.seed(500316995)
xgb_train = xgb.DMatrix(data = as.matrix(X_train), label = y_train)
xgb_test = xgb.DMatrix(data = as.matrix(X_test), label = y_test)
wine.xgb <- xgboost(xgb_train, nrounds = 300, max_depth = 2, eta = 0.01, verbose = 0)
xgb.pred <- predict(wine.xgb, newdata = xgb_test)
plot(xgb.pred, y_test)
abline(0, 1)
The RSS on the test-set predictions is 1193.
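The quoted value comes from the usual RSS calculation on the predictions above:
sum((xgb.pred - y_test)^2)  # RSS on the test set (quoted above as 1193)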
To fine-tune nrounds:
set.seed(500316995)
RSS.test <- c()
nroundsVal = c(300, 500, 1000, 1500)
for (i in nroundsVal) {
xgb.model <- xgboost(xgb_train, nrounds = i, max_depth = 2, eta = 0.01, verbose = 0)
xgb.test <- predict(xgb.model, newdata = xgb_test)
RSS.xgb <- sum((xgb.test - y_test)^2)
RSS.test <- c(RSS.test, RSS.xgb)
}
data <- data.frame(nroundsVal, RSS.test)
fig <- plot_ly(data, x = ~nroundsVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'nrounds' In Xgboost", xaxis = list(title = "nrounds"),
yaxis = list(title = "RSS on test set"))
fig
To fine-tune eta:
set.seed(500316995)
RSS.test <- c()
etaVal = c(0.01, 0.02, 0.03)
for (i in etaVal) {
xgb.model <- xgboost(xgb_train, nrounds = 1500, max_depth = 2, eta = i, verbose = 0)
xgb.test <- predict(xgb.model, newdata = xgb_test)
RSS.xgb <- sum((xgb.test - y_test)^2)
RSS.test <- c(RSS.test, RSS.xgb)
}
data <- data.frame(etaVal, RSS.test)
fig <- plot_ly(data, x = ~etaVal, y = ~RSS.test, type = "scatter", mode = "lines")
fig <- fig %>% layout(title = "Fine Tune 'eta' In Xgboost", xaxis = list(title = "eta"),
yaxis = list(title = "RSS on test set"))
fig
Tune the GBM using the caret package:
# Basic Parameter Tuning
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated five times
repeats = 5)
# Alternate Tuning Grids
gbmGrid <- expand.grid(interaction.depth = c(1, 5, 9),
n.trees = (2:20)*50,
shrinkage = 0.1,
n.minobsinnode = 10)
set.seed(500316995)
gbmFit <- train(quality ~ ., data = train,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
tuneGrid = gbmGrid)
gbmFit
## Stochastic Gradient Boosting
##
## 1959 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 1567, 1568, 1567, 1566, 1568, 1566, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 100 0.7362527 0.3225697 0.5846532
## 1 150 0.7305795 0.3314542 0.5774298
## 1 200 0.7292758 0.3336155 0.5754446
## 1 250 0.7289535 0.3350475 0.5747415
## 1 300 0.7286795 0.3355301 0.5744462
## 1 350 0.7281199 0.3367093 0.5733173
## 1 400 0.7279783 0.3370753 0.5735390
## 1 450 0.7279302 0.3374161 0.5730847
## 1 500 0.7277324 0.3379215 0.5727456
## 1 550 0.7270541 0.3395281 0.5721529
## 1 600 0.7272055 0.3394400 0.5723890
## 1 650 0.7272017 0.3396276 0.5725274
## 1 700 0.7267240 0.3403297 0.5721790
## 1 750 0.7267989 0.3404770 0.5722217
## 1 800 0.7264276 0.3412214 0.5720308
## 1 850 0.7263539 0.3414956 0.5719957
## 1 900 0.7259669 0.3426450 0.5714543
## 1 950 0.7261649 0.3419749 0.5716206
## 1 1000 0.7266992 0.3413185 0.5716210
## 5 100 0.7106975 0.3673910 0.5562537
## 5 150 0.7079073 0.3734706 0.5510131
## 5 200 0.7076066 0.3752111 0.5479786
## 5 250 0.7086543 0.3746580 0.5469854
## 5 300 0.7094686 0.3749694 0.5449420
## 5 350 0.7106329 0.3744511 0.5446482
## 5 400 0.7107651 0.3754967 0.5428648
## 5 450 0.7121918 0.3748813 0.5428876
## 5 500 0.7137284 0.3738052 0.5429280
## 5 550 0.7146018 0.3735233 0.5425093
## 5 600 0.7154602 0.3732026 0.5420693
## 5 650 0.7162739 0.3729462 0.5414077
## 5 700 0.7184044 0.3708320 0.5418238
## 5 750 0.7197326 0.3698542 0.5417187
## 5 800 0.7206636 0.3697226 0.5418726
## 5 850 0.7220666 0.3684387 0.5420467
## 5 900 0.7220383 0.3691863 0.5413047
## 5 950 0.7227254 0.3692415 0.5412302
## 5 1000 0.7240041 0.3681923 0.5411933
## 9 100 0.7058751 0.3771756 0.5484470
## 9 150 0.7044415 0.3816944 0.5441407
## 9 200 0.7046766 0.3832219 0.5403264
## 9 250 0.7057139 0.3838198 0.5389463
## 9 300 0.7067026 0.3842534 0.5364726
## 9 350 0.7075390 0.3843756 0.5344856
## 9 400 0.7090520 0.3840878 0.5330889
## 9 450 0.7108625 0.3825117 0.5330692
## 9 500 0.7114202 0.3830780 0.5314629
## 9 550 0.7122656 0.3830201 0.5307927
## 9 600 0.7131852 0.3827910 0.5297958
## 9 650 0.7142192 0.3825248 0.5292818
## 9 700 0.7151591 0.3819745 0.5289880
## 9 750 0.7159789 0.3818157 0.5282693
## 9 800 0.7170267 0.3810389 0.5280034
## 9 850 0.7179333 0.3805250 0.5274039
## 9 900 0.7188259 0.3800430 0.5269181
## 9 950 0.7195144 0.3798579 0.5262685
## 9 1000 0.7204305 0.3790762 0.5259406
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 9, shrinkage = 0.1 and n.minobsinnode = 10.
trellis.par.set(caretTheme())
plot(gbmFit)
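caret selects the model by cross-validated RMSE; to compare it with the earlier models, the tuned fit can also be scored on the held-out test set (a sketch, not part of the original output):
# predict.train applies the final tuned GBM (n.trees = 150, interaction.depth = 9
# per the output above).
gbmFit.pred <- predict(gbmFit, newdata = X_test)
sum((gbmFit.pred - y_test)^2)         # test-set RSS
sqrt(mean((gbmFit.pred - y_test)^2))  # test-set RMSE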
Tune XGBoost using the caret package:
# Basic Parameter Tuning
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated five times
repeats = 5)
# Alternate Tuning Grids
xgbGrid <- expand.grid(nrounds = c(300, 500, 1000, 1500),
max_depth = 2,
eta = c(0.01, 0.02, 0.03),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
set.seed(500316995)
xgbFit <- train(quality ~ ., data = train,
method = "xgbTree",
trControl = fitControl,
verbose = FALSE,
tuneGrid = xgbGrid,
objective="reg:squarederror")
xgbFit
## eXtreme Gradient Boosting
##
## 1959 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 1567, 1568, 1567, 1566, 1568, 1566, ...
## Resampling results across tuning parameters:
##
## eta nrounds RMSE Rsquared MAE
## 0.01 300 0.7913690 0.3157648 0.6172729
## 0.01 500 0.7359278 0.3287945 0.5803894
## 0.01 1000 0.7238817 0.3445116 0.5701468
## 0.01 1500 0.7189925 0.3520156 0.5656466
## 0.02 300 0.7315816 0.3337868 0.5770693
## 0.02 500 0.7236587 0.3448285 0.5700531
## 0.02 1000 0.7155525 0.3578821 0.5623495
## 0.02 1500 0.7123323 0.3640907 0.5585572
## 0.03 300 0.7249205 0.3430016 0.5711306
## 0.03 500 0.7184423 0.3529718 0.5651800
## 0.03 1000 0.7118129 0.3649936 0.5581016
## 0.03 1500 0.7104156 0.3687531 0.5540925
##
## Tuning parameter 'max_depth' was held constant at a value of 2
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'colsample_bytree' was held constant at a value of 1
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
##
## Tuning parameter 'subsample' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 1500, max_depth = 2, eta
## = 0.03, gamma = 0, colsample_bytree = 1, min_child_weight = 1 and subsample
## = 1.
trellis.par.set(caretTheme())
plot(xgbFit, main = "Fine Tune Parameters on XGBoost",
     xlab = "nrounds",
     ylab = "RMSE")
Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 9
Student Name: Yujun Yao (June Yao)
SID: 500316995
Email: yyao2983@uni.sydney.edu.au