Shane Hylton
2022-12-15
Bagging vs. Boosting
It is easy to get overwhelmed by the number of tuning options, so a few guidelines help:
Start with nrounds and eta.
max_depth: Ideally, keep this value under 10.
gamma: Start low and increase it in successive model runs.
subsample: Start with 1. A value of 0.5 randomly selects half of the training set for each tree, which helps prevent overfitting. A rough sketch of how these parameters fit together appears below.
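As an illustration only (these are not the settings used later in this post), a starting parameter list might look like the following; starter_params and some_dmatrix are placeholder names:
# Illustrative starting values; tune nrounds and eta first, then revisit the rest.
starter_params <- list(eta = 0.1,      # learning rate
                       max_depth = 6,  # keep under roughly 10
                       gamma = 0,      # start low, raise in later runs
                       subsample = 1)  # 0.5 would sample half the rows per tree
# nrounds is passed to xgb.train() directly rather than through the params list:
# xgb.train(params = starter_params, data = some_dmatrix, nrounds = 100)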
The first example uses the UCI Zoo dataset. Classes are numeric, but the rough translation is as follows: 1 = mammal, 2 = bird, 3 = reptile, 4 = fish, 5 = amphibian, 6 = insect, 7 = other invertebrate.
# Packages used in this analysis
library(xgboost); library(Matrix); library(caret)
library(ggplot2); library(dplyr); library(lubridate)
destfile <- tempfile()
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data",
destfile)
zoo <- read.csv(destfile, header = FALSE)
colnames(zoo) <- c('animal_name','hair','feathers','eggs','milk','airborne',
'aquatic','predator','toothed','backbone','breathes',
'venomous','fins','legs','tail','domestic','catsize','label')
zoo_anon <- zoo[,-1] # Remove Animal Names
# Critical step: for multi-class XGBoost, the class labels must be consecutive
# integers starting at 0, so the labels become 0:6 rather than 1:7.
zoo_anon$label <- as.numeric(zoo_anon$label - 1)
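A quick sanity check (not part of the original code) confirms the relabeled classes now run from 0 to 6:
table(zoo_anon$label) # counts per class; the names should be 0 through 6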
set.seed(34);
sample_rows <- sample(nrow(zoo_anon),nrow(zoo_anon) *.7)
dt <- sort(sample_rows)
zoo_anon_mat <- sparse.model.matrix(label ~ .-1,data = zoo_anon)
test <- zoo_anon[-dt,]
train <- zoo_anon[dt,]
testm <- zoo_anon_mat[-dt,]
trainm <- zoo_anon_mat[dt,]
# XGBoost needs a matrix (an xgb.DMatrix) as input.
train_label <- train[,"label"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
test_label <- test[,"label"]
test_matrix <- xgb.DMatrix(data = testm, label = test_label)
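The inspection output below appears to come from previewing the sparse training matrix; a likely pair of calls (the exact object previewed is an assumption) is:
head(trainm)     # first six rows of the sparse model matrix
colnames(trainm) # the 16 feature names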
## 6 x 16 sparse Matrix of class "dgCMatrix"
## [[ suppressing 16 column names 'hair', 'feathers', 'eggs' ... ]]
##
## 1 1 . . 1 . . 1 1 1 1 . . 4 . . 1
## 2 1 . . 1 . . . 1 1 1 . . 4 1 . 1
## 3 . . 1 . . 1 1 1 1 . . 1 . 1 . .
## 4 1 . . 1 . . 1 1 1 1 . . 4 . . 1
## 5 1 . . 1 . . 1 1 1 1 . . 4 1 . 1
## 6 1 . . 1 . . . 1 1 1 . . 4 1 . 1
## [1] "hair" "feathers" "eggs" "milk" "airborne" "aquatic"
## [7] "predator" "toothed" "backbone" "breathes" "venomous" "fins"
## [13] "legs" "tail" "domestic" "catsize"
nc <- length(unique(train_label))
xgb_params <- list("objective" = "multi:softmax","eval_metric" = "mlogloss",
"num_class" = nc,"eta" = 0.01)
watchlist <- list(train = train_matrix, test = test_matrix)
xgb_model <- xgb.train(params = xgb_params, data = train_matrix, nrounds = 10,
watchlist = watchlist)
## [1] train-mlogloss:1.921928 test-mlogloss:1.923682
## [2] train-mlogloss:1.898515 test-mlogloss:1.901969
## [3] train-mlogloss:1.875646 test-mlogloss:1.880749
## [4] train-mlogloss:1.853151 test-mlogloss:1.860008
## [5] train-mlogloss:1.831157 test-mlogloss:1.839316
## [6] train-mlogloss:1.809135 test-mlogloss:1.818870
## [7] train-mlogloss:1.787585 test-mlogloss:1.798463
## [8] train-mlogloss:1.766485 test-mlogloss:1.778915
## [9] train-mlogloss:1.746148 test-mlogloss:1.759863
## [10] train-mlogloss:1.725899 test-mlogloss:1.741098
e <- data.frame(xgb_model$evaluation_log)
fig_1 <- ggplot(e, aes(iter,train_mlogloss))+
geom_point(color = 'green',shape = 0)+
geom_line(data = e, aes(x=iter,y=test_mlogloss),color = 'red')+
ggtitle("XGBoost Model Log Loss vs. Number of Iterations")+
theme(plot.title = element_text(hjust = 0.5))
imp <- xgb.importance(colnames(train_matrix),model = xgb_model)
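The rendered log-loss plot and importance chart are not reproduced in this extract; displaying them would likely have used calls along these lines (whether the original used xgb.plot.importance is an assumption):
fig_1                    # render the log-loss curve built above
xgb.plot.importance(imp) # bar chart of feature importance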
p <- predict(xgb_model,newdata = test_matrix)
pred <- data.frame(cbind(p, test_label))
pred$correct <- pred$p == pred$test_label
table(Prediction = pred$p,Actual = pred$test_label)
## Actual
## Prediction 0 1 2 3 4 5 6
## 0 10 0 0 0 0 0 0
## 1 0 10 0 0 0 0 0
## 2 0 0 2 0 1 0 0
## 3 0 0 0 2 0 0 0
## 5 0 0 0 0 0 3 0
## 6 0 0 0 0 0 0 3
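The accuracy summary that follows was presumably produced by tabulating the correct flag:
table(pred$correct) # count of incorrect (FALSE) vs. correct (TRUE) predictions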
##
## FALSE TRUE
## 1 30
set.seed(34);
trControl <- trainControl(method = "cv",number = 10,search = "grid")
train$label <- as.factor(train$label)
rf_default <- train(label~.,data = train,method = "rf",
metric = "Accuracy",trControl = trControl)
p <- predict(rf_default, test)
pred <- data.frame(p, test_label)
pred$correct <- pred$p == pred$test_label
table(Prediction = pred$p,Actual = pred$test_label)
## Actual
## Prediction 0 1 2 3 4 5 6
## 0 10 0 0 0 0 0 0
## 1 0 10 0 0 0 0 0
## 2 0 0 2 0 1 0 0
## 3 0 0 0 2 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 3 0
## 6 0 0 0 0 0 0 3
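As with the XGBoost model, the summary below likely comes from tabulating the correct flag:
table(pred$correct) # count of incorrect vs. correct random forest predictions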
##
## FALSE TRUE
## 1 30
destfile <- tempfile()
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz", destfile)
traffic <- read.csv(destfile, header = FALSE)
colnames(traffic) <- traffic[1,]
traffic <- traffic[-1,]
rownames(traffic) <- c(seq(1,nrow(traffic)))
#str(traffic)
traffic <- traffic %>%
mutate_at(c('temp','rain_1h','snow_1h','clouds_all','traffic_volume'),as.numeric)
holiday <- model.matrix(~holiday - 1, traffic) # -1 to remove intercept
weather_main <- model.matrix(~weather_main - 1, traffic)
weather_description <- model.matrix(~weather_description - 1, traffic)
traffic <- traffic %>%
mutate(., hour = hour(date_time),
day = day(date_time),
month = month(date_time),
year = year(date_time))
traffic <- traffic[,-c(1,6,7,8)] # Drop holiday, weather_main, weather_description, and date_time (encoded separately above)
traffic_comb <- cbind(traffic,holiday,weather_main,weather_description)
set.seed(34);
sample_rows <- sample(nrow(traffic_comb),nrow(traffic_comb) *.7)
dt <- sort(sample_rows)
test_set_xgb <- traffic_comb[-dt,]
train_set <- traffic_comb[dt,]
train_x_mat <- data.matrix(train_set[,-5]) # XGBoost requires a matrix as input
train_y <- train_set[,5] #Response Variable
test_x_mat <- data.matrix(test_set_xgb[,-5])
test_y <- test_set_xgb[,5] #Response Variable
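The code that builds the DMatrix objects and trains the baseline regression model is not shown in this extract, but the later steps depend on xgb_train, xgb_test, watchlist, and xgb_model. A sketch of what that step likely looked like follows; the nrounds and max.depth values are placeholders, not the settings behind the reported numbers:
xgb_train <- xgb.DMatrix(data = train_x_mat, label = train_y)
xgb_test <- xgb.DMatrix(data = test_x_mat, label = test_y)
watchlist <- list(train = xgb_train, test = xgb_test)
# Placeholder hyperparameters; the original values are not shown in this extract.
xgb_model <- xgb.train(data = xgb_train, max.depth = 6, watchlist = watchlist,
                       nrounds = 100, verbose = 0)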
e <- data.frame(xgb_model$evaluation_log)
#which(e$test_rmse == min(e$test_rmse))
fig_2 <- ggplot(e, aes(iter,train_rmse))+
geom_point(color = 'blue',shape = 1)+
geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
theme(plot.title = element_text(hjust = 0.5))
imp <- xgb.importance(colnames(xgb_train),model = xgb_model)
xgb_model_tuned <- xgb.train(data=xgb_train,max.depth=3,watchlist=watchlist,
nrounds=25,verbose=0,eta = 0.25)
# With eta = 0.01, only month and clouds_all remained in the importance table.
e <- data.frame(xgb_model_tuned$evaluation_log)
#which(e$test_rmse == min(e$test_rmse))
fig_3 <- ggplot(e, aes(iter,train_rmse))+
geom_point(color = 'blue',shape = 1)+
geom_line(data = e, aes(x=iter,y=test_rmse),color = 'red')+
ggtitle("XGBoost Model Root Mean Squared Error Per Iteration")+
theme(plot.title = element_text(hjust = 0.5))
imp <- xgb.importance(colnames(xgb_train),model = xgb_model_tuned)
xgb_pred <- predict(xgb_model,xgb_test)
mse_xgb <- mean((test_y - xgb_pred)^2)
rmse_xgb <- RMSE(test_y,xgb_pred)
xgb_residuals <- data.frame(xgb_pred - test_y)
test_mean <- mean(xgb_pred) # mean of the model's predictions on the test set
tss <- sum((xgb_pred - test_mean)^2) # total sum of squares computed from the predictions rather than the observed test_y
rss <- sum(xgb_residuals^2)
R2_xgb <- 1 - (rss/tss)
test_set_xgb$xgb_Pred <- xgb_pred
xgb_stats <- data.frame(rbind(mse_xgb,rmse_xgb,test_mean,R2_xgb))
colnames(xgb_stats) <- ("XGBoost Model Score")
xgb_stats$`XGBoost Model Score` <- round(xgb_stats$`XGBoost Model Score`,3)
rownames(xgb_stats) <- c("Mean_Squared_Error","Root_Mean_Squared_Error",
"Test_Mean","R_Squared")
xgb_compare <- cbind(xgb_stats,xgb_stats_tuned)
| | XGBoost Model Score | XGBoost Tuned Model Score |
|---|---|---|
| Mean_Squared_Error | 790838.519 | 843712.299 |
| Root_Mean_Squared_Error | 889.291 | 918.538 |
| Test_Mean | 3247.804 | 3247.302 |
| R_Squared | 0.747 | 0.720 |
Brownlee, Jason. “A Gentle Introduction to XGBoost for Applied Machine Learning.” MachineLearningMastery.com, 16 Feb. 2021, https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/.
prashant111 (Prashant Banerjee). “A Guide on XGBoost Hyperparameters Tuning.” Kaggle, Kaggle, 15 July 2020, https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook.
prashant111 (Prashant Banerjee). “Bagging vs Boosting.” Kaggle, Kaggle, 30 June 2020, https://www.kaggle.com/code/prashant111/bagging-vs-boosting/notebook.
“Metro Interstate Traffic Volume Data Set.” UCI Machine Learning Repository, https://archive.ics.uci.edu/ml/datasets/Metro+Interstate+Traffic+Volume.
“XGBoost R Tutorial.” XGBoost R Tutorial - Xgboost 1.7.2 Documentation, https://xgboost.readthedocs.io/en/stable/R-package/xgboostPresentation.html.