Include the R code for this HW.
install.packages(“randomForest”) install.packages(“gbm”)
knitr::opts_chunk$set(echo = TRUE)
library(ISLR2)
library(GGally)
library(tibble)
library(dplyr)
library(knitr)
library(kableExtra)
library(caret)
library(rpart)
library(glmnet)
library(randomForest)
library(gbm)
#add more libraries as needed.
Use hw5data1.Rdata to answer this question.
load("hw5data1.Rdata") #put this file in your working directory or in the same folder as your HW Rmd file
str(circle.trn)#your training data
## tibble [4,000 × 3] (S3: tbl_df/tbl/data.frame)
## $ x.1 : num [1:4000] -0.479 0.2 0.32 0.888 0.333 ...
## $ x.2 : num [1:4000] 0.36 0.784 0.581 -0.271 0.532 ...
## $ classes: Factor w/ 2 levels "1","2": 1 2 1 2 1 1 1 2 2 2 ...
str(circle.tst)# your test data
## tibble [1,000 × 3] (S3: tbl_df/tbl/data.frame)
## $ x.1 : num [1:1000] 0.9886 -0.0187 -0.4419 0.7234 0.1634 ...
## $ x.2 : num [1:1000] 0.352 -0.234 0.278 0.209 -0.259 ...
## $ classes: Factor w/ 2 levels "1","2": 2 1 1 1 1 2 1 1 2 1 ...
plot(x.2~x.1,data=circle.trn,col=circle.trn$classes,pch=19)
classes variable: categorical response variablex.1, x.2: feature variablescat("The plot generated in this scenario has a visibly non-linear decision boundary shown by the round shape formed between the two classes. A decision tree in this example would be formed with horizontal and vertical lines at x1 and x2. This data isn't easily segrated by those boundaries. A decision tree will work better it it is deep, but if only a few splits are present, approximating the data won't be accurate.")
## The plot generated in this scenario has a visibly non-linear decision boundary shown by the round shape formed between the two classes. A decision tree in this example would be formed with horizontal and vertical lines at x1 and x2. This data isn't easily segrated by those boundaries. A decision tree will work better it it is deep, but if only a few splits are present, approximating the data won't be accurate.
train() function of
caret package, with 10-fold cross validation. Use the grid
of odd numbers, from 1 to 101 for k. Choose best k. (For
this problem, do not need to consider scaling.)ctrl <- trainControl(method = "cv", number = 10)
k_grid <- data.frame(k = seq(1, 101, by = 2))
set.seed(1)
knn_model <- train(classes ~ ., data = circle.trn,method = "knn",trControl = ctrl,tuneGrid = k_grid)
print(knn_model)
## k-Nearest Neighbors
##
## 4000 samples
## 2 predictor
## 2 classes: '1', '2'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3600, 3601, 3600, 3599, 3601, 3600, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9899975 0.9799939
## 3 0.9864968 0.9729922
## 5 0.9864962 0.9729902
## 7 0.9884981 0.9769951
## 9 0.9864968 0.9729917
## 11 0.9872500 0.9744983
## 13 0.9864975 0.9729931
## 15 0.9867481 0.9734943
## 17 0.9877487 0.9754952
## 19 0.9869981 0.9739930
## 21 0.9870000 0.9739965
## 23 0.9875018 0.9750001
## 25 0.9880012 0.9760003
## 27 0.9875006 0.9749982
## 29 0.9882500 0.9764970
## 31 0.9860000 0.9719963
## 33 0.9852500 0.9704956
## 35 0.9857475 0.9714900
## 37 0.9862475 0.9724895
## 39 0.9874987 0.9749930
## 41 0.9864987 0.9729917
## 43 0.9855012 0.9709964
## 45 0.9839993 0.9679922
## 47 0.9849993 0.9699934
## 49 0.9859993 0.9719929
## 51 0.9854993 0.9709932
## 53 0.9847500 0.9694934
## 55 0.9842506 0.9684944
## 57 0.9839993 0.9679922
## 59 0.9844993 0.9689918
## 61 0.9852500 0.9704938
## 63 0.9852506 0.9704950
## 65 0.9845006 0.9689950
## 67 0.9822506 0.9644931
## 69 0.9835000 0.9669929
## 71 0.9837500 0.9674929
## 73 0.9837506 0.9674935
## 75 0.9837493 0.9674920
## 77 0.9835006 0.9669933
## 79 0.9840012 0.9679954
## 81 0.9842506 0.9684942
## 83 0.9840006 0.9679947
## 85 0.9835018 0.9669966
## 87 0.9845018 0.9689976
## 89 0.9852518 0.9704979
## 91 0.9845031 0.9690001
## 93 0.9845006 0.9689947
## 95 0.9845018 0.9689970
## 97 0.9855000 0.9709936
## 99 0.9850006 0.9699950
## 101 0.9852506 0.9704950
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
plot(knn_model)
cat("The best k seems to always be 1 no matter what the seed is set to.")
## The best k seems to always be 1 no matter what the seed is set to.
rpart() function of
rpart package. Use cp=0 to grow a big tree.
Then create the cp-table and cp vs size of plot. (By default, this
function use 10-fold cross validation. This number can be controlled
using xval= option if necessary. No need to change for this
HW.) Based on the result, choose the optimal cp value.tree_model <- rpart(classes ~ ., data = circle.trn, method = "class", control=rpart.control(cp = 0))
printcp(tree_model)
##
## Classification tree:
## rpart(formula = classes ~ ., data = circle.trn, method = "class",
## control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] x.1 x.2
##
## Root node error: 1982/4000 = 0.4955
##
## n= 4000
##
## CP nsplit rel error xerror xstd
## 1 0.2479818 0 1.000000 1.000000 0.0159543
## 2 0.1755802 2 0.504036 0.516650 0.0139262
## 3 0.1659939 3 0.328456 0.410696 0.0128470
## 4 0.0123613 4 0.162462 0.183148 0.0091662
## 5 0.0080727 7 0.122099 0.136226 0.0080058
## 6 0.0052136 10 0.093340 0.122603 0.0076224
## 7 0.0045409 13 0.077699 0.105449 0.0071009
## 8 0.0029011 16 0.064077 0.095358 0.0067704
## 9 0.0025227 21 0.048436 0.075681 0.0060624
## 10 0.0023545 22 0.045913 0.074672 0.0060234
## 11 0.0016818 28 0.031786 0.065086 0.0056373
## 12 0.0010091 31 0.026741 0.057518 0.0053097
## 13 0.0000000 34 0.023713 0.053986 0.0051487
cp_table <- tree_model$cptable
best_row <- which.min(cp_table[,"xerror"])
optimal_cp <- cp_table[best_row, "CP"]
plotcp(tree_model)
cat("The cp table has a minimum cross-validation error with a corresponding cp value. In this case, the cp value is:", optimal_cp, "\n")
## The cp table has a minimum cross-validation error with a corresponding cp value. In this case, the cp value is: 0
cat("The k value from part 'b' and the cp value from part 'c' will be used for refitting on the training data.")
## The k value from part 'b' and the cp value from part 'c' will be used for refitting on the training data.
best_k <- knn_model$bestTune$k
final_knn <- train(classes ~ .,data = circle.trn,method = "knn", tuneGrid = data.frame(k = best_k),trControl = trainControl(method = "none"))
pred_knn <- predict(final_knn, newdata = circle.tst)
acc_knn <- mean(pred_knn == circle.tst$classes)
cat("Test accuracy for k-NN:", round(acc_knn, 4), "\n")
## Test accuracy for k-NN: 0.986
final_tree <- rpart(classes ~ ., data = circle.trn, method = "class",
control = rpart.control(cp = optimal_cp))
pred_tree <- predict(final_tree, newdata = circle.tst, type = "class")
acc_tree <- mean(pred_tree == circle.tst$classes)
cat("Test accuracy for Decision Tree:", round(acc_tree, 4), "\n")
## Test accuracy for Decision Tree: 0.974
if (acc_knn > acc_tree) {
cat("k-NN performed better on the test data.\n")
} else if (acc_knn < acc_tree) {
cat("Decision Tree performed better on the test data.\n")
} else {
cat("Both models performed equally well on the test data.\n")
}
## k-NN performed better on the test data.
This question relates to the Boston data set of
ISLR2 package.
set.seed(42)
trn.idx=sample(1:nrow(ISLR2::Boston),450)
tst.boston=ISLR2::Boston[-trn.idx,]
trn.boston=ISLR2::Boston[trn.idx,]
We are splitting the data into two parts: a testing data that contains 56 observations, and the rest 450 observations as training data.
crim (our response variable) with
all the other variables in the data.train function of caret package for
this question.control=trainControl(method = "cv",number=10)
lm.boston <- train(medv ~ ., data = trn.boston, method = 'lm', trControl = control)
print(lm.boston)
## Linear Regression
##
## 450 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 404, 405, 405, 406, 406, 404, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.743683 0.7315433 3.372396
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cv_rmse <- lm.boston$results$RMSE
cv_mse <- cv_rmse^2
cat("Cross-validated MSE for linear regression:", round(cv_mse, 4), "\n")
## Cross-validated MSE for linear regression: 22.5025
Consider two different pre-processing setups.
k.grid <- data.frame(k = seq(1, 25, by = 2))
set.seed(42)
knn_unscaled <- train(medv ~ ., data = trn.boston, method = "knn", trControl = control, tuneGrid = k.grid)
print(knn_unscaled)
## k-Nearest Neighbors
##
## 450 samples
## 12 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 404, 406, 406, 405, 404, 405, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 5.739286 0.6583236 3.876363
## 3 5.247912 0.6791403 3.594650
## 5 5.517600 0.6497725 3.831792
## 7 5.354100 0.6688601 3.780027
## 9 5.619590 0.6353906 3.923334
## 11 5.780034 0.6138758 4.001426
## 13 5.985913 0.5847458 4.119325
## 15 6.177502 0.5553526 4.263939
## 17 6.377251 0.5191853 4.421571
## 19 6.544803 0.4902447 4.531245
## 21 6.685154 0.4652833 4.609369
## 23 6.783900 0.4490671 4.667934
## 25 6.894021 0.4285970 4.733635
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
best_k_unscaled <- knn_unscaled$bestTune$k
mse_unscaled <- knn_unscaled$results$RMSE[knn_unscaled$results$k == best_k_unscaled]^2
cat("Unscaled - Best k:", best_k_unscaled, "\n")
## Unscaled - Best k: 3
cat("Unscaled - CV MSE:", round(mse_unscaled, 4), "\n")
## Unscaled - CV MSE: 27.5406
preProcess = c("center","scale") option inside the train
function.knn_scaled <- train(medv ~ ., data = trn.boston, method = "knn", trControl = control, preProcess = c("center", "scale"), tuneGrid = k.grid)
print(knn_scaled)
## k-Nearest Neighbors
##
## 450 samples
## 12 predictor
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 405, 406, 404, 406, 405, 404, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 4.454122 0.7565013 2.942764
## 3 4.472057 0.7706894 2.929714
## 5 4.684630 0.7545972 2.972817
## 7 4.711799 0.7564650 3.004763
## 9 4.792988 0.7452526 3.080773
## 11 4.733255 0.7516809 3.073481
## 13 4.751437 0.7504458 3.084301
## 15 4.770388 0.7464379 3.116690
## 17 4.805630 0.7422167 3.167369
## 19 4.869374 0.7391484 3.225850
## 21 4.911395 0.7364080 3.263665
## 23 4.974053 0.7307418 3.314144
## 25 5.035931 0.7251284 3.342815
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 1.
best_k_scaled <- knn_scaled$bestTune$k
mse_scaled <- knn_scaled$results$RMSE[knn_scaled$results$k == best_k_scaled]^2
cat("Scaled - Best k:", best_k_scaled, "\n")
## Scaled - Best k: 1
cat("Scaled - CV MSE:", round(mse_scaled, 4), "\n")
## Scaled - CV MSE: 19.8392
Which setup and k gives the lowest error?
if (mse_scaled < mse_unscaled) {
cat("The scaled setup performed better.\n")
cat("Best k (scaled):", best_k_scaled, "\n")
cat("CV MSE (scaled):", round(mse_scaled, 4), "\n")
} else {
cat("The unscaled setup performed better.\n")
cat("Best k (unscaled):", best_k_unscaled, "\n")
cat("CV MSE (unscaled):", round(mse_unscaled, 4), "\n")
}
## The scaled setup performed better.
## Best k (scaled): 1
## CV MSE (scaled): 19.8392
cat("Scaling improves kNN")
## Scaling improves kNN
train() function of the caret package:formula <-medv ~.
control=trainControl(method = "cv",number=10)
lasso<-train(formula,data=trn.boston,method = 'glmnet', trControl=control,preProc = c("center","scale"), tuneGrid = expand.grid(alpha = 1, lambda =seq(from=0,to=1,by=0.01)))
ridge<-train(formula,data=trn.boston,
method = 'glmnet',
trControl=control,preProc = c("center","scale"),
tuneGrid = expand.grid(alpha = 0, lambda =seq(from=0,to=1,by=0.01)))
Find best tuning parameter for each lasso and ridge regression, and report CV error.
cat("LASSO:\n")
## LASSO:
cat("Best lambda:", lasso$bestTune$lambda, "\n")
## Best lambda: 0.02
cat("CV MSE:", round(min(lasso$results$RMSE^2), 4), "\n\n")
## CV MSE: 22.2353
cat("RIDGE:\n")
## RIDGE:
cat("Best lambda:", ridge$bestTune$lambda, "\n")
## Best lambda: 0.64
cat("CV MSE:", round(min(ridge$results$RMSE^2), 4), "\n")
## CV MSE: 21.8323
train() function of the caret package.
Find best tuning parameter for each methods.bagging <- train(formula,data = trn.boston, method = "rf", trControl = control, tuneGrid = data.frame(mtry = ncol(trn.boston) - 1), importance = TRUE)
cat("Bagging:\n")
## Bagging:
cat("Total Number of Predictors @ split:", bagging$bestTune$mtry, "\n")
## Total Number of Predictors @ split: 12
cat("CV MSE:", round(min(bagging$results$RMSE^2), 4), "\n\n")
## CV MSE: 10.8141
rf_grid <- expand.grid(mtry = 2:10)
random_forest <- train(formula, data = trn.boston, method = "rf", trControl = control, tuneGrid = rf_grid, importance = TRUE)
cat("Random Forest:\n")
## Random Forest:
cat("Total Number of Predictors @ split:", random_forest$bestTune$mtry, "\n")
## Total Number of Predictors @ split: 4
cat("CV MSE:", round(min(random_forest$results$RMSE^2), 4), "\n\n")
## CV MSE: 9.6701
boosting <- train(formula, data = trn.boston, method = "gbm", trControl = control, verbose = FALSE, tuneGrid = expand.grid(n.trees = c(100, 200, 500), interaction.depth = c(1, 3, 5), shrinkage = c(0.01, 0.1), n.minobsinnode = 10))
best_boost <- boosting$bestTune
cat("Boosting:\n")
## Boosting:
cat("Best n Trees:", best_boost$n.trees, "\n")
## Best n Trees: 500
cat("Best Interaction Depth:", best_boost$interaction.depth, "\n")
## Best Interaction Depth: 3
cat("Best Shrinkage:", best_boost$shrinkage, "\n")
## Best Shrinkage: 0.1
cat("CV MSE:", round(min(boosting$results$RMSE^2), 4), "\n")
## CV MSE: 9.5228
cat("Boosting appears to be the best method because it has the lowest CV MSE.")
## Boosting appears to be the best method because it has the lowest CV MSE.
control <- trainControl(method = "none")
boost_model <- train(formula,data = trn.boston, method = "gbm", trControl = control,preProc = c("center", "scale"), verbose = FALSE, tuneGrid = expand.grid( n.trees = 500,interaction.depth = 3, shrinkage = 0.1, n.minobsinnode = 10))
boost_preds <- predict(boost_model, newdata = tst.boston)
test_mse <- mean((tst.boston$medv - boost_preds)^2)
cat("Test MSE for Boosting:", round(test_mse, 4), "\n")
## Test MSE for Boosting: 13.6115