Helped by 111078513
insurance <- read.csv("C:/R-language/BACS/insurance.csv")
Question 1) Create some explanatory models to learn more about
charges:
According to the results, age, bmi, children, smoker = yes (compared to smoker = no), and
the regions southeast and southwest (compared to the baseline region, northeast) are
significantly related to charges.
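The OLS regression referred to later as insu_lm is not shown in this section; a minimal
sketch, assuming charges is regressed on all of the other variables, would be:
insu_lm <- lm(charges ~ age + sex + bmi + children + smoker + region, data = insurance) # assumed specification
summary(insu_lm)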
1-b) Create a decision tree (specifically, a regression tree) with
default parameters to rpart().
1-b-i) Plot a visual representation of the tree structure
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
insu_rpart <- rpart(charges ~ age + sex + bmi + children + smoker + region, data = insurance)
rpart.plot(insu_rpart)

1-b-ii) How deep is the tree (see nodes with “decisions” – ignore
the leaves at the bottom)
According to the plot, the depth of the tree is two.
1-b-iii) How many leaf groups does it suggest to bin the data
into?
According to the plot, it suggests binning the data into four leaf groups.
1-b-iv) What conditions (combination of decisions) describe each
leaf group?
rules <- rpart.rules(insu_rpart)
print(rules)
## charges
## 5399 when smoker is no & age < 43
## 12300 when smoker is no & age >= 43
## 21369 when smoker is yes & bmi < 30
## 41693 when smoker is yes & bmi >= 30
Question 3) Let’s see if bagging helps our models
For bagging and boosting, we will only use split-sample testing to
save time: partition the data to create training and test sets using an
80:20 split. Use the regression model and decision tree you created in
Question 1 for bagging and boosting.
shuffled_indicies <- sample(1:nrow(insurance))[1:(nrow(insurance)*0.8)]
train_set <- insurance[shuffled_indicies, ]
test_set <- insurance[-shuffled_indicies, ]
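Note that the split is random, so the RMSE values reported below vary from run to run; for
reproducibility a seed could be set before sampling (the seed value here is arbitrary):
set.seed(42) # arbitrary seed, used only to make the random split reproducible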
3-a) Implement the bagged_learn(…) and bagged_predict(…) functions
using the hints in the class notes and help from your classmates on
Teams. Feel free to share your code on Teams to get feedback, or ask
others for help.
bagged_learn <- function(model, dataset, b = 100) {
  lapply(1:b, \(i) {
    n <- nrow(dataset)
    boot_set <- dataset[sample(1:n, n, replace = TRUE), ]  # bootstrap resample of the training data
    update(model, data = boot_set)                         # refit the model on the resample
  })
}
bagged_predict <- function(bagged_models, new_data) {
  predictions <- lapply(bagged_models, \(model) {
    predict(model, new_data)                               # b prediction vectors for new_data
  })
  # average the b predictions for each observation (row-wise mean);
  # rowMeans(as.data.frame(predictions)) would be equivalent
  as.data.frame(predictions) |> apply(X = _, MARGIN = 1, FUN = mean)
}
RMSE_out <- function(actuals, preds) {
  sqrt(mean((actuals - preds)^2))
}
3-b) What is the RMSEout for the bagged OLS regression?
RMSE_out_baglm <- bagged_learn(insu_lm, train_set) |> bagged_predict(test_set) |>
RMSE_out(test_set$charges,preds = _)
cat("The RMSEout for the bagged OLS regression",RMSE_out_baglm)
## The RMSEout for the bagged OLS regression 5558.484
3-c) What is the RMSEout for the bagged decision tree?
RMSE_out_bagrpart <- bagged_learn(insu_rpart, train_set) |> bagged_predict(test_set) |>
RMSE_out(test_set$charges,preds = _)
cat("The RMSEout for the bagged OLS regression",RMSE_out_bagrpart)
## The RMSEout for the bagged OLS regression 4451.156
Question 4) Let’s see if boosting helps our models. You can use a
learning rate of 0.1 and adjust it if you find a better rate.
4-a) Write boosted_learn(…) and boosted_predict(…) functions using
the hints in the class notes and help from your classmates on
Teams.
boost_learn <- function(model, dataset, outcome, n = 100, rate = 0.1) {
  predictors <- dataset[, !(colnames(dataset) == outcome)]  # data frame of only the predictor variables
  # Initialize residuals and models
  res <- dataset[, outcome]   # set res to the vector of actuals (y) to start
  models <- list()
  for (i in 1:n) {
    # refit the base model on the current residuals
    # (NOTE: the outcome column is hard-coded as "charges" here)
    this_model <- update(model, data = cbind(predictors, charges = res))
    res <- res - rate * predict(this_model, predictors)  # update residuals, scaled by the learning rate
    models[[i]] <- this_model  # store the model
  }
  list(models = models, rate = rate)
}
boost_predict <- function(boosted_learning, new_data) {
  boosted_models <- boosted_learning$models
  rate <- boosted_learning$rate
  # get predictions of new_data from each boosted model
  predictions <- lapply(boosted_models, \(model) predict(model, new_data))
  pred_frame <- as.data.frame(predictions) |> unname()
  # sum each observation's predictions (row-wise), weighted by the learning rate
  apply(pred_frame, 1, \(preds) rate * sum(preds))
}
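Since the prompt allows adjusting the learning rate, other rates can be passed through
boost_learn(); a minimal sketch with an arbitrary alternative rate of 0.05 (not part of the
original analysis):
# Sketch: alternative learning rate (0.05 is arbitrary, for illustration only)
boost_learn(insu_rpart, train_set, outcome = "charges", n = 1000, rate = 0.05) |>
  boost_predict(test_set) |>
  RMSE_out(test_set$charges, preds = _)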
4-b) What is the RMSEout for the boosted OLS regression?
RMSE_out_bootlm <- boost_learn(insu_lm, train_set, outcome="charges", n=1000) |>
boost_predict(test_set) |>
RMSE_out(test_set$charges, preds = _)
cat("The RMSEout for the boosted OLS regression is",RMSE_out_bootlm)
## The RMSEout for the boosted OLS regression is 5561.18
4-c) What is the RMSEout for the boosted decision tree?
RMSE_out_bootrpart <- boost_learn(insu_rpart, train_set, outcome = "charges", n=1000) |>
boost_predict(test_set) |>
RMSE_out(test_set$charges, preds = _)
cat("The RMSEout for the boosted decision tree is",RMSE_out_bootrpart)
## The RMSEout for the boosted decision tree is 4103.057
Question 5) Let’s engineer the best predictive decision trees. Let’s
repeat the bagging and boosting decision tree several times to see what
kind of base tree helps us learn the fastest. But this time, split the
data 70:20:10 — use 70% for training, 20% for fine-tuning, and use the
last 10% to report the final RMSEout.
shuffled_indicies <- sample(1:nrow(insurance))
n <- nrow(insurance)
train_set  <- insurance[shuffled_indicies[1:floor(n*0.7)], ]
tuning_set <- insurance[shuffled_indicies[(floor(n*0.7) + 1):floor(n*0.9)], ]
test_set   <- insurance[shuffled_indicies[(floor(n*0.9) + 1):n], ]
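A quick sanity check (sketch) that the three partitions together cover every row:
# partition sizes; these should sum to nrow(insurance)
c(train = nrow(train_set), tune = nrow(tuning_set), test = nrow(test_set))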
5-a) Repeat the bagging of the decision tree, using a base tree of
maximum depth 1, 2, … n, keep training on the 70% training set while the
RMSEout of your 20% set keeps dropping; stop when the RMSEout has
started increasing again (show prediction error at each depth). Report
the final RMSEout using the final 10% of the data as your test set.
repeat_bagging <- function(model, train_set, tuning_set) {
  # Start with a base tree of maximum depth 1
  this_model <- update(model, control = rpart.control(maxdepth = 1))
  rmse_out_rep <- bagged_learn(this_model, train_set, b = 100) |>
    bagged_predict(tuning_set) |>
    RMSE_out(tuning_set$charges, preds = _)
  k <- 1
  print(paste("--- k =", k, "---"))
  print(rmse_out_rep)
  while (TRUE) {
    k <- k + 1
    print(paste("--- k =", k, "---"))
    # Build the update() call as text so the literal value of k is embedded in the stored
    # rpart call (rather than the symbol k, which would not be in scope when bagged_learn()
    # later re-updates the model on each bootstrap resample)
    control_text <- paste("update(this_model, control = rpart.control(maxdepth =", k, "))")
    temp_model <- eval(parse(text = control_text))
    rmse_out_temp <- bagged_learn(temp_model, train_set, b = 100) |>
      bagged_predict(tuning_set) |>
      RMSE_out(tuning_set$charges, preds = _)
    # Stop once the tuning-set RMSE starts increasing again
    if (rmse_out_temp > rmse_out_rep) {
      print(paste(rmse_out_temp, "(Stop)"))
      return(list(model = this_model, k = k - 1))
    }
    rmse_out_rep <- rmse_out_temp
    this_model <- temp_model
    print(rmse_out_rep)
  }
}
best_tree_bagging <- repeat_bagging(insu_rpart, train_set, tuning_set)
## [1] "--- k = 1 ---"
## [1] 7754.462
## [1] "--- k = 2 ---"
## [1] 5127.367
## [1] "--- k = 3 ---"
## [1] 4874.015
## [1] "--- k = 4 ---"
## [1] 4817.473
## [1] "--- k = 5 ---"
## [1] "4859.24071940965 (Stop)"
RMSE_out_repeat_bagged <- bagged_learn(best_tree_bagging$model, train_set, b=100) |>
bagged_predict(test_set) |>
RMSE_out(test_set$charges, preds = _)
print(paste("The RMSE_out for the Repeat Bagged decision tree is ",
round(RMSE_out_repeat_bagged, 2),
", where max depth = ", best_tree_bagging$k,
sep = ""))
## [1] "The RMSE_out for the Repeat Bagged decision tree is 5237.44, where max depth = 4"
5-b) Repeat the boosting of the decision tree, using a base tree of
maximum depth 1, 2, … n, keep training on the 70% training set while the
RMSEout of your 20% set keeps dropping; stop when the RMSEout has
started increasing again (show prediction error at each depth). Report
the final RMSEout using the final 10% of the data as your test set.
repeat_boosting <- function(model, train_set, tuning_set) {
  # Start with a base tree of maximum depth 1
  this_model <- update(model, control = rpart.control(maxdepth = 1))
  rmse_out_rep <- boost_learn(this_model, train_set, outcome = "charges", n = 1000) |>
    boost_predict(tuning_set) |>
    RMSE_out(tuning_set$charges, preds = _)
  k <- 1
  print(paste("--- k =", k, "---"))
  print(rmse_out_rep)
  while (TRUE) {
    k <- k + 1
    print(paste("--- k =", k, "---"))
    # As in repeat_bagging(), embed the literal value of k in the stored rpart call
    control_text <- paste("update(this_model, control = rpart.control(maxdepth =", k, "))")
    temp_model <- eval(parse(text = control_text))
    rmse_out_temp <- boost_learn(temp_model, train_set, outcome = "charges", n = 1000) |>
      boost_predict(tuning_set) |>
      RMSE_out(tuning_set$charges, preds = _)
    # Stop once the tuning-set RMSE stops improving
    if (rmse_out_temp >= rmse_out_rep) {
      print(paste(rmse_out_temp, "(Stop)"))
      return(list(model = this_model, k = k - 1))
    }
    rmse_out_rep <- rmse_out_temp
    this_model <- temp_model
    print(rmse_out_rep)
  }
}
best_tree_boosting <- repeat_boosting(insu_rpart, train_set, tuning_set)
## [1] "--- k = 1 ---"
## [1] 6413.797
## [1] "--- k = 2 ---"
## [1] 4578.489
## [1] "--- k = 3 ---"
## [1] 4488.325
## [1] "--- k = 4 ---"
## [1] "4499.68781636166 (Stop)"
RMSE_out_repeat_boost <-
boost_learn(best_tree_boosting$model, train_set, outcome="charges", n=1000) |>
boost_predict(test_set) |>
RMSE_out(test_set$charges, preds = _)
print(paste("The RMSE_out for the Repeat Boosted decision tree is ",
round(RMSE_out_repeat_boost, 2),
", where max depth = ", best_tree_boosting$k, sep = ""))
## [1] "The RMSE_out for the Repeat Boosted decision tree is 5075.44, where max depth = 3"