Helped by 111078513
# Load the data and remove missing values
cars <- read.table("C:/R-language/BACS/auto-data.txt", header=FALSE, na.strings = "?")
names(cars) <- c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration",
"model_year", "origin", "car_name")
cars$car_name <- NULL
cars <- na.omit(cars)
# IMPORTANT: Shuffle the rows of data in advance for this project!
set.seed(27935752) # use your own seed, or use this one to compare to next class notes
cars <- cars[sample(1:nrow(cars)),]
# DV and IVs of the model formulas we are interested in
cars_full <- mpg ~ cylinders + displacement + horsepower + weight + acceleration +
model_year + factor(origin)
cars_reduced <- mpg ~ weight + acceleration + model_year + factor(origin)
cars_full_poly2 <- mpg ~ poly(cylinders, 2) + poly(displacement, 2) + poly(horsepower, 2) +
poly(weight, 2) + poly(acceleration, 2) + model_year +
factor(origin)
cars_reduced_poly2 <- mpg ~ poly(weight, 2) + poly(acceleration,2) + model_year +
factor(origin)
cars_reduced_poly6 <- mpg ~ poly(weight, 6) + poly(acceleration,6) + model_year +
factor(origin)
# create the models
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
lm_full <- lm(cars_full, data = cars)
lm_reduced <- lm(cars_reduced, data = cars)
lm_poly2_full <- lm(cars_full_poly2, data = cars)
lm_poly2_reduced <- lm(cars_reduced_poly2, data = cars)
lm_poly6_reduced <- lm(cars_reduced_poly6, data = cars)
rt_full <- rpart(cars_full, data = cars)
rt_reduced <- rpart(cars_reduced, data = cars)
Question1) Compute and report the in-sample fitting error (MSEin) of
all the models described above. It might be easier to first write a
function called mse_in(…) that returns the fitting error of a single
model.
# In-sample (fit) MSE: the mean of a model's squared residuals
mse_in <- function(model){
  mean(residuals(model)^2)
}
# Create parallel lists of models, names, formulas, and training functions:
modelsList <- list(lm_full, lm_reduced, lm_poly2_full, lm_poly2_reduced,
                   lm_poly6_reduced, rt_full, rt_reduced)
nameList <- c("lm(cars_full)", "lm(cars_reduced)", "lm(cars_full_poly2)",
              "lm(cars_reduced_poly2)", "lm(cars_reduced_poly6)",
              "rpart(cars_full)", "rpart(cars_reduced)")
formulaList <- list(cars_full, cars_reduced, cars_full_poly2,
                    cars_reduced_poly2, cars_reduced_poly6,
                    cars_full, cars_reduced)
functionList <- list(lm, lm, lm, lm, lm, rpart, rpart)
# Report the in-sample fitting error of every model
models_MSEin <- sapply(modelsList, mse_in)
names(models_MSEin) <- nameList
models_MSEin
## lm(cars_full) lm(cars_reduced) lm(cars_full_poly2)
## 10.682122 10.971643 7.919030
## lm(cars_reduced_poly2) lm(cars_reduced_poly6) rpart(cars_full)
## 8.364546 8.254377 9.155146
## rpart(cars_reduced)
## 9.501344
Question3) Let’s use k-fold cross validation (k-fold CV) to see how
all these models perform predictively!
3-a-i) Use your k_fold_mse function to find and report the 10-fold
CV MSEout for all models.
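The k_fold_mse() function itself was written for an earlier question that is not shown in this section. A minimal sketch consistent with how it is called below (arguments: the dataset, k, a formula, and a training function such as lm or rpart) might look like this; the actual definition may differ in its details:
# Hypothetical sketch of k_fold_mse() (the real definition is in the
# omitted question): split the rows into k contiguous folds, train on the
# other k-1 folds with `func`, and average the per-fold prediction MSE
k_fold_mse <- function(dataset, k, formula, func) {
  folds <- cut(1:nrow(dataset), breaks = k, labels = FALSE)
  dv <- all.vars(formula)[1]   # name of the outcome variable (here: mpg)
  fold_mse <- sapply(1:k, \(i) {
    test <- which(folds == i)
    trained <- func(formula, data = dataset[-test, ])
    pred <- predict(trained, dataset[test, ])
    mean((dataset[test, dv] - pred)^2)
  })
  mean(fold_mse)
}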
models_MSEout <- sapply(1:7, \(i){
  k_fold_mse(cars, k = 10, formula = formulaList[[i]], func = functionList[[i]])
})
names(models_MSEout) <- nameList
models_MSEout
## lm(cars_full) lm(cars_reduced) lm(cars_full_poly2)
## 11.262460 11.415855 8.599373
## lm(cars_reduced_poly2) lm(cars_reduced_poly6) rpart(cars_full)
## 8.818607 9.267369 13.342221
## rpart(cars_reduced)
## 13.476272
3-a-ii) For all the models, which is bigger — the fit error (MSEin)
or the prediction error (MSEout)?
MSE_errors <- data.frame(method = nameList, fit_error = models_MSEin,
pred_error = models_MSEout)
MSE_errors[, c("fit_error", "pred_error")]
## fit_error pred_error
## lm(cars_full) 10.682122 11.262460
## lm(cars_reduced) 10.971643 11.415855
## lm(cars_full_poly2) 7.919030 8.599373
## lm(cars_reduced_poly2) 8.364546 8.818607
## lm(cars_reduced_poly6) 8.254377 9.267369
## rpart(cars_full) 9.155146 13.342221
## rpart(cars_reduced) 9.501344 13.476272
For every model, the prediction error (MSEout) is larger than the fit error (MSEin). This is expected: each model is evaluated on data it was not trained on, so it cannot exploit chance patterns in the training sample.
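This can be checked programmatically from the two vectors computed above:
# TRUE, since every model's prediction error exceeds its fit error
all(models_MSEout > models_MSEin)
## [1] TRUE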
3-a-iii) Does the 10-fold MSEout of a model remain stable (same
value) if you re-estimate it over and over again, or does it vary?
# Re-estimate the 10-fold MSEout 10 times, reshuffling the rows before
# each run, and average the 10 estimates per model
rep_kfold <- sapply(1:7, \(i){
  mean(replicate(10, k_fold_mse(cars[sample(1:nrow(cars)),],
                                k = 10, formula = formulaList[[i]],
                                func = functionList[[i]])))
})
MSE_errors$"10 times MSE_out" = rep_kfold
MSE_errors[,c(3,4)]
## pred_error 10 times MSE_out
## lm(cars_full) 11.262460 11.296352
## lm(cars_reduced) 11.415855 11.388517
## lm(cars_full_poly2) 8.599373 8.614284
## lm(cars_reduced_poly2) 8.818607 8.867121
## lm(cars_reduced_poly6) 9.267369 9.275757
## rpart(cars_full) 13.342221 13.025962
## rpart(cars_reduced) 13.476272 12.762983
The averages of 10 repeated estimates are close to the single-run results above, so the 10-fold MSEout is stable: individual runs vary slightly because the folds are assigned at random, but not by much.
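To quantify that variation, one could also look at the spread across replications rather than only the mean; a small sketch for one model (the exact value will differ from run to run):
# Standard deviation of 10 reshuffled 10-fold CV estimates for the full
# linear model; a value small relative to the mean indicates stability
reps <- replicate(10, k_fold_mse(cars[sample(1:nrow(cars)), ], k = 10,
                                 formula = cars_full, func = lm))
sd(reps)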
3-b) Make sure your k_fold_mse() function can accept as many folds
as there are rows (i.e., k=392).
3-b-i) How many rows are in the training dataset and test dataset of
each iteration of k-fold CV when k=392?
n <- nrow(cars)
k <- 392
folds <- cut(1:n, breaks = k, labels = FALSE)
test_indices <- which(folds == 5)   # e.g., the 5th fold
train_set <- cars[-test_indices, ]
print(paste("Rows in train_set:", nrow(train_set)))
## [1] "Rows in train_set: 391"
test_set <- cars[test_indices, ]
print(paste("Rows in test_set:", nrow(test_set)))
## [1] "Rows in test_set: 1"
3-b-ii) Report the k-fold CV MSEout for all models using k=392.
MSE_kfold_392 <- sapply(1:7, \(i){k_fold_mse(cars, k = 392,
formula = formulaList[[i]],
func = functionList[[i]])})
names(MSE_kfold_392) <- nameList
MSE_kfold_392
## lm(cars_full) lm(cars_reduced) lm(cars_full_poly2)
## 11.293439 11.380040 8.610385
## lm(cars_reduced_poly2) lm(cars_reduced_poly6) rpart(cars_full)
## 8.787013 9.177932 12.769791
## rpart(cars_reduced)
## 13.145150
3-b-iii) When k=392, does the MSEout of a model remain stable (same
value) if you re-estimate it over and over again, or does it vary? (show
a few repetitions for any model and decide!)
# Repeat leave-one-out CV (k = 392) 10 times with reshuffled rows
rep_kfold_392 <- sapply(1:7, \(i){
  mean(replicate(10, k_fold_mse(cars[sample(1:nrow(cars)),],
                                k = 392, formula = formulaList[[i]],
                                func = functionList[[i]])))})
names(rep_kfold_392) <- nameList
rep_kfold_392
## lm(cars_full) lm(cars_reduced) lm(cars_full_poly2)
## 11.293439 11.380040 8.610385
## lm(cars_reduced_poly2) lm(cars_reduced_poly6) rpart(cars_full)
## 8.787013 9.177932 12.769791
## rpart(cars_reduced)
## 13.145150
The repeated estimates are identical to the single-run values above, even though the rows were reshuffled each time: with k = 392 every row is its own test fold, so the train/test splits do not depend on row order and leave-one-out CV is deterministic.
3-b-iv) Looking at the fit error (MSEin) and prediction error
(MSEout; k=392) of the full models versus their reduced counterparts
(with the same training technique), does multicollinearity present in
the full models seem to hurt their fit error and/or prediction
error?
MSE_errors$"10 times MSE_out_392" <- rep_kfold_392
MSE_errors$"MSE_out_392" <- MSE_kfold_392
MSE_errors[c(1,2,6,7),c("fit_error","10 times MSE_out_392")]
## fit_error 10 times MSE_out_392
## lm(cars_full) 10.682122 11.29344
## lm(cars_reduced) 10.971643 11.38004
## rpart(cars_full) 9.155146 12.76979
## rpart(cars_reduced) 9.501344 13.14515
As expected, the repeated MSEout with k = 392 is higher than the fit error (MSEin) for every model. More to the point, comparing each full model with its reduced counterpart (lm vs. lm, rpart vs. rpart), the full models' fit and prediction errors are slightly lower, not higher. So the multicollinearity present in the full models does not appear to hurt either their fit error or their prediction error here.
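As a complementary check (not part of the original output), variance inflation factors would quantify the multicollinearity in the full linear model; this assumes the car package is installed:
# VIFs for the full model; values well above 10 (e.g., for cylinders,
# displacement, and weight, which are strongly correlated) would confirm
# severe multicollinearity among these predictors
library(car)
vif(lm_full)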
3-b-v) Look at the fit error and prediction error (k=392) of the
reduced quadratic versus 6th order polynomial regressions — did adding
more higher-order terms hurt the fit and/or predictions?
MSE_errors[c(4, 5), c("fit_error", "10 times MSE_out_392")]
## fit_error 10 times MSE_out_392
## lm(cars_reduced_poly2) 8.364546 8.787013
## lm(cars_reduced_poly6) 8.254377 9.177932
The 6th-order polynomial has a slightly lower fit error than the quadratic (8.25 vs. 8.36), but a clearly higher prediction error (9.18 vs. 8.79). In other words, the extra higher-order terms buy a marginal in-sample improvement at the cost of worse out-of-sample predictions, which is the signature of overfitting.
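Since the quadratic model is nested in the 6th-order one, a nested-model F-test gives a complementary in-sample view of whether the extra terms are worth their degrees of freedom:
# F-test of the extra higher-order terms; a non-significant result would
# agree that they add little beyond the quadratic fit
anova(lm_poly2_reduced, lm_poly6_reduced)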