Helped by 111078513

# Load the data and remove missing values
cars <- read.table("C:/R-language/BACS/auto-data.txt", header=FALSE, na.strings = "?")
names(cars) <- c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", 
                 "model_year", "origin", "car_name")
cars$car_name <- NULL
cars <- na.omit(cars)
# IMPORTANT: Shuffle the rows of data in advance for this project!
set.seed(27935752) # use your own seed, or use this one to compare to next class notes
cars <- cars[sample(1:nrow(cars)),]
# DV and IV of formulas we are interested in
cars_full <- mpg ~ cylinders + displacement + horsepower + weight + acceleration + 
  model_year + factor(origin)
cars_reduced <- mpg ~ weight + acceleration + model_year + factor(origin)
cars_full_poly2 <- mpg ~ poly(cylinders, 2) + poly(displacement, 2) + poly(horsepower, 2) + 
  poly(weight, 2) + poly(acceleration, 2) + model_year + 
  factor(origin)
cars_reduced_poly2 <- mpg ~ poly(weight, 2) + poly(acceleration,2) + model_year + 
  factor(origin)
cars_reduced_poly6 <- mpg ~ poly(weight, 6) + poly(acceleration,6) + model_year + 
  factor(origin)
# create the models
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
lm_full <- lm(cars_full, data = cars)
lm_reduced <- lm(cars_reduced, data = cars)
lm_poly2_full <- lm(cars_full_poly2, data = cars)
lm_poly2_reduced <- lm(cars_reduced_poly2, data = cars)
lm_poly6_reduced <- lm(cars_reduced_poly6, data = cars)
rt_full <- rpart(cars_full, data = cars)
rt_reduced <- rpart(cars_reduced, data = cars)

Question1) Compute and report the in-sample fitting error (MSEin) of all the models described above. It might be easier to first write a function called mse_in(…) that returns the fitting error of a single model.

mse_in <- function(model){
  mean(residuals(model)^2)
}
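# (Example) mse_in() can be applied to any single fitted model,
# e.g. mse_in(lm_full) gives the in-sample MSE of the full linear model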
# create lists of the models, their names, formulas, and fitting functions for easy looping:
modelsList <- list(lm_full, lm_reduced, lm_poly2_full, lm_poly2_reduced,
                   lm_poly6_reduced, rt_full, rt_reduced)
nameList <- c("lm(cars_full)", "lm(cars_reduced)", "lm(cars_full_poly2)",
              "lm(cars_reduced_poly2)", "lm(cars_reduced_poly6)",
              "rpart(cars_full)", "rpart(cars_reduced)")
formulaList <- list(cars_full, cars_reduced, cars_full_poly2,
                    cars_reduced_poly2, cars_reduced_poly6,
                    cars_full, cars_reduced)
functionList <- list(lm, lm, lm, lm, lm, rpart, rpart)
# report the in-sample fitting error of each model
models_MSEin <- sapply(modelsList, mse_in)
names(models_MSEin) <- nameList
models_MSEin
##          lm(cars_full)       lm(cars_reduced)    lm(cars_full_poly2) 
##              10.682122              10.971643               7.919030 
## lm(cars_reduced_poly2) lm(cars_reduced_poly6)       rpart(cars_full) 
##               8.364546               8.254377               9.155146 
##    rpart(cars_reduced) 
##               9.501344

Question2) Let’s try some simple evaluation of prediction error. Let’s work with the lm_reduced model and test its predictive performance with split-sample testing:

2-a) Split the data into 70:30 for training:test (did you remember to shuffle the data earlier?)

train_indices <- sample(1:nrow(cars), size=0.70*nrow(cars))
train_set <- cars[train_indices,]
test_set <- cars[-train_indices,]
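
A quick sanity check (not required by the question) that the split is roughly 70:30 of the 392 rows:

nrow(train_set)   # about 70% of the rows
nrow(test_set)    # the remaining ~30%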

2-b) Retrain the lm_reduced model on just the training dataset (call the new model: trained_model); Show the coefficients of the trained model.

trained_model <- lm(cars_reduced, data = train_set)
summary(trained_model)$coefficients
##                      Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)     -19.884850373 5.0923616796  -3.904839 1.193915e-04
## weight           -0.005691225 0.0003403875 -16.719839 1.790091e-43
## acceleration      0.059620087 0.0844249988   0.706190 4.806835e-01
## model_year        0.772666424 0.0626661183  12.329891 5.706636e-28
## factor(origin)2   2.218726662 0.6636237064   3.343351 9.457739e-04
## factor(origin)3   2.148123177 0.6416118917   3.348010 9.306470e-04

2-c) Use the trained_model model to predict the mpg of the test dataset

2-c-i) What is the in-sample mean-square fitting error (MSEin) of the trained model?

mean(residuals(trained_model)^2)
## [1] 12.1074

2-c-ii) What is the out-of-sample mean-square prediction error (MSEout) of the test dataset?

pred <- predict(trained_model, test_set)
mean((pred - test_set$mpg)^2)
## [1] 8.505918

2-d) Show a data frame of the test set’s actual mpg values, the predicted mpg values, and the difference of the two (the difference being the out-of-sample prediction error); just show us the first several rows of this dataframe.

mpg_difference <- data.frame("Actual" = test_set$mpg, "Predicted" = pred, "Difference" = test_set$mpg - pred)
head(mpg_difference)
##     Actual Predicted  Difference
## 372     29  30.05738 -1.05737552
## 214     13  16.47532 -3.47532289
## 81      22  23.07057 -1.07057048
## 384     38  35.33296  2.66703558
## 85      27  26.92741  0.07258502
## 103     26  28.89266 -2.89265898

Question3) Let’s use k-fold cross validation (k-fold CV) to see how all these models perform predictively!

3-a) Write a function that performs k-fold cross-validation (see class notes and ask us online for hints!). Name your function k_fold_mse(model, dataset, k=10, …) – it should return the MSEout of the operation. Your function must accept a model, dataset and number of folds (k) but can also have whatever other parameters you wish.

# Calculate MSEout across all k folds
k_fold_mse <- function(dataset, k = 10, formula, func) {
  fold_pred_errors <- sapply(1:k, \(i) {
    fold_i_pe(i, k, dataset, nrow(dataset), formula, func)
  })
  pred_errors <- unlist(fold_pred_errors)
  mean(pred_errors^2)
}

# Calculate the prediction errors for fold i out of k
fold_i_pe <- function(i, k, dataset, n, formula, func) {
  folds <- cut(1:n, breaks = k, labels = FALSE)
  test_indices <- which(folds == i)
  test_set <- dataset[test_indices, ]
  train_set <- dataset[-test_indices, ]
  trained_model <- func(formula, data = train_set)
  predictions <- predict(trained_model, test_set)
  test_set$mpg - predictions
}
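
Note that fold_i_pe() assigns contiguous blocks of rows to each fold, which is why the data were shuffled at the very start. As a quick sketch (using the cars data and the cars_reduced formula defined earlier), the function can be tried on a single model:

# 10-fold CV prediction error (MSEout) of the reduced linear model alone
k_fold_mse(cars, k = 10, formula = cars_reduced, func = lm)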

3-a-i) Use your k_fold_mse function to find and report the 10-fold CV MSEout for all models.

models_MSEout <- sapply(1:7, \(i) k_fold_mse(cars, k = 10, formula = formulaList[[i]], func = functionList[[i]]))
names(models_MSEout) <- nameList
models_MSEout
##          lm(cars_full)       lm(cars_reduced)    lm(cars_full_poly2) 
##              11.262460              11.415855               8.599373 
## lm(cars_reduced_poly2) lm(cars_reduced_poly6)       rpart(cars_full) 
##               8.818607               9.267369              13.342221 
##    rpart(cars_reduced) 
##              13.476272

3-a-ii) For all the models, which is bigger — the fit error (MSEin) or the prediction error (MSEout)?

MSE_errors <- data.frame(method = nameList, fit_error = models_MSEin,
                     pred_error = models_MSEout)
MSE_errors[, c("fit_error", "pred_error")]
##                        fit_error pred_error
## lm(cars_full)          10.682122  11.262460
## lm(cars_reduced)       10.971643  11.415855
## lm(cars_full_poly2)     7.919030   8.599373
## lm(cars_reduced_poly2)  8.364546   8.818607
## lm(cars_reduced_poly6)  8.254377   9.267369
## rpart(cars_full)        9.155146  13.342221
## rpart(cars_reduced)     9.501344  13.476272

According to the results, the prediction error (MSEout) is larger than the fitting error (MSEin) for every model.
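
This can be double-checked directly from the MSE_errors data frame built above, for example:

# TRUE when the prediction error exceeds the fitting error for every model
all(MSE_errors$pred_error > MSE_errors$fit_error)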

3-a-iii) Does the 10-fold MSEout of a model remain stable (same value) if you re-estimate it over and over again, or does it vary?

rep_kfold <- sapply(1:7, \(i){
  mean(replicate(10, k_fold_mse(cars[sample(1:nrow(cars)),],
                                k = 10, formula = formulaList[[i]], func = functionList[[i]])))
})
MSE_errors$"10 times MSE_out" = rep_kfold
MSE_errors[,c(3,4)]
##                        pred_error 10 times MSE_out
## lm(cars_full)           11.262460        11.296352
## lm(cars_reduced)        11.415855        11.388517
## lm(cars_full_poly2)      8.599373         8.614284
## lm(cars_reduced_poly2)   8.818607         8.867121
## lm(cars_reduced_poly6)   9.267369         9.275757
## rpart(cars_full)        13.342221        13.025962
## rpart(cars_reduced)     13.476272        12.762983

Averaged over 10 repetitions, the MSEout values are very close to the single-run results above, so the 10-fold estimate appears to remain reasonably stable (individual runs vary a little, since the folds come from a new random shuffle each time).
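
To see the run-to-run variation before averaging, one can also look at a few individual repetitions for a single model (a sketch with the reduced linear model; each run reshuffles the rows, so the values can differ slightly):

# Five raw 10-fold MSEout estimates for lm(cars_reduced)
replicate(5, k_fold_mse(cars[sample(1:nrow(cars)), ],
                        k = 10, formula = cars_reduced, func = lm))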

3-b) Make sure your k_fold_mse() function can accept as many folds as there are rows (i.e., k=392).

3-b-i) How many rows are in the training dataset and test dataset of each iteration of k-fold CV when k=392?

n <- nrow(cars)
k <- 392
folds <- cut(1:n, breaks = k, labels = FALSE)
test_indices <- which(folds == 5)   # take fold 5 as an example
train_set <- cars[-test_indices, ]
print(paste("Rows in train_set:", nrow(train_set)))
## [1] "Rows in train_set: 391"
test_set <- cars[test_indices, ]
print(paste("Rows in test_set:", nrow(test_set)))
## [1] "Rows in test_set: 1"

3-b-ii) Report the k-fold CV MSEout for all models using k=392.

MSE_kfold_392 <- sapply(1:7, \(i){k_fold_mse(cars, k = 392,
                                       formula = formulaList[[i]],
                                       func = functionList[[i]])})

names(MSE_kfold_392) <- nameList
MSE_kfold_392
##          lm(cars_full)       lm(cars_reduced)    lm(cars_full_poly2) 
##              11.293439              11.380040               8.610385 
## lm(cars_reduced_poly2) lm(cars_reduced_poly6)       rpart(cars_full) 
##               8.787013               9.177932              12.769791 
##    rpart(cars_reduced) 
##              13.145150

3-b-iii) When k=392, does the MSEout of a model remain stable (same value) if you re-estimate it over and over again, or does it vary? (show a few repetitions for any model and decide!)

rep_kfold_392 <- sapply(1:7, \(i){
  mean(replicate(10, k_fold_mse(cars[sample(1:nrow(cars)),],
                                k = 392, formula = formulaList[[i]],
                                func = functionList[[i]])))})

names(rep_kfold_392) <- nameList
rep_kfold_392
##          lm(cars_full)       lm(cars_reduced)    lm(cars_full_poly2) 
##              11.293439              11.380040               8.610385 
## lm(cars_reduced_poly2) lm(cars_reduced_poly6)       rpart(cars_full) 
##               8.787013               9.177932              12.769791 
##    rpart(cars_reduced) 
##              13.145150

With k = 392 (leave-one-out CV), every row is used as the test case exactly once, so the estimate does not depend on how the rows are shuffled: the MSEout values are identical to the single-run values above and remain stable no matter how many times they are re-estimated.
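
A few unaveraged repetitions for a single model confirm this (a sketch with the reduced linear model): with k equal to the number of rows, every observation is left out exactly once, so reshuffling the rows cannot change the result.

# Three leave-one-out estimates for lm(cars_reduced); they should all be identical
replicate(3, k_fold_mse(cars[sample(1:nrow(cars)), ],
                        k = 392, formula = cars_reduced, func = lm))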

3-b-iv) Looking at the fit error (MSEin) and prediction error (MSEout; k=392) of the full models versus their reduced counterparts (with the same training technique), does multicollinearity present in the full models seem to hurt their fit error and/or prediction error?

MSE_errors$"10 times MSE_out_392" <- rep_kfold_392
MSE_errors$"MSE_out_392" <- MSE_kfold_392
MSE_errors[c(1,2,6,7),c("fit_error","10 times MSE_out_392")]
##                     fit_error 10 times MSE_out_392
## lm(cars_full)       10.682122             11.29344
## lm(cars_reduced)    10.971643             11.38004
## rpart(cars_full)     9.155146             12.76979
## rpart(cars_reduced)  9.501344             13.14515

In every case the k=392 prediction error (MSEout) is higher than the fitting error (MSEin). However, the full models perform no noticeably worse than their reduced counterparts, for either lm() or rpart(). Therefore, the multicollinearity present in the full models does not appear to hurt their fit or their predictions.
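
As a side check (not part of the original analysis, and assuming the car package is installed), the degree of multicollinearity in the full model could be quantified with variance inflation factors:

# Generalized VIFs for the full linear model; large values flag collinear predictors
car::vif(lm_full)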

3-b-v) Look at the fit error and prediction error (k=392) of the reduced quadratic versus 6th order polynomial regressions — did adding more higher-order terms hurt the fit and/or predictions?

MSE_errors[c(4, 5), c("fit_error", "10 times MSE_out_392")]
##                        fit_error 10 times MSE_out_392
## lm(cars_reduced_poly2)  8.364546             8.787013
## lm(cars_reduced_poly6)  8.254377             9.177932

As the results show, the 6th-order polynomial regression has a slightly lower fitting error than the 2nd-order (quadratic) regression, but its prediction error (the k=392 MSEout) is higher. In other words, adding the higher-order terms did not hurt the fit, but it did hurt the predictions, which suggests the extra terms slightly overfit the data.