Reading In Necessary Functions and Data

# Functions used

docv =
function (x, y, set, predfun, loss, nfold = 10, doran = TRUE, 
          verbose = TRUE, ...) 
{
  if (!(is.matrix(x) | is.data.frame(x))) {
    cat("error in docv: x is not a matrix or data frame\n")
    return(0)
  }
  if (!(is.vector(y))) {
    cat("error in docv: y is not a vector\n")
    return(0)
  }
  if (!(length(y) == nrow(x))) {
    cat("error in docv: length(y) != nrow(x)\n")
    return(0)
  }
  nset = nrow(set)
  n = length(y)
  if (n == nfold) 
    doran = FALSE
  cat("in docv: nset,n,nfold: ", nset, n, nfold, "\n")
  lossv = rep(0, nset)
  if (doran) {
    ii = sample(1:n, n)
    y = y[ii]
    x = x[ii, , drop = FALSE]
  }
  fs = round(n/nfold)
  for (i in 1:nfold) {
    bot = (i - 1) * fs + 1
    top = ifelse(i == nfold, n, i * fs)
    ii = bot:top
    if (verbose) 
      cat("on fold: ", i, ", range: ", bot, ":", top, "\n")
    xin = x[-ii, , drop = FALSE]
    yin = y[-ii]
    xout = x[ii, , drop = FALSE]
    yout = y[ii]
    for (k in 1:nset) {
      yhat = predfun(xin, yin, xout, set[k, ], ...)
      lossv[k] = lossv[k] + loss(yout, yhat)
    }
  }
  return(lossv)
}


docvknn =
 function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE) 
   {
     return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold, 
         doran = doran, verbose = verbose))
 }

# Code from the Notes
rmse = function(y,yhat) {return(sqrt(mean((y-yhat)^2)))}
mabe = function(y,yhat) {return(mean(abs(y-yhat)))}

cd = read.csv("http://www.rob-mcculloch.org/data/susedcars.csv")

cd = cd[,c(1,4,6)]
names(cd)
## [1] "price"   "mileage" "color"
cd$color = as.factor(cd$color)
cd$mileage = cd$mileage/1000
cd$price = cd$price/1000

Use out of sample performance (a train/test split) to decide which of these two models is best:

linear model of log(y) on mileage and color

linear model of y on mileage, mileage squared, and color

Use out of sample rmse and graphics to compare the two models

n = nrow(cd)
set.seed(99)
pin = .75 #percent train (or percent in-sample)
ii = sample(1:n,floor(pin*n))
cdtr = cd[ii,]
cdte = cd[-ii,]
cat("dimension of train data:",dim(cdtr),"\n")
## dimension of train data: 750 3
cat("dimension of test data:",dim(cdte),"\n")
## dimension of test data: 250 3
lm3var = lm(price~mileage+color,cdtr)
yhtest3var = predict(lm3var,cdte)

cat("out of sample rmse for three varaible linaer model: ",rmse(cdte$price,yhtest3var),"\n") #root mean squared error
## out of sample rmse for three varaible linaer model:  10.62553
lms = range(c(cd$price,yhtest3var))
library(ggplot2)

p <- ggplot(data.frame(yhtest3var, price = cdte$price), aes(x = yhtest3var, y = price)) + 
  geom_point() +
  xlim(lms) + ylim(lms) + 
  geom_abline(intercept = 0, slope = 1, colour = 'red') +
  labs(
    title = "3 Variable Linear Model Price Prediction vs Actual Price",
    subtitle = paste("out of sample rmse for three varaible linaer model: ",rmse(cdte$price,yhtest3var)),
    x = "Price  Prediction",
    y = "Actual Price"
  ) +
  theme_minimal() +  # Clean background
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),  # Centered, bold, larger title
    plot.subtitle = element_text(hjust = 0.5, size = 14, color = "black"),  # Centered, smaller, gray subtitle
    axis.title = element_text(size = 14, face = "bold"),  # Larger, bold axis labels
    axis.text = element_text(size = 12),  # Increase tick label size
    panel.background = element_rect(fill = "white", color = "white"),  # White background
    panel.grid.major = element_line(color = "lightgrey"),  # Light grey major grid lines
    panel.grid.minor = element_line(color = "lightgrey"),  # Light grey minor grid lines
    axis.line = element_line(color = "black")  # Add x and y axis lines
  )



print(p)

lprice <- log(cdtr$price)
lmlog  = lm(lprice~mileage,cdtr)
lyhat = predict(lmlog,cdte)
yhatexp = exp(lyhat)

cat("out of sample rmse for ln(milage) model: ",rmse(cdte$price,yhatexp),"\n") #root mean squared error
## out of sample rmse for ln(milage) model:  9.986602
lms = range(c(cd$price,yhatexp))
p = ggplot(data.frame(yhatexp,price=cdte$price),aes(x=yhatexp,y=price)) + geom_point() +
  xlim(lms) + ylim(lms) + geom_abline(intercept=0,slope=1,colour='red')+
  labs(
    title = "Log(Mileage) Linear Model Price Prediction vs Actual Price",
    subtitle = paste("out of sample rmse for ln(milage) model: ",rmse(cdte$price,yhatexp)),
    x = "Price Pediction ",
    y = "Actual Price"
  ) +
  theme_minimal() +  # Clean background
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),  # Centered, bold, larger title
    plot.subtitle = element_text(hjust = 0.5, size = 14, color = "black"),
    axis.title = element_text(size = 14, face = "bold"),  # Larger, bold axis labels
    axis.text = element_text(size = 12),  # Increase tick label size
    panel.background = element_rect(fill = "white", color = "white"),  # White background
    panel.grid.major = element_line(color = "lightgrey"),  # Light grey major grid lines
    panel.grid.minor = element_line(color = "lightgrey"),  # Light grey minor grid lines
    axis.line = element_line(color = "black")  # Add x and y axis lines
  )

print(p)