Reading In Necessary Functions and Data
# Functions used
docv =
function (x, y, set, predfun, loss, nfold = 10, doran = TRUE,
verbose = TRUE, ...)
{
if (!(is.matrix(x) | is.data.frame(x))) {
cat("error in docv: x is not a matrix or data frame\n")
return(0)
}
if (!(is.vector(y))) {
cat("error in docv: y is not a vector\n")
return(0)
}
if (!(length(y) == nrow(x))) {
cat("error in docv: length(y) != nrow(x)\n")
return(0)
}
nset = nrow(set)
n = length(y)
if (n == nfold)
doran = FALSE
cat("in docv: nset,n,nfold: ", nset, n, nfold, "\n")
lossv = rep(0, nset)
if (doran) {
ii = sample(1:n, n)
y = y[ii]
x = x[ii, , drop = FALSE]
}
fs = round(n/nfold)
for (i in 1:nfold) {
bot = (i - 1) * fs + 1
top = ifelse(i == nfold, n, i * fs)
ii = bot:top
if (verbose)
cat("on fold: ", i, ", range: ", bot, ":", top, "\n")
xin = x[-ii, , drop = FALSE]
yin = y[-ii]
xout = x[ii, , drop = FALSE]
yout = y[ii]
for (k in 1:nset) {
yhat = predfun(xin, yin, xout, set[k, ], ...)
lossv[k] = lossv[k] + loss(yout, yhat)
}
}
return(lossv)
}
docvknn =
function (x, y, k, nfold = 10, doran = TRUE, verbose = TRUE)
{
return(docv(x, y, matrix(k, ncol = 1), doknn, mse, nfold = nfold,
doran = doran, verbose = verbose))
}
# Code from the Notes
rmse = function(y,yhat) {return(sqrt(mean((y-yhat)^2)))}
mabe = function(y,yhat) {return(mean(abs(y-yhat)))}
cd = read.csv("http://www.rob-mcculloch.org/data/susedcars.csv")
cd = cd[,c(1,4,6)]
names(cd)
## [1] "price" "mileage" "color"
cd$color = as.factor(cd$color)
cd$mileage = cd$mileage/1000
cd$price = cd$price/1000
Use out of sample performance (a train/test split) to decide which of these two models is best:
linear model of log(y) on mileage and color
linear model of y on mileage, mileage squared, and color
Use out of sample rmse and graphics to compare the two models
n = nrow(cd)
set.seed(99)
pin = .75 #percent train (or percent in-sample)
ii = sample(1:n,floor(pin*n))
cdtr = cd[ii,]
cdte = cd[-ii,]
cat("dimension of train data:",dim(cdtr),"\n")
## dimension of train data: 750 3
cat("dimension of test data:",dim(cdte),"\n")
## dimension of test data: 250 3
lm3var = lm(price~mileage+color,cdtr)
yhtest3var = predict(lm3var,cdte)
cat("out of sample rmse for three varaible linaer model: ",rmse(cdte$price,yhtest3var),"\n") #root mean squared error
## out of sample rmse for three varaible linaer model: 10.62553
lms = range(c(cd$price,yhtest3var))
library(ggplot2)
p <- ggplot(data.frame(yhtest3var, price = cdte$price), aes(x = yhtest3var, y = price)) +
geom_point() +
xlim(lms) + ylim(lms) +
geom_abline(intercept = 0, slope = 1, colour = 'red') +
labs(
title = "3 Variable Linear Model Price Prediction vs Actual Price",
subtitle = paste("out of sample rmse for three varaible linaer model: ",rmse(cdte$price,yhtest3var)),
x = "Price Prediction",
y = "Actual Price"
) +
theme_minimal() + # Clean background
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16), # Centered, bold, larger title
plot.subtitle = element_text(hjust = 0.5, size = 14, color = "black"), # Centered, smaller, gray subtitle
axis.title = element_text(size = 14, face = "bold"), # Larger, bold axis labels
axis.text = element_text(size = 12), # Increase tick label size
panel.background = element_rect(fill = "white", color = "white"), # White background
panel.grid.major = element_line(color = "lightgrey"), # Light grey major grid lines
panel.grid.minor = element_line(color = "lightgrey"), # Light grey minor grid lines
axis.line = element_line(color = "black") # Add x and y axis lines
)
print(p)
lprice <- log(cdtr$price)
lmlog = lm(lprice~mileage,cdtr)
lyhat = predict(lmlog,cdte)
yhatexp = exp(lyhat)
cat("out of sample rmse for ln(milage) model: ",rmse(cdte$price,yhatexp),"\n") #root mean squared error
## out of sample rmse for ln(milage) model: 9.986602
lms = range(c(cd$price,yhatexp))
p = ggplot(data.frame(yhatexp,price=cdte$price),aes(x=yhatexp,y=price)) + geom_point() +
xlim(lms) + ylim(lms) + geom_abline(intercept=0,slope=1,colour='red')+
labs(
title = "Log(Mileage) Linear Model Price Prediction vs Actual Price",
subtitle = paste("out of sample rmse for ln(milage) model: ",rmse(cdte$price,yhatexp)),
x = "Price Pediction ",
y = "Actual Price"
) +
theme_minimal() + # Clean background
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16), # Centered, bold, larger title
plot.subtitle = element_text(hjust = 0.5, size = 14, color = "black"),
axis.title = element_text(size = 14, face = "bold"), # Larger, bold axis labels
axis.text = element_text(size = 12), # Increase tick label size
panel.background = element_rect(fill = "white", color = "white"), # White background
panel.grid.major = element_line(color = "lightgrey"), # Light grey major grid lines
panel.grid.minor = element_line(color = "lightgrey"), # Light grey minor grid lines
axis.line = element_line(color = "black") # Add x and y axis lines
)
print(p)