We will load the data from file auto-mpg. This data set has 5 data elements; displacement, horsepower, weight, acceleration, mpg. In our analysis, we will have mpg has a dependent variable and the other 4 as independent. We will attempt to fit a polynomial model of various degree and measure the cross validation error.
df_mpg <- read.table("auto-mpg.data")
names(df_mpg) <- c('displacement', 'horsepower', 'weight', 'acceleration', 'mpg')
head(df_mpg)
## displacement horsepower weight acceleration mpg
## 1 307 130 3504 12.0 18
## 2 350 165 3693 11.5 15
## 3 318 150 3436 11.0 18
## 4 304 150 3433 12.0 16
## 5 302 140 3449 10.5 17
## 6 429 198 4341 10.0 15
degree <- 1:8
cv.err5 <- c()
# Degree = 1
glm.fit1 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[1]), data = df_mpg)
estimate_k1 <- cv.glm(data = df_mpg, glmfit = glm.fit1, K=5)
cv.err5 <- c(cv.err5, estimate_k1$delta[1])
# Degree = 2
glm.fit2 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[2]), data = df_mpg)
estimate_k2 <- cv.glm(data = df_mpg, glmfit = glm.fit2, K=5)
cv.err5 <- c(cv.err5, estimate_k2$delta[1])
# Degree = 3
glm.fit3 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[3]), data = df_mpg)
estimate_k3 <- cv.glm(data = df_mpg, glmfit = glm.fit3, K=5)
cv.err5 <- c(cv.err5, estimate_k3$delta[1])
# Degree = 4
glm.fit4 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[4]), data = df_mpg)
estimate_k4 <- cv.glm(data = df_mpg, glmfit = glm.fit4, K=5)
cv.err5<- c(cv.err5, estimate_k4$delta[1])
# Degree = 5
glm.fit5 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[5]), data = df_mpg)
estimate_k5 <- cv.glm(data = df_mpg, glmfit = glm.fit5, K=5)
cv.err5 <- c(cv.err5, estimate_k5$delta[1])
# Degree = 6
glm.fit6 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[6]), data = df_mpg)
estimate_k6 <- cv.glm(data = df_mpg, glmfit = glm.fit6, K=5)
cv.err5 <- c(cv.err5, estimate_k6$delta[1])
# Degree = 7
glm.fit7 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[7]), data = df_mpg)
estimate_k7 <- cv.glm(data = df_mpg, glmfit = glm.fit7, K=5)
cv.err5 <- c(cv.err5, estimate_k7$delta[1])
# Degree = 8
glm.fit8 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[8]), data = df_mpg)
estimate_k8 <- cv.glm(data = df_mpg, glmfit = glm.fit8, K=5)
cv.err5 <- c(cv.err5, estimate_k8$delta[1])
We will now plot the validation error function.
plot(degree, cv.err5, type='b')