Load Data

We will load the data from file auto-mpg. This data set has 5 data elements; displacement, horsepower, weight, acceleration, mpg. In our analysis, we will have mpg has a dependent variable and the other 4 as independent. We will attempt to fit a polynomial model of various degree and measure the cross validation error.

df_mpg <- read.table("auto-mpg.data")
names(df_mpg) <- c('displacement', 'horsepower', 'weight', 'acceleration', 'mpg')

head(df_mpg)
##   displacement horsepower weight acceleration mpg
## 1          307        130   3504         12.0  18
## 2          350        165   3693         11.5  15
## 3          318        150   3436         11.0  18
## 4          304        150   3433         12.0  16
## 5          302        140   3449         10.5  17
## 6          429        198   4341         10.0  15

Fitting Polynomial Models

degree <- 1:8
cv.err5 <- c()

# Degree = 1

glm.fit1 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[1]), data = df_mpg)


estimate_k1 <- cv.glm(data = df_mpg, glmfit = glm.fit1, K=5)

cv.err5 <- c(cv.err5, estimate_k1$delta[1])

# Degree = 2

glm.fit2 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[2]), data = df_mpg)


estimate_k2 <- cv.glm(data = df_mpg, glmfit = glm.fit2, K=5)

cv.err5 <- c(cv.err5, estimate_k2$delta[1])

# Degree = 3

glm.fit3 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[3]), data = df_mpg)


estimate_k3 <- cv.glm(data = df_mpg, glmfit = glm.fit3, K=5)

cv.err5 <- c(cv.err5, estimate_k3$delta[1])

# Degree = 4

glm.fit4 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[4]), data = df_mpg)


estimate_k4 <- cv.glm(data = df_mpg, glmfit = glm.fit4, K=5)

cv.err5<- c(cv.err5, estimate_k4$delta[1])

# Degree = 5

glm.fit5 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[5]), data = df_mpg)


estimate_k5 <- cv.glm(data = df_mpg, glmfit = glm.fit5, K=5)

cv.err5 <- c(cv.err5, estimate_k5$delta[1])

# Degree = 6

glm.fit6 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[6]), data = df_mpg)


estimate_k6 <- cv.glm(data = df_mpg, glmfit = glm.fit6, K=5)

cv.err5 <- c(cv.err5, estimate_k6$delta[1])

# Degree = 7

glm.fit7 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[7]), data = df_mpg)


estimate_k7 <- cv.glm(data = df_mpg, glmfit = glm.fit7, K=5)

cv.err5 <- c(cv.err5, estimate_k7$delta[1])

# Degree = 8

glm.fit8 <- glm(mpg~poly(displacement+horsepower+weight+acceleration,degree[8]), data = df_mpg)


estimate_k8 <- cv.glm(data = df_mpg, glmfit = glm.fit8, K=5)

cv.err5 <- c(cv.err5, estimate_k8$delta[1])

We will now plot the validation error function.

plot(degree, cv.err5, type='b')