1 Goal


The goal of this tutorial is to learn how to fit polynomial regression models of any degree.


2 Libraries


library(caret)
library(ggplot2)

3 Data import


# Load the built-in mtcars dataset into the workspace, make sure it is a
# plain data.frame, and print a compact overview of its columns.
data("mtcars", package = "datasets")
mtcars <- as.data.frame(mtcars)
utils::str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

4 Plotting the dependence between variables


# Scatter plot of engine displacement (disp, cubic inches) against miles per
# gallon (mpg) to inspect the shape of their relationship before modelling.
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  labs(
    x = "Miles per Gallon (mpg)",
    y = "Displacement (cu. in.)",
    title = "Displacement vs mpg for 1973/1974 cars"
  ) +
  # Centre the plot title
  theme(plot.title = element_text(hjust = 0.5))


5 Using polynomial models to predict

5.1 Linear model


# Fit a first-degree (straight-line) model of displacement as a function of mpg
my_model1 <- lm(disp ~ mpg, data = mtcars)

# Predict on the same rows used for fitting, i.e. in-sample fitted values
my_pred1 <- predict(my_model1, newdata = mtcars)

# Overlay the fitted values (red line) on the observed points. The line is
# drawn through the predictions at the observed mpg values only.
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_line(aes(y = my_pred1), colour = "red", size = 1) +
  labs(
    x = "Miles per Gallon (mpg)",
    y = "Displacement (cu. in.)",
    title = "Displacement vs mpg for 1973/1974 cars"
  ) +
  # Centre the plot title
  theme(plot.title = element_text(hjust = 0.5))


5.2 Second-degree polynomial model


# Fit a second-degree polynomial model of displacement as a function of mpg;
# poly(mpg, 2) supplies the polynomial terms of mpg up to degree 2.
my_model2 <- lm(disp ~ poly(mpg, 2), data = mtcars)

# Predict on the same rows used for fitting, i.e. in-sample fitted values
my_pred2 <- predict(my_model2, newdata = mtcars)

# Overlay the fitted values (red line) on the observed points. The line is
# drawn through the predictions at the observed mpg values only.
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_line(aes(y = my_pred2), colour = "red", size = 1) +
  labs(
    x = "Miles per Gallon (mpg)",
    y = "Displacement (cu. in.)",
    title = "Displacement vs mpg for 1973/1974 cars"
  ) +
  # Centre the plot title
  theme(plot.title = element_text(hjust = 0.5))


5.3 Higher-degree polynomial model


# Fit a 12th-degree polynomial model of displacement as a function of mpg.
# With only 32 observations this much flexibility leads to overfitting.
my_model3 <- lm(disp ~ poly(mpg, 12), data = mtcars)

# Predict on the same rows used for fitting, i.e. in-sample fitted values
my_pred3 <- predict(my_model3, newdata = mtcars)

# Overlay the fitted values (red line) on the observed points. The line is
# drawn through the predictions at the observed mpg values only.
ggplot(data = mtcars, aes(x = mpg, y = disp)) +
  geom_point() +
  geom_line(aes(y = my_pred3), colour = "red", size = 1) +
  labs(
    x = "Miles per Gallon (mpg)",
    y = "Displacement (cu. in.)",
    title = "Displacement vs mpg for 1973/1974 cars"
  ) +
  # Centre the plot title
  theme(plot.title = element_text(hjust = 0.5))


6 Conclusion


In this tutorial we have learnt how to fit polynomial models of any degree. The function poly() should be used because, by default, it generates orthogonal polynomial terms, which avoids collinearity between the different powers of the same variable.