In this simple tutorial we will demonstrate how to build and visualize fit and residuals for polynomial regression for several degrees.We will use a data set containing values for six biomechanical features used to classify orthopaedic patients into 3 classes (normal, disk hernia or spondilolysthesis) or 2 classes (normal or abnormal).In this example we will proceed with the 2 classes dataset.

First set working directory and load the file in R and explore it

setwd("/Users/christoschalitsios/Documents/vertebral_column_data")
spine<-read.csv("spine_2c.csv")
str(spine)
## 'data.frame':    310 obs. of  7 variables:
##  $ plv_inc     : num  63 39.1 68.8 69.3 49.7 ...
##  $ plv_tilt    : num  22.55 10.06 22.22 24.65 9.65 ...
##  $ l_l_angle   : num  39.6 25 50.1 44.3 28.3 ...
##  $ sacr_slope  : num  40.5 29 46.6 44.6 40.1 ...
##  $ plv_radius  : num  98.7 114.4 106 101.9 108.2 ...
##  $ grade_spondy: num  -0.25 4.56 -3.53 11.21 7.92 ...
##  $ class       : Factor w/ 2 levels "AB","NO": 1 1 1 1 1 1 1 1 1 1 ...
summary(spine)
##     plv_inc          plv_tilt       l_l_angle        sacr_slope    
##  Min.   : 26.15   Min.   :-6.55   Min.   : 14.00   Min.   : 13.37  
##  1st Qu.: 46.43   1st Qu.:10.67   1st Qu.: 37.00   1st Qu.: 33.35  
##  Median : 58.69   Median :16.36   Median : 49.56   Median : 42.41  
##  Mean   : 60.50   Mean   :17.54   Mean   : 51.93   Mean   : 42.95  
##  3rd Qu.: 72.88   3rd Qu.:22.12   3rd Qu.: 63.00   3rd Qu.: 52.69  
##  Max.   :129.83   Max.   :49.43   Max.   :125.74   Max.   :121.43  
##    plv_radius      grade_spondy    class   
##  Min.   : 70.08   Min.   :-11.06   AB:210  
##  1st Qu.:110.71   1st Qu.:  1.60   NO:100  
##  Median :118.27   Median : 11.77           
##  Mean   :117.92   Mean   : 26.30           
##  3rd Qu.:125.47   3rd Qu.: 41.28           
##  Max.   :163.07   Max.   :418.54

Use the str() and summary() functions to get an idea about the structure of the data and some basic descriptive statistics.Lets explore a little further the data with saome useful visualizations.The corrplot() is a good start to understand some basic relations among variables and continue with scatterplots

Looking at the corrplot it is clear that several variables have some pretty good correlations so lets pick “l_l_angle”(lumbar lordosis angle) and “sacr_slope”(sacral slope) which have a moderate ‘pearson r’ = 0.6

So, now we will fit 3 models a linear and two polynomials up to degree 3 and a higher degree in order to illustrate the flexibility of polynomials Fisrt the linear

lm1<-lm(l_l_angle~sacr_slope,data=spine)
lm1
## 
## Call:
## lm(formula = l_l_angle ~ sacr_slope, data = spine)
## 
## Coefficients:
## (Intercept)   sacr_slope  
##     16.4022       0.8271
flinear<-function(x){
  return(0.8271*x+16.4022)
}

x<-spine$sacr_slope
y<-flinear(x)
means_linear<-data.frame(x,y)
spine_linear<-spine[,c(4,3)]
means_linear$group<-1:310
spine_linear$group<-1:310
colnames(means_linear)<-c("sacr_slope","l_l_angle","group")
groups<-rbind(means_linear,spine_linear)
linear<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
  stat_function(aes(spine$sacr_slope),fun=flinear)+
  geom_point(data=means_linear,aes(x=x,y=y),color="red")+
  geom_line(data=groups,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Linear")
linear

Now the second degree polynomial

poly2<-lm(l_l_angle~poly(sacr_slope,2,raw=T),data=spine)
poly2
## 
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 2, raw = T), data = spine)
## 
## Coefficients:
##                   (Intercept)  poly(sacr_slope, 2, raw = T)1  
##                     -3.905348                       1.751516  
## poly(sacr_slope, 2, raw = T)2  
##                     -0.009581
fpoly2<-function(x){
  return(-0.009581*x^2+1.751516*x-3.905348)
}

x_poly2<-spine$sacr_slope
y_poly2<-fpoly2(x)
means_poly2<-data.frame(x_poly2,y_poly2)
spine_poly2<-spine[,c(4,3)]
means_poly2$group<-1:310
spine_poly2$group<-1:310
colnames(means_poly2)<-c("sacr_slope","l_l_angle","group")
groups_poly<-rbind(means_poly2,spine_poly2)
poly2<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
  stat_function(aes(x=spine$sacr_slope),fun=fpoly2)+
  geom_point(data=means_poly2,aes(x=x_poly2,y=y_poly2),color="red")+
  geom_line(data=groups_poly,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 2")
poly2

Then the third degree

poly3<-lm(l_l_angle~poly(sacr_slope,3,raw=T),data=spine)
poly3
## 
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 3, raw = T), data = spine)
## 
## Coefficients:
##                   (Intercept)  poly(sacr_slope, 3, raw = T)1  
##                    27.0583125                     -0.2324786  
## poly(sacr_slope, 3, raw = T)2  poly(sacr_slope, 3, raw = T)3  
##                     0.0275644                     -0.0002009
fpoly3<-function(x){
  return(-0.0002009*x^3+0.0275644*x^2-0.2324786*x+27.0583125)
}

x_poly3<-spine$sacr_slope
y_poly3<-fpoly3(x)
means_poly3<-data.frame(x_poly3,y_poly3)
spine_poly3<-spine[,c(4,3)]
means_poly3$group<-1:310
spine_poly3$group<-1:310
colnames(means_poly3)<-c("sacr_slope","l_l_angle","group")
groups_poly3<-rbind(means_poly3,spine_poly3)

poly3<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
  stat_function(aes(x=spine$sacr_slope),fun=fpoly3)+
  geom_point(data=means_poly3,aes(x=x_poly3,y=y_poly3),color="red")+
  geom_line(data=groups_poly3,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 3")
poly3

And finally a more extreme example since such a fit is very good to explain the current data but it does not generalize well.If you add new unseen data the fit will be pretty bad

poly5<-lm(l_l_angle~poly(sacr_slope,5,raw=T),data=spine)
poly5
## 
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 5, raw = T), data = spine)
## 
## Coefficients:
##                   (Intercept)  poly(sacr_slope, 5, raw = T)1  
##                     4.861e+00                      3.124e+00  
## poly(sacr_slope, 5, raw = T)2  poly(sacr_slope, 5, raw = T)3  
##                    -1.464e-01                      3.761e-03  
## poly(sacr_slope, 5, raw = T)4  poly(sacr_slope, 5, raw = T)5  
##                    -4.009e-05                      1.441e-07
fpoly5<-function(x){
  return((1.441*10^-07)*x^5+(-4.009*10^-05)*x^4+(3.761*10^-03)*x^3+(-1.464*10^-01)*x^2+(3.124*10^00)*x+(4.861*10^00))
}

x_poly5<-spine$sacr_slope
y_poly5<-fpoly5(x)
means_poly5<-data.frame(x_poly5,y_poly5)
spine_poly5<-spine[,c(4,3)]
means_poly5$group<-1:310
spine_poly5$group<-1:310
colnames(means_poly5)<-c("sacr_slope","l_l_angle","group")
groups_poly5<-rbind(means_poly5,spine_poly5)

poly5<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
  stat_function(aes(x=spine$sacr_slope),fun=fpoly5)+
  geom_point(data=means_poly5,aes(x=x_poly5,y=y_poly5),color="red")+
  geom_line(data=groups_poly5,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 5")
poly5

library(gridExtra)
grid.arrange(linear,poly2,poly3,poly5)