In this simple tutorial we will demonstrate how to build and visualize fit and residuals for polynomial regression for several degrees.We will use a data set containing values for six biomechanical features used to classify orthopaedic patients into 3 classes (normal, disk hernia or spondilolysthesis) or 2 classes (normal or abnormal).In this example we will proceed with the 2 classes dataset.
First set working directory and load the file in R and explore it
setwd("/Users/christoschalitsios/Documents/vertebral_column_data")
spine<-read.csv("spine_2c.csv")
str(spine)
## 'data.frame': 310 obs. of 7 variables:
## $ plv_inc : num 63 39.1 68.8 69.3 49.7 ...
## $ plv_tilt : num 22.55 10.06 22.22 24.65 9.65 ...
## $ l_l_angle : num 39.6 25 50.1 44.3 28.3 ...
## $ sacr_slope : num 40.5 29 46.6 44.6 40.1 ...
## $ plv_radius : num 98.7 114.4 106 101.9 108.2 ...
## $ grade_spondy: num -0.25 4.56 -3.53 11.21 7.92 ...
## $ class : Factor w/ 2 levels "AB","NO": 1 1 1 1 1 1 1 1 1 1 ...
summary(spine)
## plv_inc plv_tilt l_l_angle sacr_slope
## Min. : 26.15 Min. :-6.55 Min. : 14.00 Min. : 13.37
## 1st Qu.: 46.43 1st Qu.:10.67 1st Qu.: 37.00 1st Qu.: 33.35
## Median : 58.69 Median :16.36 Median : 49.56 Median : 42.41
## Mean : 60.50 Mean :17.54 Mean : 51.93 Mean : 42.95
## 3rd Qu.: 72.88 3rd Qu.:22.12 3rd Qu.: 63.00 3rd Qu.: 52.69
## Max. :129.83 Max. :49.43 Max. :125.74 Max. :121.43
## plv_radius grade_spondy class
## Min. : 70.08 Min. :-11.06 AB:210
## 1st Qu.:110.71 1st Qu.: 1.60 NO:100
## Median :118.27 Median : 11.77
## Mean :117.92 Mean : 26.30
## 3rd Qu.:125.47 3rd Qu.: 41.28
## Max. :163.07 Max. :418.54
Use the str() and summary() functions to get an idea about the structure of the data and some basic descriptive statistics.Lets explore a little further the data with saome useful visualizations.The corrplot() is a good start to understand some basic relations among variables and continue with scatterplots
Looking at the corrplot it is clear that several variables have some pretty good correlations so lets pick “l_l_angle”(lumbar lordosis angle) and “sacr_slope”(sacral slope) which have a moderate ‘pearson r’ = 0.6
So, now we will fit 3 models a linear and two polynomials up to degree 3 and a higher degree in order to illustrate the flexibility of polynomials Fisrt the linear
lm1<-lm(l_l_angle~sacr_slope,data=spine)
lm1
##
## Call:
## lm(formula = l_l_angle ~ sacr_slope, data = spine)
##
## Coefficients:
## (Intercept) sacr_slope
## 16.4022 0.8271
flinear<-function(x){
return(0.8271*x+16.4022)
}
x<-spine$sacr_slope
y<-flinear(x)
means_linear<-data.frame(x,y)
spine_linear<-spine[,c(4,3)]
means_linear$group<-1:310
spine_linear$group<-1:310
colnames(means_linear)<-c("sacr_slope","l_l_angle","group")
groups<-rbind(means_linear,spine_linear)
linear<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
stat_function(aes(spine$sacr_slope),fun=flinear)+
geom_point(data=means_linear,aes(x=x,y=y),color="red")+
geom_line(data=groups,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Linear")
linear
Now the second degree polynomial
poly2<-lm(l_l_angle~poly(sacr_slope,2,raw=T),data=spine)
poly2
##
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 2, raw = T), data = spine)
##
## Coefficients:
## (Intercept) poly(sacr_slope, 2, raw = T)1
## -3.905348 1.751516
## poly(sacr_slope, 2, raw = T)2
## -0.009581
fpoly2<-function(x){
return(-0.009581*x^2+1.751516*x-3.905348)
}
x_poly2<-spine$sacr_slope
y_poly2<-fpoly2(x)
means_poly2<-data.frame(x_poly2,y_poly2)
spine_poly2<-spine[,c(4,3)]
means_poly2$group<-1:310
spine_poly2$group<-1:310
colnames(means_poly2)<-c("sacr_slope","l_l_angle","group")
groups_poly<-rbind(means_poly2,spine_poly2)
poly2<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
stat_function(aes(x=spine$sacr_slope),fun=fpoly2)+
geom_point(data=means_poly2,aes(x=x_poly2,y=y_poly2),color="red")+
geom_line(data=groups_poly,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 2")
poly2
Then the third degree
poly3<-lm(l_l_angle~poly(sacr_slope,3,raw=T),data=spine)
poly3
##
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 3, raw = T), data = spine)
##
## Coefficients:
## (Intercept) poly(sacr_slope, 3, raw = T)1
## 27.0583125 -0.2324786
## poly(sacr_slope, 3, raw = T)2 poly(sacr_slope, 3, raw = T)3
## 0.0275644 -0.0002009
fpoly3<-function(x){
return(-0.0002009*x^3+0.0275644*x^2-0.2324786*x+27.0583125)
}
x_poly3<-spine$sacr_slope
y_poly3<-fpoly3(x)
means_poly3<-data.frame(x_poly3,y_poly3)
spine_poly3<-spine[,c(4,3)]
means_poly3$group<-1:310
spine_poly3$group<-1:310
colnames(means_poly3)<-c("sacr_slope","l_l_angle","group")
groups_poly3<-rbind(means_poly3,spine_poly3)
poly3<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
stat_function(aes(x=spine$sacr_slope),fun=fpoly3)+
geom_point(data=means_poly3,aes(x=x_poly3,y=y_poly3),color="red")+
geom_line(data=groups_poly3,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 3")
poly3
And finally a more extreme example since such a fit is very good to explain the current data but it does not generalize well.If you add new unseen data the fit will be pretty bad
poly5<-lm(l_l_angle~poly(sacr_slope,5,raw=T),data=spine)
poly5
##
## Call:
## lm(formula = l_l_angle ~ poly(sacr_slope, 5, raw = T), data = spine)
##
## Coefficients:
## (Intercept) poly(sacr_slope, 5, raw = T)1
## 4.861e+00 3.124e+00
## poly(sacr_slope, 5, raw = T)2 poly(sacr_slope, 5, raw = T)3
## -1.464e-01 3.761e-03
## poly(sacr_slope, 5, raw = T)4 poly(sacr_slope, 5, raw = T)5
## -4.009e-05 1.441e-07
fpoly5<-function(x){
return((1.441*10^-07)*x^5+(-4.009*10^-05)*x^4+(3.761*10^-03)*x^3+(-1.464*10^-01)*x^2+(3.124*10^00)*x+(4.861*10^00))
}
x_poly5<-spine$sacr_slope
y_poly5<-fpoly5(x)
means_poly5<-data.frame(x_poly5,y_poly5)
spine_poly5<-spine[,c(4,3)]
means_poly5$group<-1:310
spine_poly5$group<-1:310
colnames(means_poly5)<-c("sacr_slope","l_l_angle","group")
groups_poly5<-rbind(means_poly5,spine_poly5)
poly5<-ggplot()+geom_point(data=spine,aes(x=sacr_slope,y=l_l_angle))+
stat_function(aes(x=spine$sacr_slope),fun=fpoly5)+
geom_point(data=means_poly5,aes(x=x_poly5,y=y_poly5),color="red")+
geom_line(data=groups_poly5,aes(x=sacr_slope,y=l_l_angle,group=group),color="grey",alpha=0.4)+ggtitle("Polynomial degree 5")
poly5
library(gridExtra)
grid.arrange(linear,poly2,poly3,poly5)