Dataset
library(splines)
library(ggplot2)
df <- data.frame(x=1:15,
y=c(2, 10, 20, 25, 20, 10, 19, 15, 19, 10, 6, 20, 31, 31, 40))
#create scatter plot with line of best fit
ggplot(df, aes(x=x, y=y)) +
geom_point() +
geom_smooth(method=loess, se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
Fitted curve with high order terms
# plot a line
m1 <- lm(y~x,data=df)
plot(y~x,df)
lines(df$x,predict(m1),col="blue")
# # square
m2_1 <- lm(y~ I(x**2)+x,data=df)
plot(y~x,df)
lines(df$x,predict(m2_1),col="blue")
# a cube curve
m2 <- lm(y~ I(x**3)+I(x**2)+x,data=df)
plot(y~x,df)
lines(df$x,predict(m2),col="blue")
Spline curve by bins/ sections (one power)
# divide datasets into bins, cut off 4 and 11
newdf = data.frame(x = df$x,
y = df$y,
tt1 = (df$x - 4) * (df$x > 4),
tt2 = (df$x - 11) * (df$x > 11))
# spline curve with line sharp join connection , if tt1 is one power of x. tt1 has a big influence to the original cube curve.
# cubic
m3 <- lm(y~ I(x**3)+I(x**2)+x+tt1+tt2,data=newdf)
plot(y~x,newdf)
lines(newdf$x,predict(m3),col="blue")
# square
m3_2 <- lm(y~ I(x**2)+x+tt1+tt2,data=newdf)
plot(y~x,newdf)
lines(newdf$x,predict(m3_2),col="blue")
Spline curve by bins (high order term, more smooth in connection)
independent variables and spline terms should be a third-degree polynomial.
# divide datasets into bins, 4, 11
newdf = data.frame(x = df$x,
y = df$y,
tt1 = (df$x - 4)**1 * (df$x > 4),
tt2 = (df$x - 11)**1 * (df$x > 11))
# spline curve with smooth join connection , if tt1 is cube of x. tt1 has a little influence to the original cube curve.
# cubic
m3 <- lm(y~ I(x**3)+I(x**2)+x+I(tt1**3)+I(tt2**3),data=newdf)
summary(m3)
##
## Call:
## lm(formula = y ~ I(x^3) + I(x^2) + x + I(tt1^3) + I(tt2^3), data = newdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.848 -1.995 0.686 2.177 7.345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.22068 18.59306 -0.872 0.406
## I(x^3) 0.21546 0.57347 0.376 0.716
## I(x^2) -3.82574 6.19736 -0.617 0.552
## x 21.14499 20.23340 1.045 0.323
## I(tt1^3) -0.07695 0.63034 -0.122 0.906
## I(tt2^3) -0.39690 0.34543 -1.149 0.280
##
## Residual standard error: 5.399 on 9 degrees of freedom
## Multiple R-squared: 0.8229, Adjusted R-squared: 0.7246
## F-statistic: 8.367 on 5 and 9 DF, p-value: 0.003385
plot(y~x,newdf)
lines(newdf$x,predict(m3),col="blue")
#################### square
m3_3 <- lm(y~ I(x**2)+x+I(tt1**3)+I(tt2**3),data=newdf)
summary(m3_3)
##
## Call:
## lm(formula = y ~ I(x^2) + x + I(tt1^3) + I(tt2^3), data = newdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4288 -1.3394 0.6158 2.3573 7.5122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.83433 7.20456 -1.365 0.20217
## I(x^2) -1.50265 0.40298 -3.729 0.00392 **
## x 13.68074 3.66713 3.731 0.00391 **
## I(tt1^3) 0.15928 0.04294 3.710 0.00404 **
## I(tt2^3) -0.46721 0.27759 -1.683 0.12327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.162 on 10 degrees of freedom
## Multiple R-squared: 0.8202, Adjusted R-squared: 0.7482
## F-statistic: 11.4 on 4 and 10 DF, p-value: 0.0009593
plot(y~x,newdf)
lines(newdf$x,predict(m3_3),col="blue")
Using a function
# creating b-spline term in a regression model
# Remember that the default spline model in R is a third-degree polynomial. This is because it is hard for the eye to detect the discontinuity at the knots.
model <- lm(y~ bs(x, knots=c(4,11)),data=newdf)
summary(model)
##
## Call:
## lm(formula = y ~ bs(x, knots = c(4, 11)), data = newdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.848 -1.995 0.686 2.177 7.345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.314 5.303 0.248 0.809863
## bs(x, knots = c(4, 11))1 14.140 9.701 1.458 0.178948
## bs(x, knots = c(4, 11))2 29.480 10.178 2.896 0.017703 *
## bs(x, knots = c(4, 11))3 -6.919 11.464 -0.604 0.561025
## bs(x, knots = c(4, 11))4 31.787 8.756 3.630 0.005483 **
## bs(x, knots = c(4, 11))5 38.226 7.456 5.127 0.000622 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.399 on 9 degrees of freedom
## Multiple R-squared: 0.8229, Adjusted R-squared: 0.7246
## F-statistic: 8.367 on 5 and 9 DF, p-value: 0.003385
# plot x and predictions
lims <- range(df$x)
lims.grid <- seq(from=lims[1],to=lims[2])
pred <- predict(model,newdata=list(x=lims.grid),se=T)
# head(pred)
# plot with std error area
plot(y~x,newdf)
lines(newdf$x,pred$fit,col="blue")
lines(newdf$x,pred$fit+2*pred$se.fit,col="green")
lines(newdf$x,pred$fit-2*pred$se.fit,col="green")
# how to determine K knots
# A somewhat more objective approach is to use cross-validation.
# This procedure can be repeated for different numbers of knots K. Then the value of K giving the smallest RSS is chosen.