knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
theme_set(theme_classic())
data("Boston", package = "MASS")
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
 createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
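
(A quick, purely illustrative sanity check, not part of the original analysis: `createDataPartition()` returns row indices, so the two sets should hold roughly 80% and 20% of the 506 Boston rows.)

# Sanity check of the 80/20 split (illustrative)
nrow(train.data)
nrow(test.data)
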
# Visualise the relationship between lstat and medv with a loess smooth
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Simple linear regression

# Build the model
model <- lm(medv ~ lstat, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
 RMSE = RMSE(predictions, test.data$medv),
 R2 = R2(predictions, test.data$medv)
)
##       RMSE       R2
## 1 6.503817 0.513163
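
As a cross-check of what the caret helpers report (a sketch, assuming `R2()`'s default "corr" form), the same two numbers can be computed directly:

# Root mean squared prediction error on the test set
sqrt(mean((predictions - test.data$medv)^2))
# Squared correlation between predicted and observed medv (caret's default R2)
cor(predictions, test.data$medv)^2
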
summary(model)
## 
## Call:
## lm(formula = medv ~ lstat, data = train.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.218  -4.011  -1.123   2.025  24.459 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.6527     0.6230   55.62   <2e-16 ***
## lstat        -0.9561     0.0428  -22.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.144 on 405 degrees of freedom
## Multiple R-squared:  0.5521, Adjusted R-squared:  0.551 
## F-statistic: 499.2 on 1 and 405 DF,  p-value: < 2.2e-16
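
Reading the coefficients off the summary, the fitted line is roughly medv = 34.65 - 0.96 * lstat. A small illustrative check (the lstat values below are arbitrary, not from the original):

# Predicted median home value at a few illustrative lstat values
predict(model, newdata = data.frame(lstat = c(5, 15, 25)))
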
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
coeftest(model)
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 34.652724   0.622974  55.625 < 2.2e-16 ***
## lstat       -0.956133   0.042796 -22.342 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
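
`coeftest()` here simply reproduces the t tests from `summary()`; its main practical use is supplying an alternative covariance matrix. A minimal sketch, assuming the sandwich package is installed (it is not loaded in the original), of heteroskedasticity-robust standard errors:

# Same coefficient tests with heteroskedasticity-consistent (HC1) standard errors
# (illustrative only; requires the sandwich package)
coeftest(model, vcov. = sandwich::vcovHC(model, type = "HC1"))
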
# Plot the fitted simple linear regression line
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ x)

## Polynomial regression of order 2

# Fit a quadratic (order-2) polynomial of lstat
model2 <- lm(medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
coeftest(model2)
## 
## t test of coefficients:
## 
##                              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)                 42.573638   0.964887  44.123 < 2.2e-16 ***
## poly(lstat, 2, raw = TRUE)1 -2.267309   0.135846 -16.690 < 2.2e-16 ***
## poly(lstat, 2, raw = TRUE)2  0.041198   0.004095  10.060 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model2)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3654  -3.8250  -0.6439   2.2733  25.2922 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 42.573638   0.964887   44.12   <2e-16 ***
## poly(lstat, 2, raw = TRUE)1 -2.267309   0.135846  -16.69   <2e-16 ***
## poly(lstat, 2, raw = TRUE)2  0.041198   0.004095   10.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.501 on 404 degrees of freedom
## Multiple R-squared:  0.6418, Adjusted R-squared:   0.64 
## F-statistic: 361.9 on 2 and 404 DF,  p-value: < 2.2e-16
# Plot the fitted order-2 polynomial curve
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE))
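
The order-2 model is summarised on the training data only; a sketch of its test-set performance, mirroring the evaluation of the linear model (`pred2` is a new name, not from the original):

# Test-set performance of the order-2 polynomial
pred2 <- model2 %>% predict(test.data)
data.frame(
 RMSE = RMSE(pred2, test.data$medv),
 R2 = R2(pred2, test.data$medv)
)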

## Polynomial regression of order 6

lm(medv ~ poly(lstat, 6, raw = TRUE), data = train.data) %>%
 summary()
## 
## Call:
## lm(formula = medv ~ poly(lstat, 6, raw = TRUE), data = train.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.1962  -3.1527  -0.7655   2.0404  26.7661 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  7.788e+01  6.844e+00  11.379  < 2e-16 ***
## poly(lstat, 6, raw = TRUE)1 -1.767e+01  3.569e+00  -4.952 1.08e-06 ***
## poly(lstat, 6, raw = TRUE)2  2.417e+00  6.779e-01   3.566 0.000407 ***
## poly(lstat, 6, raw = TRUE)3 -1.761e-01  6.105e-02  -2.885 0.004121 ** 
## poly(lstat, 6, raw = TRUE)4  6.845e-03  2.799e-03   2.446 0.014883 *  
## poly(lstat, 6, raw = TRUE)5 -1.343e-04  6.290e-05  -2.136 0.033323 *  
## poly(lstat, 6, raw = TRUE)6  1.047e-06  5.481e-07   1.910 0.056910 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.188 on 400 degrees of freedom
## Multiple R-squared:  0.6845, Adjusted R-squared:  0.6798 
## F-statistic: 144.6 on 6 and 400 DF,  p-value: < 2.2e-16
# Plot the fitted order-6 polynomial curve
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ poly(x, 6, raw = TRUE))
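
With `raw = TRUE` the powers of lstat are strongly correlated, which makes high-order coefficients hard to read; R's default orthogonal polynomials give the same fitted values and R-squared with better-conditioned terms. A sketch of the equivalent order-6 fit on the orthogonal basis:

# Same order-6 fit on an orthogonal polynomial basis (default raw = FALSE);
# the coefficients differ, but fitted values and R-squared are identical
lm(medv ~ poly(lstat, 6), data = train.data) %>%
 summary()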

## Polynomial regression of order 4

lm(medv ~ poly(lstat, 4, raw = TRUE), data = train.data) %>%
 summary()
## 
## Call:
## lm(formula = medv ~ poly(lstat, 4, raw = TRUE), data = train.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6093  -3.1719  -0.6806   2.2075  27.1453 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.764e+01  2.615e+00  22.047  < 2e-16 ***
## poly(lstat, 4, raw = TRUE)1 -7.098e+00  8.303e-01  -8.549 2.62e-16 ***
## poly(lstat, 4, raw = TRUE)2  5.007e-01  8.419e-02   5.947 5.94e-09 ***
## poly(lstat, 4, raw = TRUE)3 -1.643e-02  3.336e-03  -4.925 1.24e-06 ***
## poly(lstat, 4, raw = TRUE)4  1.948e-04  4.468e-05   4.359 1.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.255 on 402 degrees of freedom
## Multiple R-squared:  0.6747, Adjusted R-squared:  0.6715 
## F-statistic: 208.5 on 4 and 402 DF,  p-value: < 2.2e-16
# Plot the fitted order-4 polynomial curve
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ poly(x, 4, raw = TRUE))
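
To make the choice between polynomial orders explicit, a small illustrative loop (not part of the original) comparing test-set RMSE for degrees 1 through 6:

# Test-set RMSE for polynomial degrees 1 to 6
sapply(1:6, function(d) {
 fit <- lm(medv ~ poly(lstat, d, raw = TRUE), data = train.data)
 RMSE(predict(fit, test.data), test.data$medv)
})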

## Spline regression

library(splines)
# Place knots at the quartiles of lstat and fit a cubic B-spline
knots <- quantile(train.data$lstat, p = c(0.25, 0.5, 0.75))
model <- lm(medv ~ bs(lstat, knots = knots), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
 RMSE = RMSE(predictions, test.data$medv),
 R2 = R2(predictions, test.data$medv)
)
##       RMSE        R2
## 1 5.317372 0.6786367
# Spline smooth shown with bs(x, df = 3), i.e. a cubic with no interior knots
# (see the sketch below for a version that matches the fitted model's knots)
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ splines::bs(x, df = 3))
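
One caveat on the last plot: `bs(x, df = 3)` is a plain cubic basis with no interior knots, so the displayed smooth does not exactly match the model fitted above, which places knots at the lstat quartiles. A sketch of a plot that reuses the same `knots`:

# Spline smooth using the same quartile knots as the fitted model
ggplot(train.data, aes(lstat, medv)) +
 geom_point() +
 stat_smooth(method = "lm", formula = y ~ splines::bs(x, knots = knots))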