knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
theme_set(theme_classic())
data("Boston", package = "MASS")
# Split the data into training and test set
set.seed(123)
training.samples <- Boston$medv %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
Simple linear regression
# Build the model
model <- lm(medv ~ lstat, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$medv),
R2 = R2(predictions, test.data$medv)
)
## RMSE R2
## 1 6.503817 0.513163
summary(model)
##
## Call:
## lm(formula = medv ~ lstat, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.218 -4.011 -1.123 2.025 24.459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6527 0.6230 55.62 <2e-16 ***
## lstat -0.9561 0.0428 -22.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.144 on 405 degrees of freedom
## Multiple R-squared: 0.5521, Adjusted R-squared: 0.551
## F-statistic: 499.2 on 1 and 405 DF, p-value: < 2.2e-16
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
coeftest(model)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.652724 0.622974 55.625 < 2.2e-16 ***
## lstat -0.956133 0.042796 -22.342 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)
##Polynomial Regression of order 2
model2<-lm(medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
coeftest(model2)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.573638 0.964887 44.123 < 2.2e-16 ***
## poly(lstat, 2, raw = TRUE)1 -2.267309 0.135846 -16.690 < 2.2e-16 ***
## poly(lstat, 2, raw = TRUE)2 0.041198 0.004095 10.060 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model2)
##
## Call:
## lm(formula = medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3654 -3.8250 -0.6439 2.2733 25.2922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.573638 0.964887 44.12 <2e-16 ***
## poly(lstat, 2, raw = TRUE)1 -2.267309 0.135846 -16.69 <2e-16 ***
## poly(lstat, 2, raw = TRUE)2 0.041198 0.004095 10.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.501 on 404 degrees of freedom
## Multiple R-squared: 0.6418, Adjusted R-squared: 0.64
## F-statistic: 361.9 on 2 and 404 DF, p-value: < 2.2e-16
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE))
##Polynomial model of order 6`
lm(medv ~ poly(lstat, 6, raw = TRUE), data = train.data) %>%
summary()
##
## Call:
## lm(formula = medv ~ poly(lstat, 6, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1962 -3.1527 -0.7655 2.0404 26.7661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.788e+01 6.844e+00 11.379 < 2e-16 ***
## poly(lstat, 6, raw = TRUE)1 -1.767e+01 3.569e+00 -4.952 1.08e-06 ***
## poly(lstat, 6, raw = TRUE)2 2.417e+00 6.779e-01 3.566 0.000407 ***
## poly(lstat, 6, raw = TRUE)3 -1.761e-01 6.105e-02 -2.885 0.004121 **
## poly(lstat, 6, raw = TRUE)4 6.845e-03 2.799e-03 2.446 0.014883 *
## poly(lstat, 6, raw = TRUE)5 -1.343e-04 6.290e-05 -2.136 0.033323 *
## poly(lstat, 6, raw = TRUE)6 1.047e-06 5.481e-07 1.910 0.056910 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.188 on 400 degrees of freedom
## Multiple R-squared: 0.6845, Adjusted R-squared: 0.6798
## F-statistic: 144.6 on 6 and 400 DF, p-value: < 2.2e-16
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 6, raw = TRUE))
##polynomial of Order 4
lm(medv ~ poly(lstat, 4, raw = TRUE), data = train.data) %>%
summary()
##
## Call:
## lm(formula = medv ~ poly(lstat, 4, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6093 -3.1719 -0.6806 2.2075 27.1453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.764e+01 2.615e+00 22.047 < 2e-16 ***
## poly(lstat, 4, raw = TRUE)1 -7.098e+00 8.303e-01 -8.549 2.62e-16 ***
## poly(lstat, 4, raw = TRUE)2 5.007e-01 8.419e-02 5.947 5.94e-09 ***
## poly(lstat, 4, raw = TRUE)3 -1.643e-02 3.336e-03 -4.925 1.24e-06 ***
## poly(lstat, 4, raw = TRUE)4 1.948e-04 4.468e-05 4.359 1.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.255 on 402 degrees of freedom
## Multiple R-squared: 0.6747, Adjusted R-squared: 0.6715
## F-statistic: 208.5 on 4 and 402 DF, p-value: < 2.2e-16
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 4, raw = TRUE))
##Spline regression
library(splines)
knots <- quantile(train.data$lstat, p = c(0.25, 0.5, 0.75))
model <- lm (medv ~ bs(lstat, knots = knots), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$medv),
R2 = R2(predictions, test.data$medv)
)
## RMSE R2
## 1 5.317372 0.6786367
ggplot(train.data, aes(lstat, medv) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 3))