The purpose of this data set is to predict the median value of owner-occupied homes for various census tracts in the Boston area. Each row (observation) represents a given census tract and the variable we wish to predict is cmedv (median value of owner-occupied homes in USD 1000’s). The other variables are variables we want to use to help make predictions of cmedv and include:
• lon: longitude of census tract
• lat: latitude of census tract
• crim: per capita crime rate by town
• zn: proportion of residential land zoned for lots over 25,000 sq.ft
• indus: proportion of non-retail business acres per town
• chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
• nox: nitric oxides concentration (parts per 10 million) –> aka air pollution
• rm: average number of rooms per dwelling
• age: proportion of owner-occupied units built prior to 1940
• dis: weighted distances to five Boston employment centers
• rad: index of accessibility to radial highways
• tax: full-value property-tax rate per USD 10,000
• ptratio: pupil-teacher ratio by town
• lstat: percentage of lower status of the population
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
boston <- readr::read_csv("boston (1).csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
cor <- cor(train)
cor["cmedv", ] %>% sort(decreasing = TRUE)
## cmedv rm b zn dis chas
## 1.000000000 0.708152619 0.358302318 0.344272023 0.271455846 0.164575979
## lat lon crim rad age nox
## -0.002024587 -0.315456520 -0.384298342 -0.396999035 -0.398915864 -0.439021014
## tax indus ptratio lstat
## -0.479383777 -0.489537963 -0.500927283 -0.742823016
ggplot(train, aes(rm, cmedv)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
model <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
tidy(model)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
model %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
model2 <- linear_reg() %>%
fit(cmedv ~ ., data = train)
tidy(model2) %>%
filter(as.numeric(p.value) < 0.05)
## # A tibble: 11 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 crim -0.0830 0.0396 -2.10 3.65e- 2
## 2 zn 0.0332 0.0165 2.01 4.56e- 2
## 3 chas 2.28 1.05 2.17 3.06e- 2
## 4 nox -11.7 4.74 -2.46 1.44e- 2
## 5 rm 4.37 0.516 8.46 8.13e-16
## 6 dis -1.26 0.244 -5.17 4.06e- 7
## 7 rad 0.272 0.0790 3.44 6.47e- 4
## 8 tax -0.0121 0.00436 -2.78 5.78e- 3
## 9 ptratio -0.874 0.163 -5.37 1.48e- 7
## 10 b 0.0123 0.00310 3.97 8.72e- 5
## 11 lstat -0.479 0.0637 -7.51 5.24e-13
model2 %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
# This is better than the model that just used the rm variable
model2 %>%
vip::vip(5)