library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.1
## ✔ dials 1.4.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.0 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Modeling tasks: 1. Is this a supervised or unsupervised learning problem? Why? Supervised, because every observation in the data has a known value of the response (cmedv), and the model learns to predict that response from the other variables.
Response: cmedv. Predictors: all other variables.
Regression Problem
# Load the Boston housing data from the module folder (path is relative to the
# working directory).
boston <- readr::read_csv(file = "Module-8/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count the NA entries in each column of the data set and show the counts.
missing_values <- boston %>%
  is.na() %>%
  colSums()
print(missing_values)
## lon lat cmedv crim zn indus chas nox rm age
## 0 0 0 0 0 0 0 0 0 0
## dis rad tax ptratio b lstat
## 0 0 0 0 0 0
# Summary statistics of the response variable cmedv: minimum, maximum, mean
# and median over all rows.
min_cmedv <- boston %>% pull(cmedv) %>% min()
max_cmedv <- boston %>% pull(cmedv) %>% max()
avg_cmedv <- boston %>% pull(cmedv) %>% mean()
median_cmedv <- boston %>% pull(cmedv) %>% median()
# Fix the RNG seed so the random split is reproducible.
set.seed(123)

# 70/30 train/test split, stratified on the response cmedv.
boston_split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)
boston_train <- boston_split %>% training()
boston_test <- boston_split %>% testing()

# Print the split to show the training/testing/total row counts.
boston_split
## <Training/Testing/Total>
## <352/154/506>
352 observations are in the training set and 154 observations are in the testing set.
# Overlay the density of cmedv in the training and test sets to check that the
# two partitions have similar response distributions. Colour is mapped to a
# labelled legend (same blue/red as before) so the curves can be told apart.
ggplot(boston_train, aes(x = cmedv)) +
  geom_line(aes(color = "Training"), stat = "density", trim = TRUE) +
  geom_line(
    data = boston_test,
    aes(color = "Testing"),
    stat = "density",
    trim = TRUE
  ) +
  scale_color_manual(
    name = "Data set",
    breaks = c("Training", "Testing"),
    values = c(Training = "blue", Testing = "red")
  )
8. Fill in the blanks to fit a linear regression model using the rm
feature variable to predict cmedv and compute the RMSE on the test data.
What is the test set RMSE?
# Simple linear regression: predict cmedv from the single feature rm,
# fitted on the training set.
lm1 <- fit(linear_reg(), cmedv ~ rm, data = boston_train)

# Test-set RMSE: pair the observed cmedv values with the model's predictions
# for the same rows, then score them.
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(lm1, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
# Multiple linear regression: predict cmedv from every other column in the
# training set.
lm2 <- fit(linear_reg(), cmedv ~ ., data = boston_train)

# Test-set RMSE: pair the observed cmedv values with the model's predictions
# for the same rows, then score them.
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(lm2, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
# K-nearest-neighbours regression (kknn engine, default hyperparameters):
# predict cmedv from every other column in the training set.
knn_spec <- nearest_neighbor(mode = "regression", engine = "kknn")
knn <- fit(knn_spec, cmedv ~ ., data = boston_train)
rm(knn_spec)

# Test-set RMSE: pair the observed cmedv values with the model's predictions
# for the same rows, then score them.
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(knn, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 3.37