library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library("kknn")
#1. Supervised because there is a target variable
#2.Response variable = cmedv , predictor variables = lon, lat, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat
# 3. Regression Problem
#4
boston <- readr::read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sum(is.na(boston))
## [1] 0
# missing values = 0
summary(boston$cmedv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.20 22.53 25.00 50.00
# min = 5 , 1st Quar = 17.02 , medians = 21.2 , Mean = 22.53 , 3rd Qaur = 25 , max = 50
#5
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
# 6 = 352
# 8
# fit model
lm1 <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
# compute the RMSE on the test data
lm1 %>%
predict(test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
# Test set RMSE: 6.83
# 9
# fit model
lm2 <- linear_reg() %>%
fit(cmedv ~ ., data = train)
# compute the RMSE on the test data
lm2 %>%
predict(test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
# Test set RMSE: 4.83
#10
# fit model
knn <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("regression") %>%
fit(cmedv ~ ., data = train)
# compute the RMSE on the test data
knn %>%
predict(test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 3.37
#RMSE = 3.37