library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library("kknn")
#1. Supervised because there is a target variable

#2.Response variable = cmedv , predictor variables = lon, lat, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat

# 3. Regression Problem
#4 
boston <- readr::read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sum(is.na(boston))
## [1] 0
# missing values = 0

summary(boston$cmedv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.20   22.53   25.00   50.00
# min = 5 , 1st Quar = 17.02 , medians = 21.2 , Mean = 22.53 , 3rd Qaur = 25 , max = 50
#5
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
# 6 = 352
# 8
# fit model
lm1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)

# compute the RMSE on the test data
lm1 %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.83
# Test set RMSE: 6.83
# 9
# fit model
lm2 <- linear_reg() %>%
  fit(cmedv ~ ., data = train)

# compute the RMSE on the test data
lm2 %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83
# Test set RMSE: 4.83
#10
# fit model
knn <- nearest_neighbor() %>%
  set_engine("kknn") %>%
  set_mode("regression") %>%
  fit(cmedv ~ ., data = train)

# compute the RMSE on the test data
knn %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        3.37
#RMSE = 3.37