#1. This is a supervised learning problem, because we are trying to predict a known target value (cmedv).

#2. cmedv is the response variable; all other columns are predictors.

#3. Regression, because the response cmedv is a continuous numeric value.

#4. No, there are no missing values: sum(is.na(boston)) returns 0.

# Load the Boston housing data; readr echoes the parsed column types below.
boston <- readr::read_csv(file = "boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Total number of missing cells across every column of the table.
sum(is.na(boston))
## [1] 0
# Distribution summary of the response variable.
summary(boston[["cmedv"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.20   22.53   25.00   50.00
#5.

# Seed the RNG so the random partition below is reproducible.
set.seed(123)
# Split with 70% of rows going to training, stratified on the response cmedv.
split <- initial_split(data = boston, prop = 0.7, strata = cmedv)
test <- testing(split)
train <- training(split)
#6.
# Row counts of each partition.
nrow(train)
## [1] 352
nrow(test)
## [1] 154
# training = 352 rows, testing = 154 rows
#7.
# Numeric summaries of the response in each partition.
summary(train[["cmedv"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.30   17.02   21.20   22.56   25.00   50.00
summary(test[["cmedv"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.15   22.45   25.00   50.00
# Histograms of the response (bins 2 units wide), one per partition.
ggplot(data = train, mapping = aes(x = cmedv)) +
  geom_histogram(binwidth = 2) +
  ggtitle("Training")

ggplot(data = test, mapping = aes(x = cmedv)) +
  geom_histogram(binwidth = 2) +
  ggtitle("Testing")

# The two distributions differ only slightly: the quartiles and maximum match,
# the medians (21.20 vs 21.15) and means (22.56 vs 22.45) are close, and only
# the minimum differs noticeably (6.30 vs 5.00).
# Helper shared by #8-#10: score a fitted model on held-out data.
#
# model:    a fitted model object that predict() accepts
# new_data: data frame to predict on; must contain the cmedv column
# Returns the result of rmse() comparing the observed cmedv values with the
# model's .pred column.
holdout_rmse <- function(model, new_data) {
  model %>%
    predict(new_data) %>%
    # put the observed response next to the predictions for scoring
    bind_cols(new_data %>% select(cmedv)) %>%
    rmse(truth = cmedv, estimate = .pred)
}

#8.
# fit model: linear regression of cmedv on rm only
lm1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)
# compute the RMSE on the test data
holdout_rmse(lm1, test)
# 6.83
#9.
# fit model: linear regression of cmedv on every other column in train
lm2 <- linear_reg() %>%
  fit(cmedv ~ ., data = train)
# compute the RMSE on the test data
holdout_rmse(lm2, test)
# 4.83, this is better
#10.
# fit model: k-nearest-neighbours regression (kknn engine) on every other
# column in train
knn <- nearest_neighbor() %>%
  set_engine("kknn") %>%
  set_mode("regression") %>%
  fit(cmedv ~ ., data = train)
# compute the RMSE on the test data
holdout_rmse(knn, test)
# 3.37, which is the best of the 3!