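# 1. This is a supervised learning problem: we are predicting a known target, cmedv.
# 2. cmedv is the response variable; all other columns are predictors.
# 3. Regression, because cmedv is a numeric (continuous) value.
# 4. No, there are no missing values: sum(is.na(boston)) below returns 0.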
# Attach the modelling stack this script relies on. tidymodels attaches
# rsample (initial_split/training/testing), parsnip (linear_reg,
# nearest_neighbor, fit), yardstick (rmse), ggplot2 and dplyr (%>%, select,
# bind_cols). readr is not attached by tidymodels, so it stays namespaced.
# NOTE(review): the "kknn" engine used further down also needs the kknn
# package to be installed.
library(tidymodels)

# Read the Boston housing data from the working directory into a tibble.
boston <- readr::read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count missing cells column by column, then total them across the table.
sum(vapply(boston, function(col) sum(is.na(col)), integer(1)))
## [1] 0
# Numeric summary (quartiles and mean) of the response variable.
summary(boston[["cmedv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.20 22.53 25.00 50.00
#5.
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)
# Hold out data for testing: prop = 0.7 goes to training, stratified on the
# response so both partitions see a similar spread of cmedv values.
split <- initial_split(data = boston, prop = 0.7, strata = cmedv)
test <- testing(split)
train <- training(split)
#6.
# Number of observations in the training partition.
dim(train)[1]
## [1] 352
# Number of observations in the testing partition.
dim(test)[1]
## [1] 154
# training = 352. testing = 154
#7.
# Summarise the response in the training partition ...
summary(train[["cmedv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.30 17.02 21.20 22.56 25.00 50.00
# ... and in the testing partition, for a side-by-side comparison.
summary(test[["cmedv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.15 22.45 25.00 50.00
# Histogram of cmedv in the training partition (bins of width 2).
ggplot(data = train, mapping = aes(x = cmedv)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Training")

# Same histogram for the testing partition, so the two shapes can be compared.
ggplot(data = test, mapping = aes(x = cmedv)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Testing")

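# The two distributions are very similar (same quartiles, nearly identical
# median and mean); they differ only slightly, e.g. the minimum is 6.30 in
# training versus 5.00 in testing.
# 8.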
# fit model
# Simple linear regression: cmedv explained by rm only, fitted on train.
lm1 <- fit(linear_reg(), cmedv ~ rm, data = train)
# Predict the held-out rows, put each prediction next to its true cmedv,
# and report the root mean squared error.
lm1_scored <- bind_cols(predict(lm1, new_data = test), select(test, cmedv))
rmse(lm1_scored, truth = cmedv, estimate = .pred)
# Test RMSE = 6.83
#9.
# fit model
# Multiple linear regression: cmedv explained by every other column in train.
lm2 <- fit(linear_reg(), cmedv ~ ., data = train)
# Predict the held-out rows, put each prediction next to its true cmedv,
# and report the root mean squared error.
lm2_scored <- bind_cols(predict(lm2, new_data = test), select(test, cmedv))
rmse(lm2_scored, truth = cmedv, estimate = .pred)
# Test RMSE = 4.83, which is better (lower) than the single-predictor model.
#10.
# fit model
# K-nearest-neighbours regression specification, built one setting at a time:
# start from the default spec, pick the kknn engine, then set regression mode.
knn_spec <- nearest_neighbor()
knn_spec <- set_engine(knn_spec, "kknn")
knn_spec <- set_mode(knn_spec, "regression")
# Fit on the training partition using every other column as a predictor.
knn <- fit(knn_spec, cmedv ~ ., data = train)
# Predict the held-out rows, put each prediction next to its true cmedv,
# and report the root mean squared error.
knn_scored <- bind_cols(predict(knn, new_data = test), select(test, cmedv))
rmse(knn_scored, truth = cmedv, estimate = .pred)
# Test RMSE = 3.37, the lowest (best) of the three models.