#1.
It is a supervised learning problem because we are predicting a labeled
outcome, the median value of owner-occupied homes, using the other
variables as features.
#2. TARGET - Response variable: cmedv (median value of owner-occupied
homes)
FEATURES - Predictor variables: lon, lat, crim, zn, indus, chas, nox,
rm, age, dis, rad, tax, ptratio, lstat
#3. Since cmedv is a continuous variable, this is a regression
problem.
#START OF CODE
#install packages, load in data set, check for missing values and stats

#run once if these packages are not already installed
install.packages(c("rsample", "tidymodels", "tidyverse", "kknn", "conflicted", "dplyr"))

library(readr)
library(rsample)
library(tidyverse)
library(tidymodels)
library(conflicted)
library(dplyr)
library(kknn)

boston <- read_csv("Desktop/R/Data/boston.csv")
View(boston)

#check for missing values
sum(is.na(boston))
#> [1] 0
min_cmedv <- min(boston$cmedv, na.rm = TRUE)
max_cmedv <- max(boston$cmedv, na.rm = TRUE)
mean_cmedv <- mean(boston$cmedv, na.rm = TRUE)
median_cmedv <- median(boston$cmedv, na.rm = TRUE)
min_cmedv
max_cmedv
mean_cmedv
median_cmedv
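#optional check (a sketch, not part of the original answers): summary() reports
#the same statistics in a single call, and names() lists the columns identified
#as the target and features in #2
summary(boston$cmedv)
names(boston)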
#split the data into 70% training and 30% test sets
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
nrow(train)
#> [1] 352
nrow(test)
#> [1] 154
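#optional check (a sketch): confirm the split proportions are close to 70/30
nrow(train) / nrow(boston)
nrow(test) / nrow(boston)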
#compare distributions
train %>%
  ggplot(aes(x = cmedv)) +
  geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of cmedv in Training Set")

test %>%
  ggplot(aes(x = cmedv)) +
  geom_histogram(binwidth = 1, fill = "red", alpha = 0.7) +
  labs(title = "Distribution of cmedv in Test Set")
#fit simple linear regression model (rm only) and compute RMSE

lm1 <- linear_reg() %>%
  fit(cmedv ~ rm, data = train)

lm1 %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard        6.83
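#optional (a sketch, assuming broom's tidy() as re-exported by tidymodels):
#inspect the fitted intercept and the slope on rm for the simple model
tidy(lm1)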
#fit linear regression model with all features and compute RMSE

lm2 <- linear_reg() %>%
  fit(cmedv ~ ., data = train)

lm2 %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard        4.83
#fit K-nearest neighbors model and calculate RMSE

knn <- nearest_neighbor() %>%
  set_engine("kknn") %>%
  set_mode("regression") %>%
  fit(cmedv ~ ., data = train)

knn %>%
  predict(test) %>%
  bind_cols(test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard        3.37
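#optional summary (a sketch using the RMSE values printed above): collect the
#three test-set RMSEs in one tibble for a side-by-side comparison
tibble(
  model = c("lm: cmedv ~ rm", "lm: all features", "knn: all features"),
  test_rmse = c(6.83, 4.83, 3.37)
)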