Prerequisites

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ───────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
✔ tibble  3.1.8     ✔ stringr 1.4.1
✔ tidyr   1.2.0     ✔ forcats 0.5.2
✔ purrr   0.3.4     ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
# Install tidymodels only when it is not already available, so re-running the
# document does not download the package again on every run.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/tidymodels_1.0.0.tgz'
Content type 'application/x-gzip' length 112363 bytes (109 KB)
==================================================
downloaded 109 KB

The downloaded binary packages are in
    /var/folders/n6/cxyj8ln97v94mmwvbp991tbh0000gn/T//RtmpOcqNWF/downloaded_packages
library(tidymodels)
── Attaching packages ──────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.0     ✔ rsample      1.1.0
✔ dials        1.0.0     ✔ tune         1.0.1
✔ infer        1.0.3     ✔ workflows    1.1.0
✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
✔ parsnip      1.0.2     ✔ yardstick    1.1.0
✔ recipes      1.0.1     
── Conflicts ─────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
library(here)
library(readr)
# Read the Boston housing data. The working directory is already the
# "Module 8" folder, so the path must be relative to it; repeating the
# "Documents/Learning R/Module 8/" prefix makes read_csv() fail with
# "does not exist in current working directory".
boston <- read_csv("data/boston.csv")

Model Tasks

Question 1

Is this a supervised or unsupervised learning problem? Why?

This is a supervised learning problem because we want to predict a specific, known target variable (cmedv).

Question 2

There are 16 variables in this data set. Which variable is the response variable and which variables are the predictor variables (aka features)?

  • Response variable: cmedv
  • Predictor variables: lon, lat, crim, zn, sq.ft, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat

Question 3

Given the type of variable cmedv is, is this a regression or classification problem?

This is a regression problem because the response is a continuous numeric variable.

Question 4

library(here)
library(readr)
# Read the Boston housing data. The working directory is already the
# "Module 8" folder, so the path must be relative to it; repeating the
# "Documents/Learning R/Module 8/" prefix makes read_csv() fail with
# "does not exist in current working directory".
boston <- read_csv("data/boston.csv")

Question 5

Fill in the blanks to split the data into a training set and test set using a 70-30% split. Be sure to include the set.seed(123) so that your train and test sets are the same size as mine.

# Fix the RNG seed so the split is reproducible, then hold out 30% of the rows
# for testing, stratifying on the response (cmedv).
set.seed(123)
boston_split <- initial_split(data = boston, prop = 0.7, strata = cmedv)
boston_train <- boston_split %>% training()
boston_test <- boston_split %>% testing()

Question 6:

How many observations are in the training set and test set?

# Rows (observations) and columns (variables) in the test set
dim(boston_test)   
[1] 154  16
# Rows (observations) and columns (variables) in the training set
dim(boston_train)
[1] 352  16

Question 7

Compare the distribution of cmedv between the training set and test set. Do they appear to have the same distribution or do they differ significantly?

# Overlay the density of cmedv in the training set (default colour) with the
# density of cmedv in the test set (red) on the same axes.
ggplot(data = boston_train, mapping = aes(x = cmedv)) +
  geom_line(stat = "density", trim = TRUE) +
  geom_line(
    data = boston_test,
    stat = "density",
    trim = TRUE,
    colour = "red"
  )

The training and test sets appear to have essentially the same distribution of cmedv.

Question 8

Fill in the blanks to fit a linear regression model using the rm feature variable to predict cmedv and compute the RMSE on the test data. What is the test set RMSE?

# Fit: linear regression (lm engine) with rm as the only predictor of cmedv
boston_lm1 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(formula = cmedv ~ rm, data = boston_train)

# Evaluate: pair the observed test-set cmedv with the model's predictions and
# score them with RMSE
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(boston_lm1, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)

Question 9

Fill in the blanks to fit a linear regression model using all available features to predict cmedv and compute the RMSE on the test data. What is the test set RMSE? Is this better than the previous model’s performance?

# Fit: linear regression (lm engine) of cmedv on every other column
boston_lm2 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(formula = cmedv ~ ., data = boston_train)

# Evaluate: pair the observed test-set cmedv with the model's predictions and
# score them with RMSE
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(boston_lm2, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)

Question 10

Fit a K-nearest neighbor model that uses all available features to predict cmedv and compute the RMSE on the test data. What is the test set RMSE? Is this better than the previous two models’ performances?

# Install kknn only when it is not already available, so re-running the
# document does not download the package again on every run.
if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/kknn_1.3.1.tgz'
Content type 'application/x-gzip' length 319886 bytes (312 KB)
==================================================
downloaded 312 KB

The downloaded binary packages are in
    /var/folders/n6/cxyj8ln97v94mmwvbp991tbh0000gn/T//RtmpOcqNWF/downloaded_packages
library(kknn)

# Fit: k-nearest-neighbours regression (kknn engine) of cmedv on every other
# column
knn <- nearest_neighbor() %>%
  set_mode("regression") %>%
  set_engine("kknn") %>%
  fit(formula = cmedv ~ ., data = boston_train)

# Evaluate: pair the observed test-set cmedv with the model's predictions and
# score them with RMSE
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(knn, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)
NA
# Quick checks on the response in the full data set: number of missing
# values, then minimum, maximum, median and mean of cmedv
sum(is.na(boston$cmedv))
[1] 0
min(boston$cmedv)
[1] 5
max(boston$cmedv)
[1] 50
median(boston$cmedv)
[1] 21.2
mean(boston$cmedv)
[1] 22.52885
LS0tCnRpdGxlOiAiTW9kdWxlIDggTGFiIgphdXRob3I6ICJPYW5oIERhbmciCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCi0tLQojIyBQcmVyZXF1aXNpdGVzCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKaW5zdGFsbC5wYWNrYWdlcygidGlkeW1vZGVscyIpCmxpYnJhcnkodGlkeW1vZGVscykKbGlicmFyeShoZXJlKQpsaWJyYXJ5KHJlYWRyKQpib3N0b24gPC0gcmVhZF9jc3YoIkRvY3VtZW50cy9MZWFybmluZyBSL01vZHVsZSA4L2RhdGEvYm9zdG9uLmNzdiIpCmBgYAoKCiMjIE1vZGVsIFRhc2tzCgojIyBRdWVzdGlvbiAxCiMjIyBJcyB0aGlzIGEgc3VwZXJ2aXNlZCBvciB1bnN1cGVydmlzZWQgbGVhcm5pbmcgcHJvYmxlbT8gV2h5PwoKVGhpcyBpcyBhIHN1cGVydmlzZWQgbGVhcm5pbmcgcHJvYmxlbSBiZWNhdXNlIHdlIHdhbnQgdG8gcHJlZGljdCB0aGUgc3BlY2lmaWMgdGFyZ2V0IChjbWRldikKCiMjIFF1ZXN0aW9uIDIKIyMjIFRoZXJlIGFyZSAxNiB2YXJpYWJsZXMgaW4gdGhpcyBkYXRhIHNldC4gV2hpY2ggdmFyaWFibGUgaXMgdGhlIHJlc3BvbnNlIHZhcmlhYmxlIGFuZCB3aGljaCB2YXJpYWJsZXMgYXJlIHRoZSBwcmVkaWN0b3IgdmFyaWFibGVzIChha2EgZmVhdHVyZXMpPwoKKiBSZXNwb25zZSB2YXJpYWJsZTogY21lZHYKKiBQcmVkaWN0b3IgdmFyaWFibGVzOgpsb24sIGxhdCwgY3JpbSwgem4sIHNxLmZ0LCBpbmR1cywgY2hhcywgbm94LCBybSwgYWdlLCBkaXMsIHJhZCwgdGF4LCBwdHJhdGlvLCBsc3RhdAoKIyMgUXVlc3Rpb24gMwojIyMgR2l2ZW4gdGhlIHR5cGUgb2YgdmFyaWFibGUgY21lZHYgaXMsIGlzIHRoaXMgYSByZWdyZXNzaW9uIG9yIGNsYXNzaWZpY2F0aW9uIHByb2JsZW0/ClRoaXMgaXMgcmVncmVzc2lvbiBwcm9ibGVtIGJlY2F1c2UgdGhlIG91dHB1dCBpcyBudW1lcmljIGNvbnRpbm91cwojIyBRdWVzdGlvbiA0CmBgYHtyfQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkocmVhZHIpCmJvc3RvbiA8LSByZWFkX2NzdigiRG9jdW1lbnRzL0xlYXJuaW5nIFIvTW9kdWxlIDgvZGF0YS9ib3N0b24uY3N2IikKYGBgCgojIyBRdWVzdGlvbiA1CiMjIyBGaWxsIGluIHRoZSBibGFua3MgdG8gc3BsaXQgdGhlIGRhdGEgaW50byBhIHRyYWluaW5nIHNldCBhbmQgdGVzdCBzZXQgdXNpbmcgYSA3MC0zMCUgc3BsaXQuIEJlIHN1cmUgdG8gaW5jbHVkZSB0aGUgc2V0LnNlZWQoMTIzKSBzbyB0aGF0IHlvdXIgdHJhaW4gYW5kIHRlc3Qgc2V0cyBhcmUgdGhlIHNhbWUgc2l6ZSBhcyBtaW5lLgoKYGBge3J9CnNldC5zZWVkKDEyMykKYm9zdG9uX3NwbGl0PC0gaW5pdGlhbF9zcGxpdChib3N0b24sIHByb3AgPSAwLjcsIHN0cmF0YSA9IGNtZWR2KQpib3N0b25fdHJhaW48LSB0cmFpbmluZyhib3N0b25fc3BsaXQpCmJvc3Rvbl90ZXN0PC0gdGVzdGluZyhib3N0b25fc3BsaXQpCmBgYAoKIyMgUXVlc3Rpb24gNjoKIyMjIEhvdyBtYW55IG9ic2VydmF0aW9ucyBhcmUgaW4gdGhlIHRy
YWluaW5nIHNldCBhbmQgdGVzdCBzZXQ/CmBgYHtyfQpkaW0oYm9zdG9uX3Rlc3QpICAgCmRpbShib3N0b25fdHJhaW4pCmBgYAojIyBRdWVzdGlvYiA3CiMjIyBDb21wYXJlIHRoZSBkaXN0cmlidXRpb24gb2YgY21lZHYgYmV0d2VlbiB0aGUgdHJhaW5pbmcgc2V0IGFuZCB0ZXN0IHNldC4gRG8gdGhleSBhcHBlYXIgdG8gaGF2ZSB0aGUKc2FtZSBkaXN0cmlidXRpb24gb3IgZG8gdGhleSBkaWZmZXIgc2lnbmlmaWNhbnRseT8KYGBge3J9CmdncGxvdChib3N0b25fdHJhaW4sIGFlcyh4ID0gY21lZHYpKSArIAogIGdlb21fbGluZShzdGF0ID0gImRlbnNpdHkiLCB0cmltID0gVFJVRSkgKyAKICBnZW9tX2xpbmUoZGF0YSA9IGJvc3Rvbl90ZXN0LCBzdGF0ID0gImRlbnNpdHkiLCB0cmltID0gVFJVRSwgY29sID0gInJlZCIpCmBgYApUaGUgdHJhaW5pbmcgYW5kIHRoZSB0ZXN0IHNldCBoYXZlIHRoZSBzYW1lIGRpc3RyaWJ1dGlvbgoKIyMgUXVlc3Rpb24gOAojIyMgRmlsbCBpbiB0aGUgYmxhbmtzIHRvIGZpdCBhIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsIHVzaW5nIHRoZSBybSBmZWF0dXJlIHZhcmlhYmxlIHRvIHByZWRpY3QgY21lZHYgYW5kIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YS4gV2hhdCBpcyB0aGUgdGVzdCBzZXQgUk1TRT8KYGBge3J9CiMgZml0IG1vZGVsCmJvc3Rvbl9sbTEgPC0gbGluZWFyX3JlZygpICU+JQogIHNldF9lbmdpbmUoJ2xtJykgJT4lCiAgZml0KGNtZWR2IH4gcm0gLCBkYXRhID0gYm9zdG9uX3RyYWluKQoKIyBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKYm9zdG9uX2xtMSAlPiUKcHJlZGljdChib3N0b25fdGVzdCkgJT4lCmJpbmRfY29scyhib3N0b25fdGVzdCAlPiUgc2VsZWN0KGNtZWR2KSkgJT4lCnJtc2UodHJ1dGggPSBjbWVkdiwgZXN0aW1hdGUgPSAucHJlZCkKYGBgCiMjIFF1ZXN0aW9uIDkKIyMjIEZpbGwgaW4gdGhlIGJsYW5rcyB0byBmaXQgYSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCB1c2luZyBhbGwgYXZhaWxhYmxlIGZlYXR1cmVzIHRvIHByZWRpY3QgY21lZHYgYW5kIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YS4gV2hhdCBpcyB0aGUgdGVzdCBzZXQgUk1TRT8gSXMgdGhpcyBiZXR0ZXIgdGhhbiB0aGUgcHJldmlvdXMgbW9kZWzigJlzIHBlcmZvcm1hbmNlPwoKYGBge3J9CiMgZml0IG1vZGVsCmJvc3Rvbl9sbTIgPC0gbGluZWFyX3JlZygpICU+JQogIHNldF9lbmdpbmUoJ2xtJykgJT4lCiAgZml0KGNtZWR2IH4gLiAsIGRhdGEgPSBib3N0b25fdHJhaW4pCgojIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YQpib3N0b25fbG0yICU+JQpwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUKYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQpgYGAKIyMgUXVlc3Rpb24gMTAKIyMjIEZpdCBhIEstbmVhcmVzdCBuZWlnaGJvciBtb2RlbCB0
aGF0IHVzZXMgYWxsIGF2YWlsYWJsZSBmZWF0dXJlcyB0byBwcmVkaWN0IGNtZWR2IGFuZCBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEuIFdoYXQgaXMgdGhlIHRlc3Qgc2V0IFJNU0U/IElzIHRoaXMgYmV0dGVyIHRoYW4gdGhlIHByZXZpb3VzIHR3byBtb2RlbHPigJkgcGVyZm9ybWFuY2VzPwoKYGBge3J9Cmluc3RhbGwucGFja2FnZXMoImtrbm4iKQpsaWJyYXJ5KGtrbm4pCgojIGZpdCBtb2RlbAprbm4gPC0gbmVhcmVzdF9uZWlnaGJvcigpICU+JQpzZXRfZW5naW5lKCdra25uJykgJT4lCnNldF9tb2RlKCJyZWdyZXNzaW9uIikgJT4lCmZpdCggY21lZHYgfiAuICwgZGF0YSA9IGJvc3Rvbl90cmFpbikKIyBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKa25uICU+JQpwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUKYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQoKYGBgCmBgYHtyfQpzdW0oaXMubmEoYm9zdG9uJGNtZWR2KSkKbWluKGJvc3RvbiRjbWVkdikKbWF4KGJvc3RvbiRjbWVkdikKbWVkaWFuKGJvc3RvbiRjbWVkdikKbWVhbihib3N0b24kY21lZHYpCmBgYAoKCg==