Prerequisites
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
✔ tibble 3.1.8 ✔ stringr 1.4.1
✔ tidyr 1.2.0 ✔ forcats 0.5.2
✔ purrr 0.3.4 ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
# Install tidymodels only when it is missing, so re-running or knitting the
# document does not re-download the package (or fail without network access).
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.2/tidymodels_1.0.0.tgz'
Content type 'application/x-gzip' length 112363 bytes (109 KB)
==================================================
downloaded 109 KB
The downloaded binary packages are in
/var/folders/n6/cxyj8ln97v94mmwvbp991tbh0000gn/T//RtmpOcqNWF/downloaded_packages
library(tidymodels)
── Attaching packages ──────────────────────────────── tidymodels 1.0.0 ──
✔ broom 1.0.0 ✔ rsample 1.1.0
✔ dials 1.0.0 ✔ tune 1.0.1
✔ infer 1.0.3 ✔ workflows 1.1.0
✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
✔ parsnip 1.0.2 ✔ yardstick 1.1.0
✔ recipes 1.0.1
── Conflicts ─────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
library(here)
library(readr)
# Read the Boston housing data. The path is relative to the working
# directory, which is already the "Module 8" folder; repeating
# "Documents/Learning R/Module 8" made read_csv() look in a nested location
# that does not exist.
boston <- read_csv("data/boston.csv")
Error: 'Documents/Learning R/Module 8/data/boston.csv' does not exist in current working directory ('/Users/oanhdang/Documents/Learning R/Module 8').
Model Tasks
Question 1
Is this a supervised or unsupervised learning problem? Why?
This is a supervised learning problem because we want to predict a
known, labeled target variable (cmedv).
Question 2
There are 16 variables in this data set. Which variable is the
response variable and which variables are the predictor variables (aka
features)?
- Response variable: cmedv
- Predictor variables: lon, lat, crim, zn, sq.ft, indus, chas, nox,
rm, age, dis, rad, tax, ptratio, lstat
Question 3
Given the type of variable cmedv is, is this a regression or
classification problem?
This is a regression problem because the response variable (cmedv) is numeric and continuous.
Question 4
library(here)
library(readr)
# Import the Boston housing data. The path is relative to the working
# directory (the "Module 8" folder); prefixing it with
# "Documents/Learning R/Module 8" pointed at a location that does not exist.
boston <- read_csv("data/boston.csv")
Error: 'Documents/Learning R/Module 8/data/boston.csv' does not exist in current working directory ('/Users/oanhdang/Documents/Learning R/Module 8').
Question 5
Fill in the blanks to split the data into a training set and test
set using a 70-30% split. Be sure to include the set.seed(123) so that
your train and test sets are the same size as mine.
# Fix the RNG seed so the partition is reproducible, then make a 70/30
# train/test split stratified on the response variable cmedv.
set.seed(123)
boston_split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)

# Extract the two partitions from the split object.
boston_train <- boston_split %>% training()
boston_test <- boston_split %>% testing()
Question 6:
How many observations are in the training set and test set?
# Dimensions of the test set: number of rows (observations), then columns.
dim(boston_test)
[1] 154 16
# Dimensions of the training set: number of rows (observations), then columns.
dim(boston_train)
[1] 352 16
Question 7
Compare the distribution of cmedv between the training set and test
set. Do they appear to have the
same distribution or do they differ significantly?
# Overlay the density of cmedv for both partitions on one set of axes:
# the training set in the default line colour, the test set in red.
ggplot(mapping = aes(x = cmedv)) +
  geom_line(data = boston_train, stat = "density", trim = TRUE) +
  geom_line(data = boston_test, stat = "density", trim = TRUE, colour = "red")

The training and test sets appear to have approximately the same distribution of cmedv.
Question 8
Fill in the blanks to fit a linear regression model using the rm
feature variable to predict cmedv and compute the RMSE on the test data.
What is the test set RMSE?
# Fit a simple linear regression of cmedv on rm using the training data.
boston_lm1 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(cmedv ~ rm, data = boston_train)

# Score the held-out test set: pair the observed cmedv values with the
# model's .pred column, then compute the RMSE between them.
boston_test %>%
  select(cmedv) %>%
  bind_cols(predict(boston_lm1, new_data = boston_test)) %>%
  rmse(truth = cmedv, estimate = .pred)
LS0tCnRpdGxlOiAiTW9kdWxlIDggTGFiIgphdXRob3I6ICJPYW5oIERhbmciCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCi0tLQojIyBQcmVyZXF1aXNpdGVzCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKaW5zdGFsbC5wYWNrYWdlcygidGlkeW1vZGVscyIpCmxpYnJhcnkodGlkeW1vZGVscykKbGlicmFyeShoZXJlKQpsaWJyYXJ5KHJlYWRyKQpib3N0b24gPC0gcmVhZF9jc3YoIkRvY3VtZW50cy9MZWFybmluZyBSL01vZHVsZSA4L2RhdGEvYm9zdG9uLmNzdiIpCmBgYAoKCiMjIE1vZGVsIFRhc2tzCgojIyBRdWVzdGlvbiAxCiMjIyBJcyB0aGlzIGEgc3VwZXJ2aXNlZCBvciB1bnN1cGVydmlzZWQgbGVhcm5pbmcgcHJvYmxlbT8gV2h5PwoKVGhpcyBpcyBhIHN1cGVydmlzZWQgbGVhcm5pbmcgcHJvYmxlbSBiZWNhdXNlIHdlIHdhbnQgdG8gcHJlZGljdCB0aGUgc3BlY2lmaWMgdGFyZ2V0IChjbWRldikKCiMjIFF1ZXN0aW9uIDIKIyMjIFRoZXJlIGFyZSAxNiB2YXJpYWJsZXMgaW4gdGhpcyBkYXRhIHNldC4gV2hpY2ggdmFyaWFibGUgaXMgdGhlIHJlc3BvbnNlIHZhcmlhYmxlIGFuZCB3aGljaCB2YXJpYWJsZXMgYXJlIHRoZSBwcmVkaWN0b3IgdmFyaWFibGVzIChha2EgZmVhdHVyZXMpPwoKKiBSZXNwb25zZSB2YXJpYWJsZTogY21lZHYKKiBQcmVkaWN0b3IgdmFyaWFibGVzOgpsb24sIGxhdCwgY3JpbSwgem4sIHNxLmZ0LCBpbmR1cywgY2hhcywgbm94LCBybSwgYWdlLCBkaXMsIHJhZCwgdGF4LCBwdHJhdGlvLCBsc3RhdAoKIyMgUXVlc3Rpb24gMwojIyMgR2l2ZW4gdGhlIHR5cGUgb2YgdmFyaWFibGUgY21lZHYgaXMsIGlzIHRoaXMgYSByZWdyZXNzaW9uIG9yIGNsYXNzaWZpY2F0aW9uIHByb2JsZW0/ClRoaXMgaXMgcmVncmVzc2lvbiBwcm9ibGVtIGJlY2F1c2UgdGhlIG91dHB1dCBpcyBudW1lcmljIGNvbnRpbm91cwojIyBRdWVzdGlvbiA0CmBgYHtyfQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkocmVhZHIpCmJvc3RvbiA8LSByZWFkX2NzdigiRG9jdW1lbnRzL0xlYXJuaW5nIFIvTW9kdWxlIDgvZGF0YS9ib3N0b24uY3N2IikKYGBgCgojIyBRdWVzdGlvbiA1CiMjIyBGaWxsIGluIHRoZSBibGFua3MgdG8gc3BsaXQgdGhlIGRhdGEgaW50byBhIHRyYWluaW5nIHNldCBhbmQgdGVzdCBzZXQgdXNpbmcgYSA3MC0zMCUgc3BsaXQuIEJlIHN1cmUgdG8gaW5jbHVkZSB0aGUgc2V0LnNlZWQoMTIzKSBzbyB0aGF0IHlvdXIgdHJhaW4gYW5kIHRlc3Qgc2V0cyBhcmUgdGhlIHNhbWUgc2l6ZSBhcyBtaW5lLgoKYGBge3J9CnNldC5zZWVkKDEyMykKYm9zdG9uX3NwbGl0PC0gaW5pdGlhbF9zcGxpdChib3N0b24sIHByb3AgPSAwLjcsIHN0cmF0YSA9IGNtZWR2KQpib3N0b25fdHJhaW48LSB0cmFpbmluZyhib3N0b25fc3BsaXQpCmJvc3Rvbl90ZXN0PC0gdGVzdGluZyhib3N0b25fc3BsaXQpCmBgYAoKIyMgUXVlc3Rpb24gNjoKIyMjIEhvdyBtYW55IG9ic2VydmF0aW9ucyBhcmUgaW4gdGhlIHRy
YWluaW5nIHNldCBhbmQgdGVzdCBzZXQ/CmBgYHtyfQpkaW0oYm9zdG9uX3Rlc3QpICAgCmRpbShib3N0b25fdHJhaW4pCmBgYAojIyBRdWVzdGlvYiA3CiMjIyBDb21wYXJlIHRoZSBkaXN0cmlidXRpb24gb2YgY21lZHYgYmV0d2VlbiB0aGUgdHJhaW5pbmcgc2V0IGFuZCB0ZXN0IHNldC4gRG8gdGhleSBhcHBlYXIgdG8gaGF2ZSB0aGUKc2FtZSBkaXN0cmlidXRpb24gb3IgZG8gdGhleSBkaWZmZXIgc2lnbmlmaWNhbnRseT8KYGBge3J9CmdncGxvdChib3N0b25fdHJhaW4sIGFlcyh4ID0gY21lZHYpKSArIAogIGdlb21fbGluZShzdGF0ID0gImRlbnNpdHkiLCB0cmltID0gVFJVRSkgKyAKICBnZW9tX2xpbmUoZGF0YSA9IGJvc3Rvbl90ZXN0LCBzdGF0ID0gImRlbnNpdHkiLCB0cmltID0gVFJVRSwgY29sID0gInJlZCIpCmBgYApUaGUgdHJhaW5pbmcgYW5kIHRoZSB0ZXN0IHNldCBoYXZlIHRoZSBzYW1lIGRpc3RyaWJ1dGlvbgoKIyMgUXVlc3Rpb24gOAojIyMgRmlsbCBpbiB0aGUgYmxhbmtzIHRvIGZpdCBhIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsIHVzaW5nIHRoZSBybSBmZWF0dXJlIHZhcmlhYmxlIHRvIHByZWRpY3QgY21lZHYgYW5kIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YS4gV2hhdCBpcyB0aGUgdGVzdCBzZXQgUk1TRT8KYGBge3J9CiMgZml0IG1vZGVsCmJvc3Rvbl9sbTEgPC0gbGluZWFyX3JlZygpICU+JQogIHNldF9lbmdpbmUoJ2xtJykgJT4lCiAgZml0KGNtZWR2IH4gcm0gLCBkYXRhID0gYm9zdG9uX3RyYWluKQoKIyBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKYm9zdG9uX2xtMSAlPiUKcHJlZGljdChib3N0b25fdGVzdCkgJT4lCmJpbmRfY29scyhib3N0b25fdGVzdCAlPiUgc2VsZWN0KGNtZWR2KSkgJT4lCnJtc2UodHJ1dGggPSBjbWVkdiwgZXN0aW1hdGUgPSAucHJlZCkKYGBgCiMjIFF1ZXN0aW9uIDkKIyMjIEZpbGwgaW4gdGhlIGJsYW5rcyB0byBmaXQgYSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCB1c2luZyBhbGwgYXZhaWxhYmxlIGZlYXR1cmVzIHRvIHByZWRpY3QgY21lZHYgYW5kIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YS4gV2hhdCBpcyB0aGUgdGVzdCBzZXQgUk1TRT8gSXMgdGhpcyBiZXR0ZXIgdGhhbiB0aGUgcHJldmlvdXMgbW9kZWzigJlzIHBlcmZvcm1hbmNlPwoKYGBge3J9CiMgZml0IG1vZGVsCmJvc3Rvbl9sbTIgPC0gbGluZWFyX3JlZygpICU+JQogIHNldF9lbmdpbmUoJ2xtJykgJT4lCiAgZml0KGNtZWR2IH4gLiAsIGRhdGEgPSBib3N0b25fdHJhaW4pCgojIGNvbXB1dGUgdGhlIFJNU0Ugb24gdGhlIHRlc3QgZGF0YQpib3N0b25fbG0yICU+JQpwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUKYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQpgYGAKIyMgUXVlc3Rpb24gMTAKIyMjIEZpdCBhIEstbmVhcmVzdCBuZWlnaGJvciBtb2RlbCB0
aGF0IHVzZXMgYWxsIGF2YWlsYWJsZSBmZWF0dXJlcyB0byBwcmVkaWN0IGNtZWR2IGFuZCBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEuIFdoYXQgaXMgdGhlIHRlc3Qgc2V0IFJNU0U/IElzIHRoaXMgYmV0dGVyIHRoYW4gdGhlIHByZXZpb3VzIHR3byBtb2RlbHPigJkgcGVyZm9ybWFuY2VzPwoKYGBge3J9Cmluc3RhbGwucGFja2FnZXMoImtrbm4iKQpsaWJyYXJ5KGtrbm4pCgojIGZpdCBtb2RlbAprbm4gPC0gbmVhcmVzdF9uZWlnaGJvcigpICU+JQpzZXRfZW5naW5lKCdra25uJykgJT4lCnNldF9tb2RlKCJyZWdyZXNzaW9uIikgJT4lCmZpdCggY21lZHYgfiAuICwgZGF0YSA9IGJvc3Rvbl90cmFpbikKIyBjb21wdXRlIHRoZSBSTVNFIG9uIHRoZSB0ZXN0IGRhdGEKa25uICU+JQpwcmVkaWN0KGJvc3Rvbl90ZXN0KSAlPiUKYmluZF9jb2xzKGJvc3Rvbl90ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUKcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQoKYGBgCmBgYHtyfQpzdW0oaXMubmEoYm9zdG9uJGNtZWR2KSkKbWluKGJvc3RvbiRjbWVkdikKbWF4KGJvc3RvbiRjbWVkdikKbWVkaWFuKGJvc3RvbiRjbWVkdikKbWVhbihib3N0b24kY21lZHYpCmBgYAoKCg==