Packages required:
library(kknn)
library(tidymodels)
Q1: Is this a supervised or unsupervised learning problem? Why?
This is a supervised because we are using predictor variables to
predict a known response variable (cmedv)
Q2: There are 16 variables in this data set. Which variable is the
response variable and which variables are the predictor variables (aka
features)?
cmedv is the response variable. All others are predictor
variables
Q3: Is this a regression or classification problem?
This is a regression problem because the response variable cmedv is
continuous
Q4: Import data set + Missing value + Statistic
boston <- readr::read_csv("C:/Users/thanh/OneDrive - University of Cincinnati/Nie/F24_Data mining/boston.csv")
Rows: 506 Columns: 16── Column specification ───────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sum(is.na(boston))
[1] 0
summary(boston$cmedv)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.00 17.02 21.20 22.53 25.00 50.00
Q5: Split data set:
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
Q6: Number of observation in training set and test set:
nrow(train) #training set
[1] 352
nrow(test) #test set
[1] 154
Q7: Comparision the distribution of cmedv between 2 sets:
library(ggplot2)
ggplot(train, aes(x = cmedv)) + geom_histogram(binwidth = 1) + ggtitle("Training Set cmedv Distribution")

ggplot(test, aes(x = cmedv)) + geom_histogram(binwidth = 1) + ggtitle("Test Set cmedv Distribution")

Q8: Fit a linear regression model with rm as the predictor
lm1 <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
lm1 %>%
predict(new_data = test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
Q9:
lm2 <- linear_reg() %>%
fit(cmedv ~ ., data = train)
lm2 %>%
predict(new_data = test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
Q10:
knn <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("regression") %>%
fit(cmedv ~ ., data = train)
# Compute the RMSE on the test data
knn %>%
predict(new_data = test) %>%
bind_cols(test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
LS0tDQp0aXRsZTogIk1vZHVsZSBMYWIgOCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCg0KIyMjIyBQYWNrYWdlcyByZXF1aXJlZDogDQpgYGB7cn0NCmxpYnJhcnkoa2tubikNCmxpYnJhcnkodGlkeW1vZGVscykNCmBgYA0KDQojIyMjIFExOiBJcyB0aGlzIGEgc3VwZXJ2aXNlZCBvciB1bnN1cGVydmlzZWQgbGVhcm5pbmcgcHJvYmxlbT8gV2h5PyANCg0KVGhpcyBpcyBhIHN1cGVydmlzZWQgYmVjYXVzZSB3ZSBhcmUgdXNpbmcgcHJlZGljdG9yIHZhcmlhYmxlcyB0byBwcmVkaWN0IGEga25vd24gcmVzcG9uc2UgdmFyaWFibGUgKGNtZWR2KQ0KDQoNCiMjIyMgUTI6IFRoZXJlIGFyZSAxNiB2YXJpYWJsZXMgaW4gdGhpcyBkYXRhIHNldC4gV2hpY2ggdmFyaWFibGUgaXMgdGhlIHJlc3BvbnNlIHZhcmlhYmxlIGFuZCB3aGljaCB2YXJpYWJsZXMgYXJlIHRoZSBwcmVkaWN0b3IgdmFyaWFibGVzIChha2EgZmVhdHVyZXMpPw0KDQpjbWVkdiBpcyB0aGUgcmVzcG9uc2UgdmFyaWFibGUuIEFsbCBvdGhlcnMgYXJlIHByZWRpY3RvciB2YXJpYWJsZXMgDQoNCg0KIyMjIyBRMzogSXMgdGhpcyBhIHJlZ3Jlc3Npb24gb3IgY2xhc3NpZmljYXRpb24gcHJvYmxlbT8NCg0KVGhpcyBpcyBhIHJlZ3Jlc3Npb24gcHJvYmxlbSBiZWNhdXNlIHRoZSByZXNwb25zZSB2YXJpYWJsZSBjbWVkdiBpcyBjb250aW51b3VzDQoNCg0KIyMjIyBRNDogSW1wb3J0IGRhdGEgc2V0ICsgTWlzc2luZyB2YWx1ZSArIFN0YXRpc3RpYyANCg0KYGBge3J9DQpib3N0b24gPC0gcmVhZHI6OnJlYWRfY3N2KCJDOi9Vc2Vycy90aGFuaC9PbmVEcml2ZSAtIFVuaXZlcnNpdHkgb2YgQ2luY2lubmF0aS9OaWUvRjI0X0RhdGEgbWluaW5nL2Jvc3Rvbi5jc3YiKQ0KDQpzdW0oaXMubmEoYm9zdG9uKSkNCnN1bW1hcnkoYm9zdG9uJGNtZWR2KQ0KYGBgDQoNCiMjIyMgUTU6IFNwbGl0IGRhdGEgc2V0OiANCmBgYHtyfQ0Kc2V0LnNlZWQoMTIzKQ0Kc3BsaXQgPC0gaW5pdGlhbF9zcGxpdChib3N0b24sIHByb3AgPSAwLjcsIHN0cmF0YSA9IGNtZWR2KQ0KdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQp0ZXN0IDwtIHRlc3Rpbmcoc3BsaXQpDQpgYGANCg0KIyMjIyBRNjogTnVtYmVyIG9mIG9ic2VydmF0aW9uIGluIHRyYWluaW5nIHNldCBhbmQgdGVzdCBzZXQ6IA0KYGBge3J9DQpucm93KHRyYWluKSAgI3RyYWluaW5nIHNldA0KbnJvdyh0ZXN0KSAgICN0ZXN0IHNldA0KYGBgDQoNCiMjIyMgUTc6IENvbXBhcmlzaW9uIHRoZSBkaXN0cmlidXRpb24gb2YgY21lZHYgYmV0d2VlbiAyIHNldHM6IA0KYGBge3J9DQpsaWJyYXJ5KGdncGxvdDIpDQpnZ3Bsb3QodHJhaW4sIGFlcyh4ID0gY21lZHYpKSArIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMSkgKyBnZ3RpdGxlKCJUcmFpbmluZyBTZXQgY21lZHYgRGlzdHJpYnV0aW9uIikNCmdncGxvdCh0ZXN0LCBhZXMoeCA9IGNtZWR2KSkgKyBnZW9tX2hpc3RvZ3JhbShiaW53aWR0aCA9IDEpICsgZ2d0aXRsZSgiVGVzdCBTZXQgY21lZHYgRGlzdHJpYnV0aW9uIikNCmBgYA0KDQojIyMjIFE4OiBGaXQgYSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCB3aXRoIHJtIGFzIHRoZSBwcmVkaWN0b3INCmBgYHtyfQ0KbG0xIDwtIGxpbmVhcl9yZWcoKSAlPiUNCiAgZml0KGNtZWR2IH4gcm0sIGRhdGEgPSB0cmFpbikNCg0KbG0xICU+JQ0KICBwcmVkaWN0KG5ld19kYXRhID0gdGVzdCkgJT4lDQogIGJpbmRfY29scyh0ZXN0ICU+JSBzZWxlY3QoY21lZHYpKSAlPiUNCiAgcm1zZSh0cnV0aCA9IGNtZWR2LCBlc3RpbWF0ZSA9IC5wcmVkKQ0KYGBgDQojIyMjIFE5OiANCmBgYHtyfQ0KbG0yIDwtIGxpbmVhcl9yZWcoKSAlPiUNCiAgZml0KGNtZWR2IH4gLiwgZGF0YSA9IHRyYWluKQ0KDQpsbTIgJT4lDQogIHByZWRpY3QobmV3X2RhdGEgPSB0ZXN0KSAlPiUNCiAgYmluZF9jb2xzKHRlc3QgJT4lIHNlbGVjdChjbWVkdikpICU+JQ0KICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpDQpgYGANCiMjIyMgUTEwOiANCmBgYHtyfQ0Ka25uIDwtIG5lYXJlc3RfbmVpZ2hib3IoKSAlPiUNCiAgc2V0X2VuZ2luZSgia2tubiIpICU+JQ0KICBzZXRfbW9kZSgicmVncmVzc2lvbiIpICU+JQ0KICBmaXQoY21lZHYgfiAuLCBkYXRhID0gdHJhaW4pDQoNCiMgQ29tcHV0ZSB0aGUgUk1TRSBvbiB0aGUgdGVzdCBkYXRhDQprbm4gJT4lDQogIHByZWRpY3QobmV3X2RhdGEgPSB0ZXN0KSAlPiUNCiAgYmluZF9jb2xzKHRlc3QgJT4lIHNlbGVjdChjbWVkdikpICU+JQ0KICBybXNlKHRydXRoID0gY21lZHYsIGVzdGltYXRlID0gLnByZWQpDQpgYGANCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg==