Packages required:
library(tidymodels)
Warning: package ‘tidymodels’ was built under R version 4.3.3Registered S3 method overwritten by 'data.table':
method from
print.data.table
── Attaching packages ────────────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom 1.0.5 ✔ recipes 1.1.0
✔ dials 1.3.0 ✔ rsample 1.2.1
✔ dplyr 1.1.4 ✔ tibble 3.2.1
✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
✔ infer 1.0.7 ✔ tune 1.2.1
✔ modeldata 1.4.0 ✔ workflows 1.1.4
✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
✔ purrr 1.0.2 ✔ yardstick 1.3.1
Warning: package ‘dials’ was built under R version 4.3.3Warning: package ‘ggplot2’ was built under R version 4.3.3Warning: package ‘infer’ was built under R version 4.3.3Warning: package ‘modeldata’ was built under R version 4.3.3Warning: package ‘parsnip’ was built under R version 4.3.3Warning: package ‘recipes’ was built under R version 4.3.3Warning: package ‘rsample’ was built under R version 4.3.3Warning: package ‘tune’ was built under R version 4.3.3Warning: package ‘workflows’ was built under R version 4.3.3Warning: package ‘workflowsets’ was built under R version 4.3.3Warning: package ‘yardstick’ was built under R version 4.3.3── Conflicts ───────────────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ recipes::step() masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ lubridate 1.9.3 ✔ stringr 1.5.1── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ readr::col_factor() masks scales::col_factor()
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter() masks stats::filter()
✖ stringr::fixed() masks recipes::fixed()
✖ dplyr::lag() masks stats::lag()
✖ readr::spec() masks yardstick::spec()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(dplyr)
Part 1:
Import Boston housing data
boston <- readr::read_csv("C:/Users/thanh/OneDrive - University of Cincinnati/Nie/F24_Data mining/Lab9_boston.csv")
Rows: 506 Columns: 16── Column specification ────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptrat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Split data
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
Question 3:
cor_matrix <- cor(train)
cor_with_cmedv <- cor_matrix["cmedv", ]
sorted_cor <- sort(cor_with_cmedv, decreasing = TRUE)
strongest_positive <- names(sorted_cor[2])
strongest_negative <- names(tail(sorted_cor, 1))
print(paste("Strongest positive correlation: ", strongest_positive))
[1] "Strongest positive correlation: rm"
print(paste("Strongest negative correlation: ", strongest_negative))
[1] "Strongest negative correlation: lstat"
Question 4:
ggplot(train, aes_string(x = strongest_positive, y = "cmedv")) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("cmedv vs", strongest_positive),
x = strongest_positive, y = "cmedv")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
Please use tidy evaluation idioms with `aes()`.
See also `vignette("ggplot2-in-packages")` for more information.

Question 5:
simple_lm <- lm(cmedv ~ ., data = train %>% select(cmedv, strongest_positive))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(strongest_positive)
# Now:
data %>% select(all_of(strongest_positive))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
summary(simple_lm)
Call:
lm(formula = cmedv ~ ., data = train %>% select(cmedv, strongest_positive))
Residuals:
Min 1Q Median 3Q Max
-23.5959 -2.4585 0.3015 2.8637 31.2795
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -35.4295 3.1100 -11.39 <2e-16 ***
rm 9.2170 0.4912 18.76 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.492 on 350 degrees of freedom
Multiple R-squared: 0.5015, Adjusted R-squared: 0.5001
F-statistic: 352.1 on 1 and 350 DF, p-value: < 2.2e-16
coeff <- summary(simple_lm)$coefficients[2, ]
print(coeff)
Estimate Std. Error t value Pr(>|t|)
9.217011e+00 4.912137e-01 1.876375e+01 7.459051e-55
Question 6:
preds <- predict(simple_lm, newdata = test)
rmse_simple <- sqrt(mean((preds - test$cmedv)^2))
print(paste("RMSE for simple model:", rmse_simple))
[1] "RMSE for simple model: 6.83140547006634"
Part 2
Question 7:
full_lm <- lm(cmedv ~ ., data = train)
summary(full_lm)
Call:
lm(formula = cmedv ~ ., data = train)
Residuals:
Min 1Q Median 3Q Max
-16.4058 -2.4591 -0.7128 1.4936 25.8239
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.080e+02 3.420e+02 -1.778 0.076351 .
lon -5.648e+00 3.864e+00 -1.462 0.144690
lat 5.537e+00 4.243e+00 1.305 0.192741
crim -8.304e-02 3.956e-02 -2.099 0.036540 *
zn 3.318e-02 1.654e-02 2.007 0.045592 *
indus -1.426e-02 7.379e-02 -0.193 0.846841
chas 2.282e+00 1.051e+00 2.171 0.030606 *
nox -1.165e+01 4.738e+00 -2.459 0.014435 *
rm 4.369e+00 5.162e-01 8.464 8.13e-16 ***
age -9.215e-03 1.559e-02 -0.591 0.554928
dis -1.259e+00 2.436e-01 -5.168 4.06e-07 ***
rad 2.721e-01 7.903e-02 3.444 0.000647 ***
tax -1.212e-02 4.362e-03 -2.778 0.005784 **
ptratio -8.739e-01 1.627e-01 -5.369 1.48e-07 ***
b 1.231e-02 3.099e-03 3.972 8.72e-05 ***
lstat -4.786e-01 6.369e-02 -7.514 5.24e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.683 on 336 degrees of freedom
Multiple R-squared: 0.7509, Adjusted R-squared: 0.7398
F-statistic: 67.54 on 15 and 336 DF, p-value: < 2.2e-16
significant_vars <- summary(full_lm)$coefficients[, 4] < 0.05
significant_vars_names <- names(significant_vars)[significant_vars]
print("Significant variables at p.value < 0.05:")
[1] "Significant variables at p.value < 0.05:"
print(significant_vars_names)
[1] "crim" "zn" "chas" "nox" "rm" "dis" "rad" "tax"
[9] "ptratio" "b" "lstat"
lstat_coeff <- summary(full_lm)$coefficients["lstat", ]
print(paste("lstat coefficient:", lstat_coeff))
[1] "lstat coefficient: -0.47857828967583" "lstat coefficient: 0.063690571028555"
[3] "lstat coefficient: -7.51411522847336" "lstat coefficient: 5.23773617043357e-13"
Question 8:
preds_full <- predict(full_lm, newdata = test)
rmse_full <- sqrt(mean((preds_full - test$cmedv)^2))
print(paste("RMSE for full model:", rmse_full))
[1] "RMSE for full model: 4.82926145935203"
if (rmse_full < rmse_simple) {
print("The full model performs better.")
} else {
print("The simple model performs better.")
}
[1] "The full model performs better."
Question 9:
coefficients <- summary(full_lm)$coefficients[, 1]
top_5_influential <- sort(abs(coefficients), decreasing = TRUE)[2:6] # Exclude intercept
top_5_influential_names <- names(top_5_influential)
print("Top 5 most influential predictor variables:")
[1] "Top 5 most influential predictor variables:"
print(top_5_influential_names)
[1] "nox" "lon" "lat" "rm" "chas"
# Get the coefficients from the full model
coefficients <- summary(full_lm)$coefficients[, 1]
# Exclude the intercept from the list of coefficients
coefficients_no_intercept <- coefficients[-1]
# Sort the absolute values of the coefficients in descending order
sorted_coefficients <- sort(abs(coefficients_no_intercept), decreasing = TRUE)
# Get the top 5 most influential predictor variables
top_5_influential <- names(sorted_coefficients)[1:5]
print("Top 5 most influential predictor variables:")
[1] "Top 5 most influential predictor variables:"
print(top_5_influential)
[1] "nox" "lon" "lat" "rm" "chas"
LS0tDQp0aXRsZTogIk1vZHVsZSA5IExhYiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyMgUGFja2FnZXMgcmVxdWlyZWQ6DQpgYGB7cn0NCmxpYnJhcnkodGlkeW1vZGVscykgDQpsaWJyYXJ5KHRpZHl2ZXJzZSkgDQpsaWJyYXJ5KGdncGxvdDIpICAgIA0KbGlicmFyeShkcGx5cikNCmBgYA0KDQojIyMgUGFydCAxOiANCg0KIyMjIyBJbXBvcnQgQm9zdG9uIGhvdXNpbmcgZGF0YSANCmBgYHtyfQ0KYm9zdG9uIDwtIHJlYWRyOjpyZWFkX2NzdigiQzovVXNlcnMvdGhhbmgvT25lRHJpdmUgLSBVbml2ZXJzaXR5IG9mIENpbmNpbm5hdGkvTmllL0YyNF9EYXRhIG1pbmluZy9MYWI5X2Jvc3Rvbi5jc3YiKQ0KDQpgYGANCg0KIyMjIyBTcGxpdCBkYXRhDQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYm9zdG9uLCBwcm9wID0gMC43LCBzdHJhdGEgPSBjbWVkdikNCg0KdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQp0ZXN0IDwtIHRlc3Rpbmcoc3BsaXQpDQpgYGANCg0KIyMjIyBRdWVzdGlvbiAzOg0KYGBge3J9DQpjb3JfbWF0cml4IDwtIGNvcih0cmFpbikNCmNvcl93aXRoX2NtZWR2IDwtIGNvcl9tYXRyaXhbImNtZWR2IiwgXQ0KDQpzb3J0ZWRfY29yIDwtIHNvcnQoY29yX3dpdGhfY21lZHYsIGRlY3JlYXNpbmcgPSBUUlVFKQ0KDQpzdHJvbmdlc3RfcG9zaXRpdmUgPC0gbmFtZXMoc29ydGVkX2NvclsyXSkgIA0Kc3Ryb25nZXN0X25lZ2F0aXZlIDwtIG5hbWVzKHRhaWwoc29ydGVkX2NvciwgMSkpDQoNCnByaW50KHBhc3RlKCJTdHJvbmdlc3QgcG9zaXRpdmUgY29ycmVsYXRpb246ICIsIHN0cm9uZ2VzdF9wb3NpdGl2ZSkpDQpwcmludChwYXN0ZSgiU3Ryb25nZXN0IG5lZ2F0aXZlIGNvcnJlbGF0aW9uOiAiLCBzdHJvbmdlc3RfbmVnYXRpdmUpKQ0KDQpgYGANCiMjIyMgUXVlc3Rpb24gNDogDQpgYGB7cn0NCmdncGxvdCh0cmFpbiwgYWVzX3N0cmluZyh4ID0gc3Ryb25nZXN0X3Bvc2l0aXZlLCB5ID0gImNtZWR2IikpICsNCiAgZ2VvbV9wb2ludCgpICsNCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSwgY29sb3IgPSAiYmx1ZSIpICsNCiAgbGFicyh0aXRsZSA9IHBhc3RlKCJjbWVkdiB2cyIsIHN0cm9uZ2VzdF9wb3NpdGl2ZSksDQogICAgICAgeCA9IHN0cm9uZ2VzdF9wb3NpdGl2ZSwgeSA9ICJjbWVkdiIpDQoNCmBgYA0KDQojIyMjIFF1ZXN0aW9uIDU6IA0KYGBge3J9DQpzaW1wbGVfbG0gPC0gbG0oY21lZHYgfiAuLCBkYXRhID0gdHJhaW4gJT4lIHNlbGVjdChjbWVkdiwgc3Ryb25nZXN0X3Bvc2l0aXZlKSkNCg0Kc3VtbWFyeShzaW1wbGVfbG0pDQoNCmNvZWZmIDwtIHN1bW1hcnkoc2ltcGxlX2xtKSRjb2VmZmljaWVudHNbMiwgXQ0KcHJpbnQoY29lZmYpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA2OiANCmBgYHtyfQ0KcHJlZHMgPC0gcHJlZGljdChzaW1wbGVfbG0sIG5ld2RhdGEgPSB0ZXN0KQ0KDQpybXNlX3NpbXBsZSA8LSBzcXJ0KG1lYW4oKHByZWRzIC0gdGVzdCRjbWVkdileMikpDQpwcmludChwYXN0ZSgiUk1TRSBmb3Igc2ltcGxlIG1vZGVsOiIsIHJtc2Vfc2ltcGxlKSkNCg0KYGBgDQojIyMgUGFydCAyIA0KDQojIyMjIFF1ZXN0aW9uIDc6DQpgYGB7cn0NCmZ1bGxfbG0gPC0gbG0oY21lZHYgfiAuLCBkYXRhID0gdHJhaW4pDQpzdW1tYXJ5KGZ1bGxfbG0pDQoNCnNpZ25pZmljYW50X3ZhcnMgPC0gc3VtbWFyeShmdWxsX2xtKSRjb2VmZmljaWVudHNbLCA0XSA8IDAuMDUNCnNpZ25pZmljYW50X3ZhcnNfbmFtZXMgPC0gbmFtZXMoc2lnbmlmaWNhbnRfdmFycylbc2lnbmlmaWNhbnRfdmFyc10NCnByaW50KCJTaWduaWZpY2FudCB2YXJpYWJsZXMgYXQgcC52YWx1ZSA8IDAuMDU6IikNCnByaW50KHNpZ25pZmljYW50X3ZhcnNfbmFtZXMpDQoNCmxzdGF0X2NvZWZmIDwtIHN1bW1hcnkoZnVsbF9sbSkkY29lZmZpY2llbnRzWyJsc3RhdCIsIF0NCnByaW50KHBhc3RlKCJsc3RhdCBjb2VmZmljaWVudDoiLCBsc3RhdF9jb2VmZikpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA4OiANCmBgYHtyfQ0KcHJlZHNfZnVsbCA8LSBwcmVkaWN0KGZ1bGxfbG0sIG5ld2RhdGEgPSB0ZXN0KQ0KDQpybXNlX2Z1bGwgPC0gc3FydChtZWFuKChwcmVkc19mdWxsIC0gdGVzdCRjbWVkdileMikpDQpwcmludChwYXN0ZSgiUk1TRSBmb3IgZnVsbCBtb2RlbDoiLCBybXNlX2Z1bGwpKQ0KDQppZiAocm1zZV9mdWxsIDwgcm1zZV9zaW1wbGUpIHsNCiAgcHJpbnQoIlRoZSBmdWxsIG1vZGVsIHBlcmZvcm1zIGJldHRlci4iKQ0KfSBlbHNlIHsNCiAgcHJpbnQoIlRoZSBzaW1wbGUgbW9kZWwgcGVyZm9ybXMgYmV0dGVyLiIpDQp9DQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA5OiANCmBgYHtyfQ0KDQpjb2VmZmljaWVudHMgPC0gc3VtbWFyeShmdWxsX2xtKSRjb2VmZmljaWVudHNbLCAxXQ0KdG9wXzVfaW5mbHVlbnRpYWwgPC0gc29ydChhYnMoY29lZmZpY2llbnRzKSwgZGVjcmVhc2luZyA9IFRSVUUpWzI6Nl0gICMgRXhjbHVkZSBpbnRlcmNlcHQNCnRvcF81X2luZmx1ZW50aWFsX25hbWVzIDwtIG5hbWVzKHRvcF81X2luZmx1ZW50aWFsKQ0KDQpwcmludCgiVG9wIDUgbW9zdCBpbmZsdWVudGlhbCBwcmVkaWN0b3IgdmFyaWFibGVzOiIpDQpwcmludCh0b3BfNV9pbmZsdWVudGlhbF9uYW1lcykNCg0KDQpgYGANCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0K