Packages required:

library(tidymodels) 
Warning: package ‘tidymodels’ was built under R version 4.3.3Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
── Attaching packages ────────────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.5     ✔ recipes      1.1.0
✔ dials        1.3.0     ✔ rsample      1.2.1
✔ dplyr        1.1.4     ✔ tibble       3.2.1
✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
✔ infer        1.0.7     ✔ tune         1.2.1
✔ modeldata    1.4.0     ✔ workflows    1.1.4
✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
✔ purrr        1.0.2     ✔ yardstick    1.3.1
Warning: package ‘dials’ was built under R version 4.3.3Warning: package ‘ggplot2’ was built under R version 4.3.3Warning: package ‘infer’ was built under R version 4.3.3Warning: package ‘modeldata’ was built under R version 4.3.3Warning: package ‘parsnip’ was built under R version 4.3.3Warning: package ‘recipes’ was built under R version 4.3.3Warning: package ‘rsample’ was built under R version 4.3.3Warning: package ‘tune’ was built under R version 4.3.3Warning: package ‘workflows’ was built under R version 4.3.3Warning: package ‘workflowsets’ was built under R version 4.3.3Warning: package ‘yardstick’ was built under R version 4.3.3── Conflicts ───────────────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ lubridate 1.9.3     ✔ stringr   1.5.1── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ readr::col_factor() masks scales::col_factor()
✖ purrr::discard()    masks scales::discard()
✖ dplyr::filter()     masks stats::filter()
✖ stringr::fixed()    masks recipes::fixed()
✖ dplyr::lag()        masks stats::lag()
✖ readr::spec()       masks yardstick::spec()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)    
library(dplyr)

Part 1:

Import Boston housing data

boston <- readr::read_csv("C:/Users/thanh/OneDrive - University of Cincinnati/Nie/F24_Data mining/Lab9_boston.csv")
Rows: 506 Columns: 16── Column specification ────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptrat...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split data

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)

train <- training(split)
test <- testing(split)

Question 3:

cor_matrix <- cor(train)
cor_with_cmedv <- cor_matrix["cmedv", ]

sorted_cor <- sort(cor_with_cmedv, decreasing = TRUE)

strongest_positive <- names(sorted_cor[2])  
strongest_negative <- names(tail(sorted_cor, 1))

print(paste("Strongest positive correlation: ", strongest_positive))
[1] "Strongest positive correlation:  rm"
print(paste("Strongest negative correlation: ", strongest_negative))
[1] "Strongest negative correlation:  lstat"

Question 4:

ggplot(train, aes_string(x = strongest_positive, y = "cmedv")) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("cmedv vs", strongest_positive),
       x = strongest_positive, y = "cmedv")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
Please use tidy evaluation idioms with `aes()`. 
See also `vignette("ggplot2-in-packages")` for more information.

Question 5:

simple_lm <- lm(cmedv ~ ., data = train %>% select(cmedv, strongest_positive))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(strongest_positive)

# Now:
data %>% select(all_of(strongest_positive))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
summary(simple_lm)

Call:
lm(formula = cmedv ~ ., data = train %>% select(cmedv, strongest_positive))

Residuals:
     Min       1Q   Median       3Q      Max 
-23.5959  -2.4585   0.3015   2.8637  31.2795 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -35.4295     3.1100  -11.39   <2e-16 ***
rm            9.2170     0.4912   18.76   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.492 on 350 degrees of freedom
Multiple R-squared:  0.5015,    Adjusted R-squared:  0.5001 
F-statistic: 352.1 on 1 and 350 DF,  p-value: < 2.2e-16
coeff <- summary(simple_lm)$coefficients[2, ]
print(coeff)
    Estimate   Std. Error      t value     Pr(>|t|) 
9.217011e+00 4.912137e-01 1.876375e+01 7.459051e-55 

Question 6:

preds <- predict(simple_lm, newdata = test)

rmse_simple <- sqrt(mean((preds - test$cmedv)^2))
print(paste("RMSE for simple model:", rmse_simple))
[1] "RMSE for simple model: 6.83140547006634"

Part 2

Question 7:

full_lm <- lm(cmedv ~ ., data = train)
summary(full_lm)

Call:
lm(formula = cmedv ~ ., data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.4058  -2.4591  -0.7128   1.4936  25.8239 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -6.080e+02  3.420e+02  -1.778 0.076351 .  
lon         -5.648e+00  3.864e+00  -1.462 0.144690    
lat          5.537e+00  4.243e+00   1.305 0.192741    
crim        -8.304e-02  3.956e-02  -2.099 0.036540 *  
zn           3.318e-02  1.654e-02   2.007 0.045592 *  
indus       -1.426e-02  7.379e-02  -0.193 0.846841    
chas         2.282e+00  1.051e+00   2.171 0.030606 *  
nox         -1.165e+01  4.738e+00  -2.459 0.014435 *  
rm           4.369e+00  5.162e-01   8.464 8.13e-16 ***
age         -9.215e-03  1.559e-02  -0.591 0.554928    
dis         -1.259e+00  2.436e-01  -5.168 4.06e-07 ***
rad          2.721e-01  7.903e-02   3.444 0.000647 ***
tax         -1.212e-02  4.362e-03  -2.778 0.005784 ** 
ptratio     -8.739e-01  1.627e-01  -5.369 1.48e-07 ***
b            1.231e-02  3.099e-03   3.972 8.72e-05 ***
lstat       -4.786e-01  6.369e-02  -7.514 5.24e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.683 on 336 degrees of freedom
Multiple R-squared:  0.7509,    Adjusted R-squared:  0.7398 
F-statistic: 67.54 on 15 and 336 DF,  p-value: < 2.2e-16
significant_vars <- summary(full_lm)$coefficients[, 4] < 0.05
significant_vars_names <- names(significant_vars)[significant_vars]
print("Significant variables at p.value < 0.05:")
[1] "Significant variables at p.value < 0.05:"
print(significant_vars_names)
 [1] "crim"    "zn"      "chas"    "nox"     "rm"      "dis"     "rad"     "tax"    
 [9] "ptratio" "b"       "lstat"  
lstat_coeff <- summary(full_lm)$coefficients["lstat", ]
print(paste("lstat coefficient:", lstat_coeff))
[1] "lstat coefficient: -0.47857828967583"    "lstat coefficient: 0.063690571028555"   
[3] "lstat coefficient: -7.51411522847336"    "lstat coefficient: 5.23773617043357e-13"

Question 8:

preds_full <- predict(full_lm, newdata = test)

rmse_full <- sqrt(mean((preds_full - test$cmedv)^2))
print(paste("RMSE for full model:", rmse_full))
[1] "RMSE for full model: 4.82926145935203"
if (rmse_full < rmse_simple) {
  print("The full model performs better.")
} else {
  print("The simple model performs better.")
}
[1] "The full model performs better."

Question 9:


coefficients <- summary(full_lm)$coefficients[, 1]
top_5_influential <- sort(abs(coefficients), decreasing = TRUE)[2:6]  # Exclude intercept
top_5_influential_names <- names(top_5_influential)

print("Top 5 most influential predictor variables:")
[1] "Top 5 most influential predictor variables:"
print(top_5_influential_names)
[1] "nox"  "lon"  "lat"  "rm"   "chas"
# Get the coefficients from the full model
coefficients <- summary(full_lm)$coefficients[, 1]

# Exclude the intercept from the list of coefficients
coefficients_no_intercept <- coefficients[-1]

# Sort the absolute values of the coefficients in descending order
sorted_coefficients <- sort(abs(coefficients_no_intercept), decreasing = TRUE)

# Get the top 5 most influential predictor variables
top_5_influential <- names(sorted_coefficients)[1:5]

print("Top 5 most influential predictor variables:")
[1] "Top 5 most influential predictor variables:"
print(top_5_influential)
[1] "nox"  "lon"  "lat"  "rm"   "chas"
LS0tDQp0aXRsZTogIk1vZHVsZSA5IExhYiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyMgUGFja2FnZXMgcmVxdWlyZWQ6DQpgYGB7cn0NCmxpYnJhcnkodGlkeW1vZGVscykgDQpsaWJyYXJ5KHRpZHl2ZXJzZSkgDQpsaWJyYXJ5KGdncGxvdDIpICAgIA0KbGlicmFyeShkcGx5cikNCmBgYA0KDQojIyMgUGFydCAxOiANCg0KIyMjIyBJbXBvcnQgQm9zdG9uIGhvdXNpbmcgZGF0YSANCmBgYHtyfQ0KYm9zdG9uIDwtIHJlYWRyOjpyZWFkX2NzdigiQzovVXNlcnMvdGhhbmgvT25lRHJpdmUgLSBVbml2ZXJzaXR5IG9mIENpbmNpbm5hdGkvTmllL0YyNF9EYXRhIG1pbmluZy9MYWI5X2Jvc3Rvbi5jc3YiKQ0KDQpgYGANCg0KIyMjIyBTcGxpdCBkYXRhDQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCnNwbGl0IDwtIGluaXRpYWxfc3BsaXQoYm9zdG9uLCBwcm9wID0gMC43LCBzdHJhdGEgPSBjbWVkdikNCg0KdHJhaW4gPC0gdHJhaW5pbmcoc3BsaXQpDQp0ZXN0IDwtIHRlc3Rpbmcoc3BsaXQpDQpgYGANCg0KIyMjIyBRdWVzdGlvbiAzOg0KYGBge3J9DQpjb3JfbWF0cml4IDwtIGNvcih0cmFpbikNCmNvcl93aXRoX2NtZWR2IDwtIGNvcl9tYXRyaXhbImNtZWR2IiwgXQ0KDQpzb3J0ZWRfY29yIDwtIHNvcnQoY29yX3dpdGhfY21lZHYsIGRlY3JlYXNpbmcgPSBUUlVFKQ0KDQpzdHJvbmdlc3RfcG9zaXRpdmUgPC0gbmFtZXMoc29ydGVkX2NvclsyXSkgIA0Kc3Ryb25nZXN0X25lZ2F0aXZlIDwtIG5hbWVzKHRhaWwoc29ydGVkX2NvciwgMSkpDQoNCnByaW50KHBhc3RlKCJTdHJvbmdlc3QgcG9zaXRpdmUgY29ycmVsYXRpb246ICIsIHN0cm9uZ2VzdF9wb3NpdGl2ZSkpDQpwcmludChwYXN0ZSgiU3Ryb25nZXN0IG5lZ2F0aXZlIGNvcnJlbGF0aW9uOiAiLCBzdHJvbmdlc3RfbmVnYXRpdmUpKQ0KDQpgYGANCiMjIyMgUXVlc3Rpb24gNDogDQpgYGB7cn0NCmdncGxvdCh0cmFpbiwgYWVzX3N0cmluZyh4ID0gc3Ryb25nZXN0X3Bvc2l0aXZlLCB5ID0gImNtZWR2IikpICsNCiAgZ2VvbV9wb2ludCgpICsNCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSwgY29sb3IgPSAiYmx1ZSIpICsNCiAgbGFicyh0aXRsZSA9IHBhc3RlKCJjbWVkdiB2cyIsIHN0cm9uZ2VzdF9wb3NpdGl2ZSksDQogICAgICAgeCA9IHN0cm9uZ2VzdF9wb3NpdGl2ZSwgeSA9ICJjbWVkdiIpDQoNCmBgYA0KDQojIyMjIFF1ZXN0aW9uIDU6IA0KYGBge3J9DQpzaW1wbGVfbG0gPC0gbG0oY21lZHYgfiAuLCBkYXRhID0gdHJhaW4gJT4lIHNlbGVjdChjbWVkdiwgc3Ryb25nZXN0X3Bvc2l0aXZlKSkNCg0Kc3VtbWFyeShzaW1wbGVfbG0pDQoNCmNvZWZmIDwtIHN1bW1hcnkoc2ltcGxlX2xtKSRjb2VmZmljaWVudHNbMiwgXQ0KcHJpbnQoY29lZmYpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA2OiANCmBgYHtyfQ0KcHJlZHMgPC0gcHJlZGljdChzaW1wbGVfbG0sIG5ld2RhdGEgPSB0ZXN0KQ0KDQpybXNlX3NpbXBsZSA8LSBzcXJ0KG1lYW4oKHByZWRzIC0gdGVzdCRjbWVkdileMikpDQpwcmludChwYXN0ZSgiUk1TRSBmb3Igc2ltcGxlIG1vZGVsOiIsIHJtc2Vfc2ltcGxlKSkNCg0KYGBgDQojIyMgUGFydCAyIA0KDQojIyMjIFF1ZXN0aW9uIDc6DQpgYGB7cn0NCmZ1bGxfbG0gPC0gbG0oY21lZHYgfiAuLCBkYXRhID0gdHJhaW4pDQpzdW1tYXJ5KGZ1bGxfbG0pDQoNCnNpZ25pZmljYW50X3ZhcnMgPC0gc3VtbWFyeShmdWxsX2xtKSRjb2VmZmljaWVudHNbLCA0XSA8IDAuMDUNCnNpZ25pZmljYW50X3ZhcnNfbmFtZXMgPC0gbmFtZXMoc2lnbmlmaWNhbnRfdmFycylbc2lnbmlmaWNhbnRfdmFyc10NCnByaW50KCJTaWduaWZpY2FudCB2YXJpYWJsZXMgYXQgcC52YWx1ZSA8IDAuMDU6IikNCnByaW50KHNpZ25pZmljYW50X3ZhcnNfbmFtZXMpDQoNCmxzdGF0X2NvZWZmIDwtIHN1bW1hcnkoZnVsbF9sbSkkY29lZmZpY2llbnRzWyJsc3RhdCIsIF0NCnByaW50KHBhc3RlKCJsc3RhdCBjb2VmZmljaWVudDoiLCBsc3RhdF9jb2VmZikpDQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA4OiANCmBgYHtyfQ0KcHJlZHNfZnVsbCA8LSBwcmVkaWN0KGZ1bGxfbG0sIG5ld2RhdGEgPSB0ZXN0KQ0KDQpybXNlX2Z1bGwgPC0gc3FydChtZWFuKChwcmVkc19mdWxsIC0gdGVzdCRjbWVkdileMikpDQpwcmludChwYXN0ZSgiUk1TRSBmb3IgZnVsbCBtb2RlbDoiLCBybXNlX2Z1bGwpKQ0KDQppZiAocm1zZV9mdWxsIDwgcm1zZV9zaW1wbGUpIHsNCiAgcHJpbnQoIlRoZSBmdWxsIG1vZGVsIHBlcmZvcm1zIGJldHRlci4iKQ0KfSBlbHNlIHsNCiAgcHJpbnQoIlRoZSBzaW1wbGUgbW9kZWwgcGVyZm9ybXMgYmV0dGVyLiIpDQp9DQoNCmBgYA0KIyMjIyBRdWVzdGlvbiA5OiANCmBgYHtyfQ0KDQpjb2VmZmljaWVudHMgPC0gc3VtbWFyeShmdWxsX2xtKSRjb2VmZmljaWVudHNbLCAxXQ0KdG9wXzVfaW5mbHVlbnRpYWwgPC0gc29ydChhYnMoY29lZmZpY2llbnRzKSwgZGVjcmVhc2luZyA9IFRSVUUpWzI6Nl0gICMgRXhjbHVkZSBpbnRlcmNlcHQNCnRvcF81X2luZmx1ZW50aWFsX25hbWVzIDwtIG5hbWVzKHRvcF81X2luZmx1ZW50aWFsKQ0KDQpwcmludCgiVG9wIDUgbW9zdCBpbmZsdWVudGlhbCBwcmVkaWN0b3IgdmFyaWFibGVzOiIpDQpwcmludCh0b3BfNV9pbmZsdWVudGlhbF9uYW1lcykNCg0KDQpgYGANCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0K