Packages required:
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
Part 1:
Import Boston housing data
boston <- readr::read_csv("~/Library/CloudStorage/OneDrive-UniversityofCincinnati/UC Courses/Fall 2024/Data Mining/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Split data
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
Question 3:
cor_matrix <- cor(train)
cor_with_cmedv <- cor_matrix["cmedv", ]
sorted_cor <- sort(cor_with_cmedv, decreasing = TRUE)
strongest_pos <- names(sorted_cor[2])
strongest_neg <- names(tail(sorted_cor, 1))
print(paste("Strongest positive correlation: ", strongest_pos))
## [1] "Strongest positive correlation: rm"
print(paste("Strongest negative correlation: ", strongest_neg))
## [1] "Strongest negative correlation: lstat"
Question 4:
ggplot(train, aes_string(x = strongest_pos, y = "cmedv")) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = paste("cmedv vs", strongest_pos),
x = strongest_pos, y = "cmedv")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Question 5:
simple_lm <- lm(cmedv ~ ., data = train %>% select(cmedv, strongest_pos))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(strongest_pos)
##
## # Now:
## data %>% select(all_of(strongest_pos))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
summary(simple_lm)
##
## Call:
## lm(formula = cmedv ~ ., data = train %>% select(cmedv, strongest_pos))
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.5959 -2.4585 0.3015 2.8637 31.2795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35.4295 3.1100 -11.39 <2e-16 ***
## rm 9.2170 0.4912 18.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.492 on 350 degrees of freedom
## Multiple R-squared: 0.5015, Adjusted R-squared: 0.5001
## F-statistic: 352.1 on 1 and 350 DF, p-value: < 2.2e-16
coeff <- summary(simple_lm)$coefficients[2, ]
print(coeff)
## Estimate Std. Error t value Pr(>|t|)
## 9.217011e+00 4.912137e-01 1.876375e+01 7.459051e-55
Question 6:
preds <- predict(simple_lm, newdata = test)
rmse_simple <- sqrt(mean((preds - test$cmedv)^2))
print(paste("RMSE for simple model:", rmse_simple))
## [1] "RMSE for simple model: 6.83140547006634"
Part 2
Question 7:
full_lm <- lm(cmedv ~ ., data = train)
summary(full_lm)
##
## Call:
## lm(formula = cmedv ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.4058 -2.4591 -0.7128 1.4936 25.8239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.080e+02 3.420e+02 -1.778 0.076351 .
## lon -5.648e+00 3.864e+00 -1.462 0.144690
## lat 5.537e+00 4.243e+00 1.305 0.192741
## crim -8.304e-02 3.956e-02 -2.099 0.036540 *
## zn 3.318e-02 1.654e-02 2.007 0.045592 *
## indus -1.426e-02 7.379e-02 -0.193 0.846841
## chas 2.282e+00 1.051e+00 2.171 0.030606 *
## nox -1.165e+01 4.738e+00 -2.459 0.014435 *
## rm 4.369e+00 5.162e-01 8.464 8.13e-16 ***
## age -9.215e-03 1.559e-02 -0.591 0.554928
## dis -1.259e+00 2.436e-01 -5.168 4.06e-07 ***
## rad 2.721e-01 7.903e-02 3.444 0.000647 ***
## tax -1.212e-02 4.362e-03 -2.778 0.005784 **
## ptratio -8.739e-01 1.627e-01 -5.369 1.48e-07 ***
## b 1.231e-02 3.099e-03 3.972 8.72e-05 ***
## lstat -4.786e-01 6.369e-02 -7.514 5.24e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.683 on 336 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7398
## F-statistic: 67.54 on 15 and 336 DF, p-value: < 2.2e-16
significant_vars <- summary(full_lm)$coefficients[, 4] < 0.05
significant_vars_names <- names(significant_vars)[significant_vars]
print("Significant variables at p.value < 0.05:")
## [1] "Significant variables at p.value < 0.05:"
print(significant_vars_names)
## [1] "crim" "zn" "chas" "nox" "rm" "dis" "rad"
## [8] "tax" "ptratio" "b" "lstat"
lstat_coeff <- summary(full_lm)$coefficients["lstat", ]
print(paste("lstat coefficient:", lstat_coeff))
## [1] "lstat coefficient: -0.47857828967583"
## [2] "lstat coefficient: 0.063690571028555"
## [3] "lstat coefficient: -7.51411522847337"
## [4] "lstat coefficient: 5.23773617043338e-13"
Question 8:
preds_full <- predict(full_lm, newdata = test)
rmse_full <- sqrt(mean((preds_full - test$cmedv)^2))
print(paste("RMSE for full model:", rmse_full))
## [1] "RMSE for full model: 4.82926145935202"
if (rmse_full < rmse_simple) {
print("The full model performs better.")
} else {
print("The simple model performs better.")
}
## [1] "The full model performs better."
Question 9:
coefficients <- summary(full_lm)$coefficients[, 1]
top_5_influential <- sort(abs(coefficients), decreasing = TRUE)[2:6]
top_5_influential_names <- names(top_5_influential)
print("Top 5 most influential predictor variables:")
## [1] "Top 5 most influential predictor variables:"
print(top_5_influential_names)
## [1] "nox" "lon" "lat" "rm" "chas"