Packages required:

library(tidymodels) 
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)    
library(dplyr)

Part 1:

Import Boston housing data

boston <- readr::read_csv("~/Library/CloudStorage/OneDrive-UniversityofCincinnati/UC Courses/Fall 2024/Data Mining/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split data

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)

train <- training(split)
test <- testing(split)

Question 3:

cor_matrix <- cor(train)
cor_with_cmedv <- cor_matrix["cmedv", ]

sorted_cor <- sort(cor_with_cmedv, decreasing = TRUE)

strongest_pos <- names(sorted_cor[2])  
strongest_neg <- names(tail(sorted_cor, 1))

print(paste("Strongest positive correlation: ", strongest_pos))
## [1] "Strongest positive correlation:  rm"
print(paste("Strongest negative correlation: ", strongest_neg))
## [1] "Strongest negative correlation:  lstat"

Question 4:

ggplot(train, aes_string(x = strongest_pos, y = "cmedv")) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = paste("cmedv vs", strongest_pos),
       x = strongest_pos, y = "cmedv")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Question 5:

simple_lm <- lm(cmedv ~ ., data = train %>% select(cmedv, strongest_pos))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(strongest_pos)
## 
##   # Now:
##   data %>% select(all_of(strongest_pos))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
summary(simple_lm)
## 
## Call:
## lm(formula = cmedv ~ ., data = train %>% select(cmedv, strongest_pos))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.5959  -2.4585   0.3015   2.8637  31.2795 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -35.4295     3.1100  -11.39   <2e-16 ***
## rm            9.2170     0.4912   18.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.492 on 350 degrees of freedom
## Multiple R-squared:  0.5015, Adjusted R-squared:  0.5001 
## F-statistic: 352.1 on 1 and 350 DF,  p-value: < 2.2e-16
coeff <- summary(simple_lm)$coefficients[2, ]
print(coeff)
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 9.217011e+00 4.912137e-01 1.876375e+01 7.459051e-55

Question 6:

preds <- predict(simple_lm, newdata = test)

rmse_simple <- sqrt(mean((preds - test$cmedv)^2))
print(paste("RMSE for simple model:", rmse_simple))
## [1] "RMSE for simple model: 6.83140547006634"

Part 2

Question 7:

full_lm <- lm(cmedv ~ ., data = train)
summary(full_lm)
## 
## Call:
## lm(formula = cmedv ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.4058  -2.4591  -0.7128   1.4936  25.8239 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.080e+02  3.420e+02  -1.778 0.076351 .  
## lon         -5.648e+00  3.864e+00  -1.462 0.144690    
## lat          5.537e+00  4.243e+00   1.305 0.192741    
## crim        -8.304e-02  3.956e-02  -2.099 0.036540 *  
## zn           3.318e-02  1.654e-02   2.007 0.045592 *  
## indus       -1.426e-02  7.379e-02  -0.193 0.846841    
## chas         2.282e+00  1.051e+00   2.171 0.030606 *  
## nox         -1.165e+01  4.738e+00  -2.459 0.014435 *  
## rm           4.369e+00  5.162e-01   8.464 8.13e-16 ***
## age         -9.215e-03  1.559e-02  -0.591 0.554928    
## dis         -1.259e+00  2.436e-01  -5.168 4.06e-07 ***
## rad          2.721e-01  7.903e-02   3.444 0.000647 ***
## tax         -1.212e-02  4.362e-03  -2.778 0.005784 ** 
## ptratio     -8.739e-01  1.627e-01  -5.369 1.48e-07 ***
## b            1.231e-02  3.099e-03   3.972 8.72e-05 ***
## lstat       -4.786e-01  6.369e-02  -7.514 5.24e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.683 on 336 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7398 
## F-statistic: 67.54 on 15 and 336 DF,  p-value: < 2.2e-16
significant_vars <- summary(full_lm)$coefficients[, 4] < 0.05
significant_vars_names <- names(significant_vars)[significant_vars]
print("Significant variables at p.value < 0.05:")
## [1] "Significant variables at p.value < 0.05:"
print(significant_vars_names)
##  [1] "crim"    "zn"      "chas"    "nox"     "rm"      "dis"     "rad"    
##  [8] "tax"     "ptratio" "b"       "lstat"
lstat_coeff <- summary(full_lm)$coefficients["lstat", ]
print(paste("lstat coefficient:", lstat_coeff))
## [1] "lstat coefficient: -0.47857828967583"   
## [2] "lstat coefficient: 0.063690571028555"   
## [3] "lstat coefficient: -7.51411522847337"   
## [4] "lstat coefficient: 5.23773617043338e-13"

Question 8:

preds_full <- predict(full_lm, newdata = test)

rmse_full <- sqrt(mean((preds_full - test$cmedv)^2))
print(paste("RMSE for full model:", rmse_full))
## [1] "RMSE for full model: 4.82926145935202"
if (rmse_full < rmse_simple) {
  print("The full model performs better.")
} else {
  print("The simple model performs better.")
}
## [1] "The full model performs better."

Question 9:

coefficients <- summary(full_lm)$coefficients[, 1]
top_5_influential <- sort(abs(coefficients), decreasing = TRUE)[2:6]
top_5_influential_names <- names(top_5_influential)

print("Top 5 most influential predictor variables:")
## [1] "Top 5 most influential predictor variables:"
print(top_5_influential_names)
## [1] "nox"  "lon"  "lat"  "rm"   "chas"