Load Data
# Read the Boston housing data from a local CSV file into `boston`.
# NOTE(review): the path is an absolute, machine-specific location; the script
# will only run where this exact file exists.
boston <- read_csv(
  file = "C:/Users/letha/OneDrive/Documents/BANA4080/boston.csv"
)
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Split Data (70% Train, 30% Test)
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Partition `boston` with 70% of rows going to training, stratifying the
# sampling on the response column `cmedv`.
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)

# Pull the two partitions out of the split object.
train <- split %>% training()
test <- split %>% testing()
Correlation Analysis
# Pairwise correlations between the training columns, leaving out the
# `lon` and `lat` columns.
cor_matrix <- cor(train %>% select(-c(lon, lat)))

# Correlation of every remaining column with `cmedv`. The response's
# correlation with itself (always 1) is removed explicitly rather than assuming
# it is the first element after sorting.
cmedv_cor <- cor_matrix["cmedv", ]
cmedv_cor <- cmedv_cor[names(cmedv_cor) != "cmedv"]

# Names of the predictors with the largest and the smallest (most negative)
# correlation with `cmedv`.
strongest_pos <- names(which.max(cmedv_cor))
strongest_neg <- names(which.min(cmedv_cor))
Plot Strongest Positive Correlation
# Scatter plot of `cmedv` against the predictor named in `strongest_pos`, with
# a straight-line (lm) fit overlaid and no confidence band.
# `aes_string()` is deprecated, so the column whose name is held in a string is
# looked up with the `.data` pronoun instead. The axis labels are set
# explicitly so the x axis shows the column name, not the `.data[[...]]`
# expression.
train %>%
  ggplot(aes(x = .data[[strongest_pos]], y = cmedv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = strongest_pos, y = "cmedv") +
  theme_minimal()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Simple Linear Regression
# Regress `cmedv` on the single predictor named in `strongest_pos`.
# The formula is built from the column name with `reformulate()` instead of
# embedding `get(strongest_pos)` in it: the fitted term is then labelled with
# the real column name, and later `predict()` calls no longer depend on the
# global `strongest_pos` still holding the same value.
simple_formula <- reformulate(strongest_pos, response = "cmedv")
simple_model <- lm(simple_formula, data = train)

# Keep the model summary for inspection.
sum_simple <- summary(simple_model)
RMSE for Simple Model
# Predict `cmedv` for the held-out rows with the one-predictor model.
simple_pred <- simple_model %>%
  predict(newdata = test)

# Root mean squared error between observed and predicted `cmedv` on the test
# set.
simple_rmse <- ((test[["cmedv"]] - simple_pred)^2) %>%
  mean() %>%
  sqrt()
Multiple Linear Regression
# Regress `cmedv` on every other column of `train`.
# NOTE(review): `.` keeps `lon` and `lat` as predictors, although they were
# excluded from the correlation analysis — confirm this is intended.
multi_model <- lm(formula = cmedv ~ ., data = train)

# Keep the model summary for inspection.
sum_multi <- multi_model %>% summary()
RMSE for Multiple Regression
# Predict `cmedv` for the held-out rows with the all-predictor model.
multi_pred <- multi_model %>%
  predict(newdata = test)

# Root mean squared error between observed and predicted `cmedv` on the test
# set.
multi_rmse <- ((test[["cmedv"]] - multi_pred)^2) %>%
  mean() %>%
  sqrt()
Top 5 Influential Variables
# Rank the predictors of the multiple regression by the absolute value of
# their t-statistic and keep the five strongest.
# Raw coefficient magnitudes are not comparable across predictors because they
# depend on each predictor's units; the t-statistic scales each estimate by its
# standard error, so it can be compared across predictors.
coef_table <- coef(summary(multi_model))
coef_table <- coef_table[
  rownames(coef_table) != "(Intercept)", ,
  drop = FALSE
]
t_rank <- order(abs(coef_table[, "t value"]), decreasing = TRUE)

# `head()` returns fewer than five names when the model has fewer than five
# predictors, instead of padding the result with NA.
top_vars <- head(rownames(coef_table)[t_rank], 5)
Output Results
# Collect the headline results in a named list; as a top-level expression it
# is printed when the chunk runs.
list(
  `Strongest Positive Correlation` = strongest_pos,
  `Strongest Negative Correlation` = strongest_neg,
  `Simple Model RMSE`              = simple_rmse,
  `Multiple Model RMSE`            = multi_rmse,
  `Top 5 Predictors`               = top_vars
)
## $`Strongest Positive Correlation`
## [1] "rm"
##
## $`Strongest Negative Correlation`
## [1] "lstat"
##
## $`Simple Model RMSE`
## [1] 6.831405
##
## $`Multiple Model RMSE`
## [1] 4.829261
##
## $`Top 5 Predictors`
## [1] "nox" "lon" "lat" "rm" "chas"