Load Data

# Read the Boston housing CSV into `boston`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists -- confirm before sharing.
boston_csv <- "C:/Users/letha/OneDrive/Documents/BANA4080/boston.csv"
boston <- read_csv(boston_csv)

## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Split Data (70% Train, 30% Test)

# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# 70% of rows go to training, the rest to testing; sampling is stratified
# on the response cmedv.
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)
train <- split %>% training()
test <- split %>% testing()

Correlation Analysis

# Pairwise correlations among the training columns, leaving out lon and lat.
cor_matrix <- train %>%
  select(-lon, -lat) %>%
  cor()

# Correlation of every remaining column with cmedv (cmedv itself included).
cmedv_cor <- cor_matrix["cmedv", ]
# Position 2 of the descending order skips cmedv's correlation with itself.
strongest_pos <- names(sort(cmedv_cor, decreasing = TRUE))[2]
# The first entry of the ascending order is the lowest correlation.
strongest_neg <- names(sort(cmedv_cor))[1]

Plot Strongest Positive Correlation

# Scatter plot of cmedv against the predictor named by strongest_pos, with a
# least-squares line overlaid and no confidence band.
# `.data[[strongest_pos]]` looks the column up by its stored name and replaces
# the deprecated aes_string(); labs() keeps the x-axis titled with that name.
train %>%
  ggplot(aes(x = .data[[strongest_pos]], y = cmedv)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = strongest_pos) +
  theme_minimal()

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

Simple Linear Regression

# Regress cmedv on the single predictor named by strongest_pos.
# reformulate() builds the formula from the stored column name, so the
# coefficient is labelled with the real column instead of
# `get(strongest_pos)`, and later predictions no longer depend on the global
# strongest_pos still holding the same value.
simple_formula <- reformulate(strongest_pos, response = "cmedv")
simple_model <- lm(simple_formula, data = train)
sum_simple <- summary(simple_model)

RMSE for Simple Model

# Predict cmedv for the held-out rows with the one-predictor model.
simple_pred <- simple_model %>% predict(newdata = test)
# Root mean squared error of those predictions against the observed values.
simple_sq_err <- (simple_pred - test$cmedv)^2
simple_rmse <- sqrt(mean(simple_sq_err))

Multiple Linear Regression

# Fit a linear model of cmedv on all other columns of train (the `.` on the
# right-hand side of the formula), then keep its summary for inspection.
multi_model <- lm(cmedv ~ ., data = train)
sum_multi <- summary(multi_model)

RMSE for Multiple Regression

# Predict cmedv for the held-out rows with the all-predictor model.
multi_pred <- multi_model %>% predict(newdata = test)
# Root mean squared error of those predictions against the observed values.
multi_sq_err <- (multi_pred - test$cmedv)^2
multi_rmse <- sqrt(mean(multi_sq_err))

Top 5 Influential Variables

# Rank predictors by the absolute t-statistic of their coefficient rather than
# by raw coefficient size: a raw coefficient depends on the units of its
# predictor, so magnitudes are not comparable across variables.
# NOTE: this ordering can differ from the raw-coefficient ordering; re-run the
# notebook to refresh any previously printed result.
t_stats <- coef(summary(multi_model))[, "t value"]
# Drop the intercept; it is not a predictor.
t_stats <- t_stats[names(t_stats) != "(Intercept)"]
# head() returns fewer than five names, instead of NA padding, when the model
# has fewer than five predictors.
top_vars <- head(names(sort(abs(t_stats), decreasing = TRUE)), 5)

Output Results

# Gather the headline results into a named list. The value is left unassigned
# at top level so that it is printed.
setNames(
  list(strongest_pos, strongest_neg, simple_rmse, multi_rmse, top_vars),
  c(
    "Strongest Positive Correlation",
    "Strongest Negative Correlation",
    "Simple Model RMSE",
    "Multiple Model RMSE",
    "Top 5 Predictors"
  )
)

## $`Strongest Positive Correlation`
## [1] "rm"
## 
## $`Strongest Negative Correlation`
## [1] "lstat"
## 
## $`Simple Model RMSE`
## [1] 6.831405
## 
## $`Multiple Model RMSE`
## [1] 4.829261
## 
## $`Top 5 Predictors`
## [1] "nox"  "lon"  "lat"  "rm"   "chas"

R Notebook

Load Data

Split Data (70% Train, 30% Test)

Correlation Analysis

Plot Strongest Positive Correlation

Simple Linear Regression

RMSE for Simple Model

Multiple Linear Regression

RMSE for Multiple Regression

Top 5 Influential Variables

Output Results