#PACKAGES
# Data wrangling & visualization packages
library(tidyverse)
# Modeling packages
library(tidymodels)
#IMPORT
library(readr)
boston <- read_csv("data/boston-1.csv")
head(boston)
## # A tibble: 6 × 16
## lon lat cmedv crim zn indus chas nox rm age dis rad
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -71.0 42.3 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1
## 2 -71.0 42.3 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2
## 3 -70.9 42.3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2
## 4 -70.9 42.3 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3
## 5 -70.9 42.3 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3
## 6 -70.9 42.3 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3
## # … with 4 more variables: tax <dbl>, ptratio <dbl>, b <dbl>, lstat <dbl>
#2. Split the data into a training set and test set using a 70-30% split. Be sure to include set.seed(123) so that your train and test sets are the same as mine, and stratify by cmedv.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
train <- training(split)
test <- testing(split)
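# A quick sanity check (optional sketch): confirm the 70-30 split by counting the rows in each partition.
c(train = nrow(train), test = nrow(test))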
#3. Using the training data, assess the correlation between cmedv and all the predictor variables. Hint: you can generate a correlation matrix for all variables at one time with cor(). Which predictor variable has the strongest positive correlation? Which predictor variable has the strongest negative correlation?
correlation_train <- cor(train)
cmedv_cor <- correlation_train[ , c("cmedv")]
Strongest positive correlation: rm; strongest negative correlation: lstat.
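# To verify, sort the correlations with cmedv (a quick optional check; drop cmedv's correlation with itself).
sort(cmedv_cor[names(cmedv_cor) != "cmedv"])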
#4. Based on the results in #3, plot the relationship between cmedv and the predictor variable with the strongest positive correlation. Add a “best fit” regression line that illustrates the linear relationship. Do you feel like this line accurately captures the relationship? Yes, for the most part; the fitted line follows the strong positive linear trend between rm and cmedv.
ggplot(train, aes(rm, cmedv)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se = FALSE) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::comma)
#5. Train a simple linear regression model with the predictor variable identified above as having the strongest positive correlation. Explain the coefficient estimate for this predictor variable. Is the coefficient statistically different than zero?
model1 <- linear_reg() %>%
fit(cmedv ~ rm, data = train)
tidy(model1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -35.4 3.11 -11.4 8.70e-26
## 2 rm 9.22 0.491 18.8 7.46e-55
For every one-unit increase in rm (one additional average room per dwelling), cmedv increases by about 9.22 on average. Yes, the coefficient is statistically different from zero because its p-value is essentially zero.
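# As a supporting check (optional sketch), broom's tidy() can add a 95% confidence interval; an interval that excludes zero agrees with the tiny p-value. model1$fit is the underlying lm object.
tidy(model1$fit, conf.int = TRUE)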
#6. Compute the generalization RMSE for the above model. Interpret the results.
residuals <- model1 %>%
predict(train) %>%
bind_cols(train) %>%
select(rm, cmedv, .pred) %>%
mutate(residual = cmedv - .pred)
residuals %>%
mutate(squared_residuals = residual^2) %>%
summarize(sum_of_squared_residuals = sum(squared_residuals))
## # A tibble: 1 × 1
## sum_of_squared_residuals
## <dbl>
## 1 14751.
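# The training RMSE is simply the square root of the mean squared residual, which ties the sum above back to the RMSE metric (optional sketch).
residuals %>%
summarize(train_rmse = sqrt(mean(residual^2)))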
model1 %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.83
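On average, the rm-only model's predictions of cmedv are off by about 6.8 for observations it has not seen.
# For context (an optional sketch), compare this against a naive baseline that always predicts the training-set mean of cmedv; a useful model should beat this comfortably.
test %>%
mutate(.pred = mean(train$cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)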
#7. Fit a linear regression model using all available features to predict cmedv. Which predictor variables are statistically different than zero at the p.value < 0.05 level? Interpret the lstat coefficient.
model3 <- linear_reg() %>%
fit(cmedv ~ ., data = train)
tidy(model3) %>%
arrange(desc(p.value))
## # A tibble: 16 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 indus -0.0143 0.0738 -0.193 8.47e- 1
## 2 age -0.00921 0.0156 -0.591 5.55e- 1
## 3 lat 5.54 4.24 1.31 1.93e- 1
## 4 lon -5.65 3.86 -1.46 1.45e- 1
## 5 (Intercept) -608. 342. -1.78 7.64e- 2
## 6 zn 0.0332 0.0165 2.01 4.56e- 2
## 7 crim -0.0830 0.0396 -2.10 3.65e- 2
## 8 chas 2.28 1.05 2.17 3.06e- 2
## 9 nox -11.7 4.74 -2.46 1.44e- 2
## 10 tax -0.0121 0.00436 -2.78 5.78e- 3
## 11 rad 0.272 0.0790 3.44 6.47e- 4
## 12 b 0.0123 0.00310 3.97 8.72e- 5
## 13 dis -1.26 0.244 -5.17 4.06e- 7
## 14 ptratio -0.874 0.163 -5.37 1.48e- 7
## 15 lstat -0.479 0.0637 -7.51 5.24e-13
## 16 rm 4.37 0.516 8.46 8.13e-16
Predictors with a p-value less than 0.05: zn, crim, chas, nox, tax, rad, b, dis, ptratio, lstat, and rm.
Holding all other predictors constant, a one-unit increase in lstat is associated with an average decrease in cmedv of about 0.48.
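# To double-check the list above (optional sketch), filter the tidy() output to the terms with p-values below 0.05.
tidy(model3) %>%
filter(p.value < 0.05) %>%
arrange(p.value)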
#8. Compute the generalization RMSE for the model with all predictor variables. What is the test set RMSE? Is this better than the model that just used the rm predictor variable?
model3 %>%
predict(test) %>%
bind_cols(test) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
RMSE: 4.83. Yes, it is almost 2 points better than the model that used only rm.
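# A side-by-side comparison of the two models' test RMSE (optional sketch), stacking the labelled results into one table.
bind_rows(
"rm only" = model1 %>% predict(test) %>% bind_cols(test) %>% rmse(truth = cmedv, estimate = .pred),
"all predictors" = model3 %>% predict(test) %>% bind_cols(test) %>% rmse(truth = cmedv, estimate = .pred),
.id = "model"
)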
#9. Using the full model with all predictor variables, identify the top 5 most influential predictor variables in our model.
# Variable importance plot: for an lm fit, vip ranks predictors by the absolute value of the t-statistic.
model3 %>%
vip::vip(num_features = 20)
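Top 5 most influential predictors: rm, lstat, ptratio, dis, and b (the largest absolute t-statistics in the tidy() output above).
# The same ranking as a table (optional sketch); model3$fit is the underlying lm object.
vip::vi(model3$fit) %>%
slice_head(n = 5)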