library(ISLR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(tibble)
#a Predicting Sales
glimpse(Carseats)
## Rows: 400
## Columns: 11
## $ Sales <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, …
## $ CompPrice <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117…
## $ Income <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2…
## $ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, …
## $ Population <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,…
## $ Price <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136…
## $ ShelveLoc <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,…
## $ Age <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52…
## $ Education <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18…
## $ Urban <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye…
## $ US <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N…
sales_lm <- lm(Sales ~ Price + Urban + US, data = Carseats)
# b Interpreting Coefficients
summary(sales_lm)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#e Smaller Model
sales_lm_2 <- lm(Sales ~ Price + US, data = Carseats)
summary(sales_lm_2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
#g Coefficient Confidence Interval
confint(sales_lm_2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
#h Outliers and High Leverage Observations
# NOTE: broom::augment(lm)$.hat is the same as hatvalues(lm)
# (p + 1) / n = average leverage?
round(((2 + 1) / nrow(Carseats)), 10) == round(mean(hatvalues(sales_lm_2)), 10)
## [1] TRUE
broom::augment(sales_lm_2) %>%
select(.hat, .std.resid) %>%
ggplot(aes(x = .hat, y = .std.resid)) +
geom_point() +
geom_hline(yintercept = -2, col = "deepskyblue3", size = 1) +
geom_hline(yintercept = 2, col = "deepskyblue3", size = 1) +
geom_vline(xintercept = 3 / nrow(Carseats), col = "mediumseagreen", size = 1) +
theme_light() +
labs(x = "Leverage",
y = "Standardized Residual",
title = "Residuals vs Leverage")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

broom::augment(sales_lm_2) %>%
rownames_to_column("rowid") %>%
arrange(desc(.cooksd)) %>%
select(Sales, Price, US, .std.resid, .hat, .cooksd)
## # A tibble: 400 × 6
## Sales Price US .std.resid .hat .cooksd
## <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 14.9 82 No 2.58 0.0116 0.0261
## 2 14.4 53 No 1.73 0.0237 0.0243
## 3 10.6 149 No 2.32 0.0126 0.0228
## 4 15.6 72 Yes 2.17 0.0129 0.0205
## 5 0.37 191 Yes -1.42 0.0286 0.0198
## 6 16.3 92 Yes 2.87 0.00664 0.0183
## 7 3.47 81 No -2.10 0.0119 0.0177
## 8 0.53 159 Yes -2.05 0.0119 0.0169
## 9 13.6 89 No 2.18 0.00983 0.0158
## 10 3.02 90 Yes -2.56 0.00710 0.0157
## # ℹ 390 more rows