library(ISLR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(tibble)
#a Predicting Sales
glimpse(Carseats)
## Rows: 400
## Columns: 11
## $ Sales       <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, …
## $ CompPrice   <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117…
## $ Income      <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2…
## $ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, …
## $ Population  <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,…
## $ Price       <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136…
## $ ShelveLoc   <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,…
## $ Age         <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52…
## $ Education   <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18…
## $ Urban       <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye…
## $ US          <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N…
sales_lm <- lm(Sales ~ Price + Urban + US, data = Carseats)
# b Interpreting Coefficients
summary(sales_lm)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
#e Smaller Model
sales_lm_2 <- lm(Sales ~ Price + US, data = Carseats)

summary(sales_lm_2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
#g Coefficient Confidence Interval
confint(sales_lm_2, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
#h Outliers and High Leverage Observations
# NOTE: broom::augment(lm)$.hat is the same as hatvalues(lm)
# (p + 1) / n = average leverage?
round(((2 + 1) / nrow(Carseats)), 10) == round(mean(hatvalues(sales_lm_2)), 10)
## [1] TRUE
broom::augment(sales_lm_2) %>%
  select(.hat, .std.resid) %>%
  ggplot(aes(x = .hat, y = .std.resid)) + 
  geom_point() + 
  geom_hline(yintercept = -2, col = "deepskyblue3", size = 1) + 
  geom_hline(yintercept = 2, col = "deepskyblue3", size = 1) + 
  geom_vline(xintercept = 3 / nrow(Carseats), col = "mediumseagreen", size = 1) + 
  theme_light() + 
  labs(x = "Leverage", 
       y = "Standardized Residual", 
       title = "Residuals vs Leverage")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

broom::augment(sales_lm_2) %>%
  rownames_to_column("rowid") %>%
  arrange(desc(.cooksd)) %>% 
  select(Sales, Price, US, .std.resid, .hat, .cooksd)
## # A tibble: 400 × 6
##    Sales Price US    .std.resid    .hat .cooksd
##    <dbl> <dbl> <fct>      <dbl>   <dbl>   <dbl>
##  1 14.9     82 No          2.58 0.0116   0.0261
##  2 14.4     53 No          1.73 0.0237   0.0243
##  3 10.6    149 No          2.32 0.0126   0.0228
##  4 15.6     72 Yes         2.17 0.0129   0.0205
##  5  0.37   191 Yes        -1.42 0.0286   0.0198
##  6 16.3     92 Yes         2.87 0.00664  0.0183
##  7  3.47    81 No         -2.10 0.0119   0.0177
##  8  0.53   159 Yes        -2.05 0.0119   0.0169
##  9 13.6     89 No          2.18 0.00983  0.0158
## 10  3.02    90 Yes        -2.56 0.00710  0.0157
## # ℹ 390 more rows