knitr::opts_chunk$set(echo = TRUE)
library(ISLR2)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
The KNN classifier attempts to estimate the conditional distribution of Y given X and then classifies the observation of interest into the class with the highest estimated probability. It does this by taking a set of known points \(\mathcal{N}_{0}\) nearest to X and estimating the conditional probability for class j as the fraction of the points in \(\mathcal{N}_{0}\) whose response equals j. The test observation is then assigned to the class with the largest estimated probability.
The KNN regression model is similar. The key difference is that KNN regression estimates the response of the test observation by taking the average of all training responses in \(\mathcal{N}_{0}\).
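To make the mechanics concrete, here is a minimal sketch of KNN regression with a single predictor (the helper knn_reg() and the simulated data are illustrative, not part of the assignment):
knn_reg = function(x_train, y_train, x0, K = 3) {
  dists = abs(x_train - x0)   # distances from the test point x0
  nbrs = order(dists)[1:K]    # indices of the K nearest training points
  mean(y_train[nbrs])         # KNN regression: average their responses
}
set.seed(1)
x_sim = runif(50, 0, 10)
y_sim = 2 * x_sim + rnorm(50)
knn_reg(x_sim, y_sim, x0 = 5, K = 5)   # estimate of E(Y | X = 5), roughly 10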
auto = read.csv("auto1.csv", na.strings = "?", stringsAsFactors = T)
dim(auto)
## [1] 397 9
auto = na.omit(auto)
dim(auto)
## [1] 392 9
plot(auto)
#### b) Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.
auto %>%
  select(-name) %>%
  cor()
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
m1 = lm(mpg ~ .-name, data = auto)
summary(m1)
##
## Call:
## lm(formula = mpg ~ . - name, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
coef(m1)[7]
## year
## 0.7507727
There is a relationship between the predictors and the response: the F-statistic of 252.4 has a p-value below 2.2e-16. Displacement, weight, year, and origin appear to have a statistically significant relationship with mpg. The coefficient for year is 0.75, which means that, holding the other predictors fixed, each one-year increase in model year is associated with an increase of about 0.75 mpg.
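As a quick illustration of that interpretation (a sketch, not part of the original solution), predicting mpg for two copies of the same car that differ only in year reproduces the coefficient:
two_cars = auto[c(1, 1), ]                # duplicate the first observation
two_cars$year[2] = two_cars$year[2] + 1   # make the second car one year newer
diff(predict(m1, newdata = two_cars))     # equals the year coefficient, ~0.75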
par(mfrow=c(2,2))
plot(m1)
summary(influence.measures(m1))
## Potentially influential observations of
## lm(formula = mpg ~ . - name, data = auto) :
##
## dfb.1_ dfb.cyln dfb.dspl dfb.hrsp dfb.wght dfb.accl dfb.year dfb.orgn
## 9 -0.01 -0.15 0.16 0.14 -0.14 0.09 0.00 0.03
## 13 0.01 -0.01 0.02 -0.01 -0.01 -0.01 -0.01 0.01
## 14 0.11 0.17 -0.42 -0.43 0.68 -0.36 -0.08 0.00
## 26 -0.04 0.01 -0.06 0.15 -0.02 0.10 -0.01 -0.02
## 27 -0.03 0.05 -0.08 0.11 -0.01 0.07 -0.02 -0.03
## 28 -0.05 0.07 -0.13 0.19 -0.02 0.10 -0.02 -0.05
## 29 -0.06 0.08 -0.14 0.18 0.02 0.15 -0.03 -0.04
## 112 -0.22 0.16 0.03 0.02 -0.14 0.19 0.18 -0.16
## 167 -0.01 -0.22 -0.02 0.03 0.21 0.06 -0.03 0.04
## 245 -0.10 0.05 0.12 0.05 -0.23 0.33 0.02 0.03
## 271 0.03 0.15 -0.11 0.03 -0.04 0.06 -0.06 -0.24
## 300 -0.03 -0.03 0.02 0.01 0.01 0.06 0.01 0.02
## 301 -0.04 0.05 -0.01 0.01 -0.03 0.05 0.02 0.00
## 310 -0.04 0.01 -0.04 -0.02 -0.01 -0.13 0.13 -0.02
## 323 -0.19 0.06 0.05 -0.01 -0.06 0.09 0.16 0.31
## 326 -0.20 0.05 0.10 0.06 -0.19 0.34 0.12 0.04
## 327 -0.28 0.04 0.06 0.11 -0.13 0.49 0.12 0.05
## 328 -0.14 0.14 -0.17 -0.07 0.19 0.06 0.08 0.06
## 330 -0.02 0.02 0.09 -0.15 -0.05 -0.21 0.12 0.22
## 387 -0.16 -0.16 0.40 -0.19 -0.19 0.05 0.26 0.04
## 394 -0.38 0.03 0.14 0.25 -0.32 0.58 0.23 0.03
## dffit cov.r cook.d hat
## 9 0.30 1.06 0.01 0.06_*
## 13 0.02 1.07_* 0.00 0.05
## 14 -0.79_* 1.19_* 0.08 0.19_*
## 26 0.18 1.07_* 0.00 0.06
## 27 0.13 1.09_* 0.00 0.07_*
## 28 0.22 1.08_* 0.01 0.07_*
## 29 0.25 1.11_* 0.01 0.09_*
## 112 -0.46_* 0.87_* 0.03 0.02
## 167 -0.39 0.93_* 0.02 0.03
## 245 0.48_* 0.82_* 0.03 0.02
## 271 -0.33 0.90_* 0.01 0.02
## 300 0.09 1.07_* 0.00 0.05
## 301 0.08 1.08_* 0.00 0.06
## 310 0.29 0.86_* 0.01 0.01
## 323 0.47_* 0.74_* 0.03 0.01
## 326 0.49_* 0.81_* 0.03 0.02
## 327 0.63_* 0.79_* 0.05 0.03
## 328 0.40 0.88_* 0.02 0.02
## 330 0.43 0.87_* 0.02 0.02
## 387 0.55_* 0.88_* 0.04 0.03
## 394 0.68_* 0.90_* 0.06 0.05
The residual plots indicate some lack of fit. Additionally, the leverage plot shows several potential outliers, and observation 14 stands out with very high leverage (hat value of about 0.19 in the table above).
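A quick numeric check of those flags (a sketch; the question does not require it) singles out the highest-leverage point and the largest Cook's distances:
which.max(hatvalues(m1))                          # highest-leverage observation
sort(cooks.distance(m1), decreasing = TRUE)[1:5]  # most influential observations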
m2 = lm(mpg ~ weight * year * origin, data = auto)
summary(m2)
##
## Call:
## lm(formula = mpg ~ weight * year * origin, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7880 -1.9187 -0.1022 1.4576 12.1862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.170e+02 3.551e+01 -6.111 2.43e-09 ***
## weight 7.198e-02 1.334e-02 5.398 1.18e-07 ***
## year 3.331e+00 4.660e-01 7.147 4.50e-12 ***
## origin 9.961e+01 2.508e+01 3.972 8.51e-05 ***
## weight:year -1.005e-03 1.749e-04 -5.749 1.83e-08 ***
## weight:origin -4.313e-02 1.080e-02 -3.995 7.75e-05 ***
## year:origin -1.236e+00 3.254e-01 -3.798 0.000170 ***
## weight:year:origin 5.402e-04 1.399e-04 3.861 0.000132 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.055 on 384 degrees of freedom
## Multiple R-squared: 0.8495, Adjusted R-squared: 0.8468
## F-statistic: 309.7 on 7 and 384 DF, p-value: < 2.2e-16
After testing multiple combinations of interactions among displacement, weight, year, and origin, I found that the model with weight, year, origin, and all two- and three-way interactions among those three produced the highest adjusted R-squared (0.8468).
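A sketch of that kind of comparison (the candidate formulas shown are illustrative, not the full set that was tried):
candidates = list(
  mpg ~ weight * year,
  mpg ~ weight * year * origin,
  mpg ~ displacement * weight * year * origin
)
sapply(candidates, function(f) summary(lm(f, data = auto))$adj.r.squared)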
m3 = lm(mpg ~ log(weight) * year * origin, data = auto)
summary(m3)
##
## Call:
## lm(formula = mpg ~ log(weight) * year * origin, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3341 -1.6334 0.0224 1.2993 12.3905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1285.5968 266.8732 -4.817 2.10e-06 ***
## log(weight) 160.0439 33.8676 4.726 3.23e-06 ***
## year 19.1901 3.5125 5.463 8.41e-08 ***
## origin 692.8308 187.9104 3.687 0.000260 ***
## log(weight):year -2.3543 0.4458 -5.281 2.15e-07 ***
## log(weight):origin -89.7194 24.2803 -3.695 0.000252 ***
## year:origin -8.9417 2.4437 -3.659 0.000289 ***
## log(weight):year:origin 1.1589 0.3157 3.671 0.000276 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.951 on 384 degrees of freedom
## Multiple R-squared: 0.8596, Adjusted R-squared: 0.857
## F-statistic: 335.8 on 7 and 384 DF, p-value: < 2.2e-16
m4 = lm(mpg ~ weight * year^2 * origin, data = auto)
summary(m4)
##
## Call:
## lm(formula = mpg ~ weight * year^2 * origin, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7880 -1.9187 -0.1022 1.4576 12.1862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.170e+02 3.551e+01 -6.111 2.43e-09 ***
## weight 7.198e-02 1.334e-02 5.398 1.18e-07 ***
## year 3.331e+00 4.660e-01 7.147 4.50e-12 ***
## origin 9.961e+01 2.508e+01 3.972 8.51e-05 ***
## weight:year -1.005e-03 1.749e-04 -5.749 1.83e-08 ***
## weight:origin -4.313e-02 1.080e-02 -3.995 7.75e-05 ***
## year:origin -1.236e+00 3.254e-01 -3.798 0.000170 ***
## weight:year:origin 5.402e-04 1.399e-04 3.861 0.000132 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.055 on 384 degrees of freedom
## Multiple R-squared: 0.8495, Adjusted R-squared: 0.8468
## F-statistic: 309.7 on 7 and 384 DF, p-value: < 2.2e-16
Taking the log of weight compresses its right tail and actually improved the model a bit (the adjusted R-squared rose from 0.8468 to 0.857). Note that m4 is identical to m2: inside an R formula, year^2 means year crossed with itself, which collapses back to year, so no quadratic term was actually added.
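To include an actual quadratic year term, the square has to be wrapped in I() (or use poly()); a minimal sketch, with the model name m4b purely illustrative:
m4b = lm(mpg ~ weight * I(year^2) * origin, data = auto)
summary(m4b)$adj.r.squared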
fit = lm(Sales ~ Price + Urban + US, data = Carseats)
summary(fit)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
coef(fit)[2]
## Price
## -0.05445885
coef(fit)[4]
## USYes
## 1.200573
The coefficient for Price is -0.054459, which means that, holding Urban and US fixed, every one-dollar increase in price is associated with a decrease in sales of about 54.5 units (Sales is recorded in thousands of units). The coefficient for US = Yes is 1.200573, which means that, on average and at a given price, stores in the US sell about 1,200 more car seats than stores outside the US. Urban is not significant.
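As an illustration of the US effect (a sketch, not part of the original answer; the Price and Urban values are hypothetical), the predicted sales for two otherwise-identical stores differ by exactly the USYes coefficient:
two_stores = data.frame(Price = 120, Urban = "Yes", US = c("No", "Yes"))
predict(fit, newdata = two_stores)   # difference equals the USYes coefficient, ~1.2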
\(\widehat{Sales} = 13.04 - 0.054 \cdot Price - 0.022 \cdot Urban_{Yes} + 1.20 \cdot US_{Yes}\), where \(Urban_{Yes}\) and \(US_{Yes}\) are dummy variables equal to 1 for urban and US stores, respectively, and 0 otherwise.
See part (b) for interpretation. Price and USYes are significant, so for those predictors we can reject the null hypothesis \(H_0 : \beta_j = 0\); for UrbanYes (p = 0.936) we cannot.
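A quick way to read off which of these null hypotheses are rejected at the 5% level (a sketch):
summary(fit)$coefficients[, "Pr(>|t|)"] < 0.05   # TRUE where H0: beta_j = 0 is rejected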
fit = lm(Sales ~ Price + US, data = Carseats)
summary(fit)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
Both models fit poorly in absolute terms: the adjusted R-squared is 0.2335 for the model in part (a) and 0.2354 for the reduced model in part (e), so dropping Urban costs nothing and the smaller model fits the data about as well.
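To make the comparison explicit (a sketch; fit_a and fit_e are new names, since both models were called fit above):
fit_a = lm(Sales ~ Price + Urban + US, data = Carseats)   # model from part (a)
fit_e = lm(Sales ~ Price + US, data = Carseats)           # model from part (e)
c(adj_r2_a = summary(fit_a)$adj.r.squared,
  adj_r2_e = summary(fit_e)$adj.r.squared)
anova(fit_e, fit_a)   # F-test: dropping Urban does not significantly worsen the fit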
confint(fit)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
par(mfrow=c(2,2))
plot(fit)
summary(influence.measures(fit))
## Potentially influential observations of
## lm(formula = Sales ~ Price + US, data = Carseats) :
##
## dfb.1_ dfb.Pric dfb.USYs dffit cov.r cook.d hat
## 26 0.24 -0.18 -0.17 0.28_* 0.97_* 0.03 0.01
## 29 -0.10 0.10 -0.10 -0.18 0.97_* 0.01 0.01
## 43 -0.11 0.10 0.03 -0.11 1.05_* 0.00 0.04_*
## 50 -0.10 0.17 -0.17 0.26_* 0.98 0.02 0.01
## 51 -0.05 0.05 -0.11 -0.18 0.95_* 0.01 0.00
## 58 -0.05 -0.02 0.16 -0.20 0.97_* 0.01 0.01
## 69 -0.09 0.10 0.09 0.19 0.96_* 0.01 0.01
## 126 -0.07 0.06 0.03 -0.07 1.03_* 0.00 0.03_*
## 160 0.00 0.00 0.00 0.01 1.02_* 0.00 0.02
## 166 0.21 -0.23 -0.04 -0.24 1.02 0.02 0.03_*
## 172 0.06 -0.07 0.02 0.08 1.03_* 0.00 0.02
## 175 0.14 -0.19 0.09 -0.21 1.03_* 0.02 0.03_*
## 210 -0.14 0.15 -0.10 -0.22 0.97_* 0.02 0.01
## 270 -0.03 0.05 -0.03 0.06 1.03_* 0.00 0.02
## 298 -0.06 0.06 -0.09 -0.15 0.97_* 0.01 0.00
## 314 -0.05 0.04 0.02 -0.05 1.03_* 0.00 0.02_*
## 353 -0.02 0.03 0.09 0.15 0.97_* 0.01 0.00
## 357 0.02 -0.02 0.02 -0.03 1.03_* 0.00 0.02
## 368 0.26 -0.23 -0.11 0.27_* 1.01 0.02 0.02_*
## 377 0.14 -0.15 0.12 0.24 0.95_* 0.02 0.01
## 384 0.00 0.00 0.00 0.00 1.02_* 0.00 0.02
## 387 -0.03 0.04 -0.03 0.05 1.02_* 0.00 0.02
## 396 -0.05 0.05 0.08 0.14 0.98_* 0.01 0.00
Yes. The influence measures above flag several high-leverage observations: the largest hat values (around 0.03 to 0.04) are several times the average leverage of \((p + 1)/n = 3/400 \approx 0.0075\). A few observations also have relatively large residuals, though none look like extreme outliers.
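One way to quantify those flags (a sketch): compare the hat values to the average leverage and check the studentized residuals.
hv = hatvalues(fit)
mean(hv)                 # average leverage, (p + 1)/n = 3/400
sum(hv > 3 * mean(hv))   # count of high-leverage observations
range(rstudent(fit))     # studentized residuals beyond about +/- 3 would flag outliers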
This problem involves simple linear regression without an intercept.
Regression of Y on X without intercept, \(\hat{\beta} = (X'X)^{-1}X'Y = \frac{\sum x_iy_i}{\sum x_i^2}\)
Regression of X on Y without intercept, \(\hat{\beta} = (Y'Y)^{-1}Y'X = \frac{\sum x_iy_i}{\sum y_i^2}\)
These are equal when \(\frac{\sum x_iy_i}{\sum x_i^2} = \frac{\sum x_iy_i}{\sum y_i^2}\)
Therefore the two coefficient estimates are equal exactly when \(\sum x_i^2 = \sum y_i^2\), i.e. when the sum of the squared X values equals the sum of the squared Y values.
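A quick numerical check of the closed-form coefficient (a sketch with simulated data, not part of the exercise):
set.seed(1)
x_chk = rnorm(100)
y_chk = 3 * x_chk + rnorm(100)
c(closed_form = sum(x_chk * y_chk) / sum(x_chk^2),   # formula above
  lm_no_int = unname(coef(lm(y_chk ~ x_chk - 1))))   # lm without intercept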
set.seed(123)
x_b = rnorm(100, mean = 0, sd = 1)
y_b = rnorm(100, mean = 0, sd = 2)
xy_fit_b = lm(y_b ~ x_b - 1)
yx_fit_b = lm(x_b ~ y_b - 1)
coef(xy_fit_b)
## x_b
## -0.1272558
coef(yx_fit_b)
## y_b
## -0.02827693
set.seed(23)
x_c = rnorm(100, mean = 0, sd = 1)
y_c = rnorm(100, mean = 0, sd = 1)
xy_fit_c = lm(y_c ~ x_c - 1)
yx_fit_c = lm(x_c ~ y_c - 1)
coef(xy_fit_c)
## x_c
## -0.02207371
coef(yx_fit_c)
## y_c
## -0.02283415
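The two estimates above are only approximately equal because, for independent draws, \(\sum x_i^2\) and \(\sum y_i^2\) match only approximately. To make them exactly equal, one construction (a sketch) is to let y be a random permutation of x, which preserves the sum of squares exactly:
set.seed(42)
x_d = rnorm(100)
y_d = sample(x_d)         # same values reordered, so sum(x_d^2) == sum(y_d^2)
coef(lm(y_d ~ x_d - 1))
coef(lm(x_d ~ y_d - 1))   # identical to the coefficient above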