Assignment #2

Load required packages

# Pin a CRAN mirror so installation also works in a non-interactive session.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install ISLR2 only when it is missing; an unconditional install.packages()
# re-downloads the package on every run of the analysis.
if (!requireNamespace("ISLR2", quietly = TRUE)) {
  install.packages("ISLR2")
}

The downloaded binary packages are in
    /var/folders/n6/kts7k_nx3v3208p01m5x0p_00000gn/T//RtmpxU4gGZ/downloaded_packages
library(ISLR2)
library(ggplot2)
# Install GGally only when it is missing, so re-running the analysis does not
# download it again each time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}

The downloaded binary packages are in
    /var/folders/n6/kts7k_nx3v3208p01m5x0p_00000gn/T//RtmpxU4gGZ/downloaded_packages
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

==========================

Question 2: KNN Classifier vs Regression

==========================

Explanation:

KNN Classifier predicts a categorical label based on the majority vote of k nearest neighbors.

KNN Regression predicts a continuous value by averaging the values of the k nearest neighbors.

==========================

Question 9: Auto Dataset - Multiple Linear Regression

==========================

(a) Scatterplot matrix

# Load the Auto data set into the workspace and preview its first six rows.
data(Auto)
head(Auto, n = 6L)
  mpg cylinders displacement horsepower weight acceleration year origin
1  18         8          307        130   3504         12.0   70      1
2  15         8          350        165   3693         11.5   70      1
3  18         8          318        150   3436         11.0   70      1
4  16         8          304        150   3433         12.0   70      1
5  17         8          302        140   3449         10.5   70      1
6  15         8          429        198   4341         10.0   70      1
                       name
1 chevrolet chevelle malibu
2         buick skylark 320
3        plymouth satellite
4             amc rebel sst
5               ford torino
6          ford galaxie 500
# Work on a lower-case copy of the data set.
auto <- Auto
# Pairwise scatterplot matrix of every column except the car name. The column
# is dropped by name rather than by position, so the call keeps working if the
# column order ever changes.
ggpairs(auto[, setdiff(names(auto), "name")])

(b) Correlation matrix

# Correlation matrix of every column except the car name; the column is
# excluded by name instead of by a hard-coded position.
cor(auto[, setdiff(names(auto), "name")])
                    mpg  cylinders displacement horsepower     weight
mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
             acceleration       year     origin
mpg             0.4233285  0.5805410  0.5652088
cylinders      -0.5046834 -0.3456474 -0.5689316
displacement   -0.5438005 -0.3698552 -0.6145351
horsepower     -0.6891955 -0.4163615 -0.4551715
weight         -0.4168392 -0.3091199 -0.5850054
acceleration    1.0000000  0.2903161  0.2127458
year            0.2903161  1.0000000  0.1815277
origin          0.2127458  0.1815277  1.0000000

(c) Multiple linear regression

# Regress mpg on every other column of auto except the car name ('. - name').
# NOTE(review): origin is passed to lm() as stored in auto; the summary shows a
# single origin coefficient, i.e. it is treated as numeric - confirm that this
# is intended rather than modelling it as a factor.
auto_lm <- lm(mpg ~ . - name, data = auto)
# Print the coefficient table together with the model-level fit statistics.
summary(auto_lm)

Call:
lm(formula = mpg ~ . - name, data = auto)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.5903 -2.1565 -0.1169  1.8690 13.0604 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
cylinders     -0.493376   0.323282  -1.526  0.12780    
displacement   0.019896   0.007515   2.647  0.00844 ** 
horsepower    -0.016951   0.013787  -1.230  0.21963    
weight        -0.006474   0.000652  -9.929  < 2e-16 ***
acceleration   0.080576   0.098845   0.815  0.41548    
year           0.750773   0.050973  14.729  < 2e-16 ***
origin         1.426141   0.278136   5.127 4.67e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.328 on 384 degrees of freedom
Multiple R-squared:  0.8215,    Adjusted R-squared:  0.8182 
F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

(d) Diagnostic plots

# Draw the lm diagnostic plots on a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so later plots are not forced into
# the same grid.
old_par <- par(mfrow = c(2, 2))
plot(auto_lm)
par(old_par)

(e) Interaction terms

# (a + b + ...)^2 in a formula expands to all main effects plus every
# two-way interaction between the listed predictors.
auto_lm_interact <- lm(
  mpg ~ (cylinders + displacement + horsepower + weight +
    acceleration + year + origin)^2,
  data = auto
)
# Coefficient table and fit statistics for the interaction model.
summary(auto_lm_interact)

Call:
lm(formula = mpg ~ (cylinders + displacement + horsepower + weight + 
    acceleration + year + origin)^2, data = auto)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.6303 -1.4481  0.0596  1.2739 11.1386 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)   
(Intercept)                3.548e+01  5.314e+01   0.668  0.50475   
cylinders                  6.989e+00  8.248e+00   0.847  0.39738   
displacement              -4.785e-01  1.894e-01  -2.527  0.01192 * 
horsepower                 5.034e-01  3.470e-01   1.451  0.14769   
weight                     4.133e-03  1.759e-02   0.235  0.81442   
acceleration              -5.859e+00  2.174e+00  -2.696  0.00735 **
year                       6.974e-01  6.097e-01   1.144  0.25340   
origin                    -2.090e+01  7.097e+00  -2.944  0.00345 **
cylinders:displacement    -3.383e-03  6.455e-03  -0.524  0.60051   
cylinders:horsepower       1.161e-02  2.420e-02   0.480  0.63157   
cylinders:weight           3.575e-04  8.955e-04   0.399  0.69000   
cylinders:acceleration     2.779e-01  1.664e-01   1.670  0.09584 . 
cylinders:year            -1.741e-01  9.714e-02  -1.793  0.07389 . 
cylinders:origin           4.022e-01  4.926e-01   0.816  0.41482   
displacement:horsepower   -8.491e-05  2.885e-04  -0.294  0.76867   
displacement:weight        2.472e-05  1.470e-05   1.682  0.09342 . 
displacement:acceleration -3.479e-03  3.342e-03  -1.041  0.29853   
displacement:year          5.934e-03  2.391e-03   2.482  0.01352 * 
displacement:origin        2.398e-02  1.947e-02   1.232  0.21875   
horsepower:weight         -1.968e-05  2.924e-05  -0.673  0.50124   
horsepower:acceleration   -7.213e-03  3.719e-03  -1.939  0.05325 . 
horsepower:year           -5.838e-03  3.938e-03  -1.482  0.13916   
horsepower:origin          2.233e-03  2.930e-02   0.076  0.93931   
weight:acceleration        2.346e-04  2.289e-04   1.025  0.30596   
weight:year               -2.245e-04  2.127e-04  -1.056  0.29182   
weight:origin             -5.789e-04  1.591e-03  -0.364  0.71623   
acceleration:year          5.562e-02  2.558e-02   2.174  0.03033 * 
acceleration:origin        4.583e-01  1.567e-01   2.926  0.00365 **
year:origin                1.393e-01  7.399e-02   1.882  0.06062 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.695 on 363 degrees of freedom
Multiple R-squared:  0.8893,    Adjusted R-squared:  0.8808 
F-statistic: 104.2 on 28 and 363 DF,  p-value: < 2.2e-16

(f) Transformations

# Refit with transformed predictors: log of horsepower, squared weight
# (I() protects ^ from its formula meaning) and square root of acceleration;
# the remaining predictors enter untransformed.
auto_lm_trans <- lm(
  mpg ~ log(horsepower) + I(weight^2) + sqrt(acceleration) +
    year + origin + cylinders + displacement,
  data = auto
)
# Coefficient table and fit statistics for the transformed model.
summary(auto_lm_trans)

Call:
lm(formula = mpg ~ log(horsepower) + I(weight^2) + sqrt(acceleration) + 
    year + origin + cylinders + displacement, data = auto)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.7389 -1.9547 -0.1959  1.8841 12.7431 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         5.424e+01  9.720e+00   5.580 4.55e-08 ***
log(horsepower)    -1.354e+01  1.447e+00  -9.355  < 2e-16 ***
I(weight^2)        -2.529e-07  9.828e-08  -2.574   0.0104 *  
sqrt(acceleration) -4.576e+00  8.003e-01  -5.717 2.18e-08 ***
year                6.683e-01  4.929e-02  13.558  < 2e-16 ***
origin              1.574e+00  2.684e-01   5.865 9.68e-09 ***
cylinders          -5.584e-01  3.152e-01  -1.771   0.0773 .  
displacement        7.785e-03  7.208e-03   1.080   0.2808    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.273 on 384 degrees of freedom
Multiple R-squared:  0.8273,    Adjusted R-squared:  0.8241 
F-statistic: 262.8 on 7 and 384 DF,  p-value: < 2.2e-16

==========================

Question 10: Carseats Dataset - Regression

==========================

(a) Fit model

# Work on a lower-case copy of the Carseats data.
carseats <- Carseats
# Model Sales as a function of Price and the Urban and US predictors; the
# summary reports the latter two as UrbanYes / USYes indicator terms.
model1 <- lm(Sales ~ Price + Urban + US, data = carseats)
summary(model1)

Call:
lm(formula = Sales ~ Price + Urban + US, data = carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.9206 -1.6220 -0.0564  1.5786  7.0581 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
Price       -0.054459   0.005242 -10.389  < 2e-16 ***
UrbanYes    -0.021916   0.271650  -0.081    0.936    
USYes        1.200573   0.259042   4.635 4.86e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.472 on 396 degrees of freedom
Multiple R-squared:  0.2393,    Adjusted R-squared:  0.2335 
F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

(b) Interpretation

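Each coefficient is the expected change in Sales for a one-unit change in that predictor, holding the other predictors fixed. Price (-0.054): sales fall as price rises. UrbanYes (-0.022): urban vs. non-urban stores; not statistically significant (p = 0.936). USYes (1.201): stores in the US sell more, on average, than stores outside the US.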

(c) Equation format:

Sales = β0 + β1 × Price + β2 × UrbanYes + β3 × USYes, where UrbanYes and USYes are 0/1 indicator variables (1 = Yes).

(e) Reduced model

# Reduced model: refit without Urban, keeping only Price and US.
model2 <- lm(Sales ~ Price + US, data = carseats)
summary(model2)

Call:
lm(formula = Sales ~ Price + US, data = carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.9269 -1.6286 -0.0574  1.5766  7.0515 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
Price       -0.05448    0.00523 -10.416  < 2e-16 ***
USYes        1.19964    0.25846   4.641 4.71e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.469 on 397 degrees of freedom
Multiple R-squared:  0.2393,    Adjusted R-squared:  0.2354 
F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

(f) Compare models

# F-test comparing the nested fits: the reduced model (model2) is listed first,
# then the model that additionally includes Urban (model1).
anova(model2, model1)
Analysis of Variance Table

Model 1: Sales ~ Price + US
Model 2: Sales ~ Price + Urban + US
  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1    397 2420.9                           
2    396 2420.8  1   0.03979 0.0065 0.9357

(g) 95% confidence intervals

# 95% confidence intervals for the coefficients of the reduced model
# (level = 0.95 is the default, spelled out here for clarity).
confint(model2, level = 0.95)
                  2.5 %      97.5 %
(Intercept) 11.79032020 14.27126531
Price       -0.06475984 -0.04419543
USYes        0.69151957  1.70776632

(h) Diagnostic plots

# Draw the lm diagnostic plots on a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so later plots are not forced into
# the same grid.
old_par <- par(mfrow = c(2, 2))
plot(model2)
par(old_par)

==========================

Question 12: Regression Without Intercept

==========================

(a) Explanation:

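Without an intercept, the coefficient from regressing Y on X is sum(x*y) / sum(x^2), and the coefficient from regressing X on Y is sum(x*y) / sum(y^2).

The two coefficients are therefore equal exactly when sum(x^2) = sum(y^2) (assuming sum(x*y) is not zero). Perfect correlation is neither necessary nor sufficient: y = 2x has cor = 1 but gives coefficients 2 and 0.5.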

(b) Different coefficients

# Fix the random seed so the simulated data are reproducible.
set.seed(1)
# Predictor: 100 draws from a standard normal distribution.
x <- rnorm(100)
# Response: twice x plus independent standard-normal noise.
y <- 2 * x + rnorm(100)

# '+ 0' removes the intercept, so this fits y = b * x through the origin.
summary(lm(y ~ x + 0))  # Y on X

Call:
lm(formula = y ~ x + 0)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9154 -0.6472 -0.1771  0.5056  2.3109 

Coefficients:
  Estimate Std. Error t value Pr(>|t|)    
x   1.9939     0.1065   18.73   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9586 on 99 degrees of freedom
Multiple R-squared:  0.7798,    Adjusted R-squared:  0.7776 
F-statistic: 350.7 on 1 and 99 DF,  p-value: < 2.2e-16
# Same data with the roles swapped; '+ 0' again removes the intercept.
summary(lm(x ~ y + 0))  # X on Y

Call:
lm(formula = x ~ y + 0)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.8699 -0.2368  0.1030  0.2858  0.8938 

Coefficients:
  Estimate Std. Error t value Pr(>|t|)    
y  0.39111    0.02089   18.73   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4246 on 99 degrees of freedom
Multiple R-squared:  0.7798,    Adjusted R-squared:  0.7776 
F-statistic: 350.7 on 1 and 99 DF,  p-value: < 2.2e-16

(c) Same coefficients

# The two no-intercept slopes are sum(x * y) / sum(x^2) and
# sum(x * y) / sum(y^2), so they coincide when sum(x^2) == sum(y^2).
# Taking y to be a rearrangement of x guarantees that. (The earlier
# y <- 2 * x produced slopes 2 and 0.5, i.e. different coefficients.)
x <- 1:100
y <- rev(x)

# NOTE: re-run / re-knit after this change; any printed output captured
# before it (including the "essentially perfect fit" warning) is stale.
summary(lm(y ~ x + 0))  # Y on X
Warning in summary.lm(lm(y ~ x + 0)): essentially perfect fit: summary may be
unreliable

Call:
lm(formula = y ~ x + 0)

Residuals:
       Min         1Q     Median         3Q        Max 
-4.677e-13 -3.180e-15  6.000e-17  3.080e-15  1.537e-14 

Coefficients:
   Estimate Std. Error   t value Pr(>|t|)    
x 2.000e+00  8.144e-17 2.456e+16   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.737e-14 on 99 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 6.03e+32 on 1 and 99 DF,  p-value: < 2.2e-16
# Roles swapped, still without an intercept: the slope is sum(x * y) / sum(y^2).
summary(lm(x ~ y + 0))  # X on Y
Warning in summary.lm(lm(x ~ y + 0)): essentially perfect fit: summary may be
unreliable

Call:
lm(formula = x ~ y + 0)

Residuals:
       Min         1Q     Median         3Q        Max 
-2.338e-13 -1.589e-15  2.900e-17  1.540e-15  7.683e-15 

Coefficients:
   Estimate Std. Error   t value Pr(>|t|)    
y 5.000e-01  2.036e-17 2.456e+16   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.369e-14 on 99 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 6.03e+32 on 1 and 99 DF,  p-value: < 2.2e-16