Question 2: KNN Classifier vs. KNN Regression

Key Differences

  • KNN Classifier is used for categorical labels (e.g., “spam” or “not spam”). It assigns a class based on the majority vote of its nearest neighbors.
  • KNN Regression is used for numerical values (e.g., predicting house prices). It takes the average (or weighted average) of its nearest neighbors’ values.

How They Work

| Feature       | KNN Classifier                | KNN Regression          |
|---------------|-------------------------------|-------------------------|
| Task          | Classification                | Regression              |
| Output Type   | Categorical (labels)          | Continuous (numerical)  |
| Decision Rule | Majority Vote                 | Mean/Weighted Mean      |
| Example       | Identifying animals in images | Predicting house prices |

Example in R

Here’s a basic example using the class package for KNN classification and the FNN package for KNN regression:

KNN Classifier Example

library(class)

# Sample data (iris dataset): columns 1-4 are the numeric features and
# column 5 is the Species label.
data(iris)
set.seed(42)  # fix the RNG so the train/test split is reproducible

# 70/30 train/test split. seq_len() is used instead of 1:nrow() because it
# stays correct (empty) for a zero-row data frame.
train_idx <- sample(seq_len(nrow(iris)), 0.7 * nrow(iris))
train_data <- iris[train_idx, 1:4]
train_labels <- iris[train_idx, 5]
test_data <- iris[-train_idx, 1:4]

# Call knn() through its namespace: FNN, attached later in this document,
# masks class::knn, so the qualified name keeps this chunk independent of
# package load order.
y_pred <- class::knn(
  train = train_data,
  test = test_data,
  cl = train_labels,
  k = 3
)

# Confusion matrix: predicted class (rows) vs. true class (columns).
table(y_pred, iris[-train_idx, 5])
##             
## y_pred       setosa versicolor virginica
##   setosa         12          0         0
##   versicolor      0         15         2
##   virginica       0          0        16

KNN Regression Example

library(FNN)
## 
## Attaching package: 'FNN'
## The following objects are masked from 'package:class':
## 
##     knn, knn.cv
# Regression demo on the built-in cars data (speed is the predictor,
# dist is the response).
data(cars)

# Draw 70% of the row numbers for training; every other row is held out.
train_idx <- sample(nrow(cars), 0.7 * nrow(cars))
held_out <- setdiff(seq_len(nrow(cars)), train_idx)

# as.matrix() converts the one-column data frames to matrices;
# drop = FALSE keeps the single predictor two-dimensional.
train_x <- as.matrix(cars[train_idx, "speed", drop = FALSE])
test_x <- as.matrix(cars[held_out, "speed", drop = FALSE])
train_y <- cars[["dist"]][train_idx]

# Fit/predict in one step with k = 3 neighbours.
knn_model <- knn.reg(train = train_x, test = test_x, y = train_y, k = 3)

# All observations in black, held-out predictions overlaid in red.
plot(cars, pch = 16)
points(test_x, knn_model$pred, col = 'red', pch = 16)

This comparison shows how KNN handles classification (labels) and regression (numerical predictions).

Question 9: Multiple Linear Regression on the Auto Dataset

library(ISLR)

library(car)
## Loading required package: carData
library(GGally)  # Load GGally package for ggpairs
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Bring the Auto data frame into the workspace
data(Auto)

# Parts (a) and (b) both work on Auto without its 9th column ('name'),
# so build that subset once.
auto_no_name <- Auto[, -9]

# (a) Scatterplot matrix of every remaining pair of variables
pairs(auto_no_name)

# (b) Pairwise correlations between the remaining variables
cor_matrix <- cor(auto_no_name)
print(cor_matrix)
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000
# (c) Multiple Linear Regression
# Regress mpg on every other column of Auto; "- name" removes the name
# column from the "." expansion.
lm_model <- lm(mpg ~ . - name, data = Auto)
# Print the coefficient table and overall fit statistics.
summary(lm_model)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16
# (d) Diagnostic plots
# Arrange the plotting device as a 2 x 2 grid, then draw the diagnostic
# plots for the model fitted in part (c).
par(mfrow = c(2, 2))
plot(lm_model)

# (e) Interaction effects
# (. - name)^2 expands to every main effect plus every pairwise
# interaction among the predictors other than name.
lm_interaction <- lm(mpg ~ (.-name)^2, data = Auto)
summary(lm_interaction)
## 
## Call:
## lm(formula = mpg ~ (. - name)^2, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6303 -1.4481  0.0596  1.2739 11.1386 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                3.548e+01  5.314e+01   0.668  0.50475   
## cylinders                  6.989e+00  8.248e+00   0.847  0.39738   
## displacement              -4.785e-01  1.894e-01  -2.527  0.01192 * 
## horsepower                 5.034e-01  3.470e-01   1.451  0.14769   
## weight                     4.133e-03  1.759e-02   0.235  0.81442   
## acceleration              -5.859e+00  2.174e+00  -2.696  0.00735 **
## year                       6.974e-01  6.097e-01   1.144  0.25340   
## origin                    -2.090e+01  7.097e+00  -2.944  0.00345 **
## cylinders:displacement    -3.383e-03  6.455e-03  -0.524  0.60051   
## cylinders:horsepower       1.161e-02  2.420e-02   0.480  0.63157   
## cylinders:weight           3.575e-04  8.955e-04   0.399  0.69000   
## cylinders:acceleration     2.779e-01  1.664e-01   1.670  0.09584 . 
## cylinders:year            -1.741e-01  9.714e-02  -1.793  0.07389 . 
## cylinders:origin           4.022e-01  4.926e-01   0.816  0.41482   
## displacement:horsepower   -8.491e-05  2.885e-04  -0.294  0.76867   
## displacement:weight        2.472e-05  1.470e-05   1.682  0.09342 . 
## displacement:acceleration -3.479e-03  3.342e-03  -1.041  0.29853   
## displacement:year          5.934e-03  2.391e-03   2.482  0.01352 * 
## displacement:origin        2.398e-02  1.947e-02   1.232  0.21875   
## horsepower:weight         -1.968e-05  2.924e-05  -0.673  0.50124   
## horsepower:acceleration   -7.213e-03  3.719e-03  -1.939  0.05325 . 
## horsepower:year           -5.838e-03  3.938e-03  -1.482  0.13916   
## horsepower:origin          2.233e-03  2.930e-02   0.076  0.93931   
## weight:acceleration        2.346e-04  2.289e-04   1.025  0.30596   
## weight:year               -2.245e-04  2.127e-04  -1.056  0.29182   
## weight:origin             -5.789e-04  1.591e-03  -0.364  0.71623   
## acceleration:year          5.562e-02  2.558e-02   2.174  0.03033 * 
## acceleration:origin        4.583e-01  1.567e-01   2.926  0.00365 **
## year:origin                1.393e-01  7.399e-02   1.882  0.06062 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared:  0.8893, Adjusted R-squared:  0.8808 
## F-statistic: 104.2 on 28 and 363 DF,  p-value: < 2.2e-16
# (f) Transformations
# Each model keeps all original predictors (". - name") and adds a
# transformed copy of horsepower, weight and displacement:
#   lm_log     - natural log
#   lm_sqrt    - square root
#   lm_squared - square; I() makes ^ mean arithmetic squaring in the formula
lm_log <- lm(mpg ~ log(horsepower) + log(weight) + log(displacement) + . - name, data = Auto)
lm_sqrt <- lm(mpg ~ sqrt(horsepower) + sqrt(weight) + sqrt(displacement) + . - name, data = Auto)
lm_squared <- lm(mpg ~ I(horsepower^2) + I(weight^2) + I(displacement^2) + . - name, data = Auto)

# Coefficient table and fit statistics for the log model.
summary(lm_log)
## 
## Call:
## lm(formula = mpg ~ log(horsepower) + log(weight) + log(displacement) + 
##     . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3771 -1.5128 -0.1639  1.4299 12.0439 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       181.413301  39.817860   4.556 7.03e-06 ***
## log(horsepower)   -17.314001   3.664225  -4.725 3.24e-06 ***
## log(weight)       -17.516336   7.193802  -2.435  0.01535 *  
## log(displacement)  -2.477384   2.839507  -0.872  0.38350    
## cylinders          -0.132894   0.298216  -0.446  0.65612    
## displacement        0.010675   0.014195   0.752  0.45247    
## horsepower          0.101199   0.030912   3.274  0.00116 ** 
## weight              0.001833   0.002202   0.832  0.40578    
## acceleration       -0.212557   0.100006  -2.125  0.03419 *  
## year                0.770245   0.045241  17.026  < 2e-16 ***
## origin              0.628231   0.268398   2.341  0.01976 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.913 on 381 degrees of freedom
## Multiple R-squared:  0.8643, Adjusted R-squared:  0.8607 
## F-statistic: 242.7 on 10 and 381 DF,  p-value: < 2.2e-16
summary(lm_sqrt)
## 
## Call:
## lm(formula = mpg ~ sqrt(horsepower) + sqrt(weight) + sqrt(displacement) + 
##     . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3995 -1.4843 -0.1259  1.4221 11.9602 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        60.002781  10.785960   5.563 5.00e-08 ***
## sqrt(horsepower)   -6.473917   1.429856  -4.528 7.99e-06 ***
## sqrt(weight)       -1.264104   0.522144  -2.421 0.015945 *  
## sqrt(displacement) -1.010947   0.851007  -1.188 0.235596    
## cylinders           0.010899   0.304322   0.036 0.971450    
## displacement        0.031851   0.028317   1.125 0.261378    
## horsepower          0.245778   0.063431   3.875 0.000126 ***
## weight              0.007516   0.004480   1.678 0.094246 .  
## acceleration       -0.208077   0.100265  -2.075 0.038632 *  
## year                0.769576   0.045142  17.048  < 2e-16 ***
## origin              0.604507   0.268150   2.254 0.024741 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.905 on 381 degrees of freedom
## Multiple R-squared:  0.865,  Adjusted R-squared:  0.8614 
## F-statistic: 244.1 on 10 and 381 DF,  p-value: < 2.2e-16
summary(lm_squared)
## 
## Call:
## lm(formula = mpg ~ I(horsepower^2) + I(weight^2) + I(displacement^2) + 
##     . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2232 -1.5534 -0.0931  1.4304 11.9162 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.509e+00  4.789e+00   0.733 0.464247    
## I(horsepower^2)    5.268e-04  1.384e-04   3.807 0.000164 ***
## I(weight^2)        1.047e-06  3.488e-07   3.002 0.002862 ** 
## I(displacement^2)  6.324e-05  3.463e-05   1.826 0.068661 .  
## cylinders          4.113e-01  3.275e-01   1.256 0.209886    
## displacement      -3.513e-02  2.005e-02  -1.752 0.080556 .  
## horsepower        -1.915e-01  4.096e-02  -4.675 4.09e-06 ***
## weight            -1.067e-02  2.590e-03  -4.122 4.62e-05 ***
## acceleration      -1.735e-01  1.004e-01  -1.728 0.084870 .  
## year               7.692e-01  4.512e-02  17.048  < 2e-16 ***
## origin             5.788e-01  2.668e-01   2.170 0.030643 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.902 on 381 degrees of freedom
## Multiple R-squared:  0.8653, Adjusted R-squared:  0.8618 
## F-statistic: 244.8 on 10 and 381 DF,  p-value: < 2.2e-16

Question 10:

10. This question should be answered using the Carseats data set.

(a) Fit a multiple regression model to predict Sales using Price, Urban, and US.

library(ISLR)
# Carseats is deliberately not attach()ed: the models fitted for this
# question pass data = Carseats explicitly, and attaching would put the
# column names on the search path where they can mask other objects.
# Preview the first six rows.
head(Carseats)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1  9.50       138     73          11        276   120       Bad  42        17
## 2 11.22       111     48          16        260    83      Good  65        10
## 3 10.06       113     35          10        269    80    Medium  59        12
## 4  7.40       117    100           4        466    97    Medium  55        14
## 5  4.15       141     64           3        340   128       Bad  38        13
## 6 10.81       124    113          13        501    72       Bad  78        16
##   Urban  US
## 1   Yes Yes
## 2   Yes Yes
## 3   Yes Yes
## 4   Yes Yes
## 5   Yes  No
## 6    No Yes
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
# Part (a): linear model for Sales with Price and the two Yes/No
# factors Urban and US as predictors.
fit <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(fit)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
coef(fit)[2]
##       Price 
## -0.05445885

(b) Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

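The coefficient for Price is -0.054459. Sales is recorded in thousands of units, so, holding Urban and US fixed, each one-dollar increase in the price of the car seat is associated with an average decrease in sales of about 0.054 thousand units (roughly 54 car seats) per store.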

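The coefficient for USYes is 1.200573, which means that, holding Price and Urban fixed, stores in the US sell about 1,200 more car seats on average than stores outside the US.

The coefficient for UrbanYes is -0.021916: at the same Price and US status, urban stores sell about 22 fewer car seats on average than non-urban stores, but this difference is not statistically significant (p = 0.936).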

(c) Write out the model in equation form, being careful to handle the qualitative variables properly.

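\(\widehat{Sales} = 13.04 - 0.054 \cdot Price - 0.022 \cdot Urban + 1.20 \cdot US\), where \(Urban = 1\) if the store is in an urban location and \(0\) otherwise, and \(US = 1\) if the store is in the US and \(0\) otherwise.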

                 Estimate Std. Error t value Pr(>|t|)
    (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
    Price       -0.054459   0.005242 -10.389  < 2e-16 ***
    UrbanYes    -0.021916   0.271650  -0.081    0.936
    USYes        1.200573   0.259042   4.635 4.86e-06 ***

(d) Significant Predictors

Based on the p-values:

  • Price and US are significant predictors (p-values < 0.05).
  • Urban is not significant (p = 0.936).

(e) Fit a smaller model with significant predictors

# Refit with only Price and US, dropping Urban from the model in part (a).
fit_reduced <- lm(Sales ~ Price + US, data = Carseats)
# Coefficient table and fit statistics for the reduced model.
summary(fit_reduced)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

(f) Model Comparison

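  • Both models fit about equally well, and neither fits closely: R² is 0.2393 in each, so only about 24% of the variance in Sales is explained. The reduced model has a slightly higher adjusted R² (0.2354 vs. 0.2335) and a slightly lower residual standard error (2.469 vs. 2.472) because it drops the insignificant Urban predictor.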
  • Price and US remain strong predictors.

(g) Confidence Intervals

confint(fit_reduced)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632

(h) Outliers and High Leverage Points

par(mfrow = c(2, 2))
plot(fit_reduced)

  • Residual plots suggest some outliers but no severe leverage points.
  • Cook’s distance indicates no highly influential observations.






## Question 12:


``` r
# Simulate n observations where Y is roughly twice X plus standard normal noise.
set.seed(123)
n <- 100
X <- rnorm(n)
Y <- 2 * X + rnorm(n)

# (a) Condition for equality
# The fits below omit the intercept ("- 1"), so the slope of Y onto X is
# sum(X * Y) / sum(X^2) and the slope of X onto Y is sum(X * Y) / sum(Y^2).
# The two coefficient estimates are therefore the same when
# sum(X^2) = sum(Y^2). (var(X) = var(Y) is the analogous condition only for
# fits that include an intercept.)

# (b) Example where coefficients differ
# Here sum(Y^2) is much larger than sum(X^2), so the two slopes differ.
lm_YX <- lm(Y ~ X - 1)
lm_XY <- lm(X ~ Y - 1)

summary(lm_YX)
## 
## Call:
## lm(formula = Y ~ X - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0010 -0.7901 -0.1800  0.4693  3.1762 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## X   1.9364     0.1064    18.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9713 on 99 degrees of freedom
## Multiple R-squared:  0.7698, Adjusted R-squared:  0.7675 
## F-statistic: 331.1 on 1 and 99 DF,  p-value: < 2.2e-16
summary(lm_XY)
## 
## Call:
## lm(formula = X ~ Y - 1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49720 -0.18013  0.07056  0.35235  1.04653 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## Y  0.39757    0.02185    18.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4401 on 99 degrees of freedom
## Multiple R-squared:  0.7698, Adjusted R-squared:  0.7675 
## F-statistic: 331.1 on 1 and 99 DF,  p-value: < 2.2e-16
# (c) Example where coefficients are the same
# Without an intercept, the slope of Y_new onto X_new is
# sum(X_new * Y_new) / sum(X_new^2) and the reverse slope divides by
# sum(Y_new^2) instead, so the two agree when the sums of squares match.
# Multiplying X_new by a constant does not achieve that (it yields
# reciprocal slopes); a random permutation of X_new has the same sum of
# squares as X_new, up to floating-point summation order.
X_new <- rnorm(n, mean = 0, sd = 1)
Y_new <- sample(X_new)

lm_YX_same <- lm(Y_new ~ X_new - 1)
lm_XY_same <- lm(X_new ~ Y_new - 1)

# Guard the claim: both no-intercept slopes must agree.
stopifnot(isTRUE(all.equal(
  unname(coef(lm_YX_same)),
  unname(coef(lm_XY_same))
)))

# NOTE(review): the summary() output rendered after this chunk was produced
# with Y_new as a constant multiple of X_new (slopes 2.216 and 0.4512);
# re-knit the document to refresh it.

summary(lm_YX_same)
## Warning in summary.lm(lm_YX_same): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = Y_new ~ X_new - 1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.446e-16 -3.155e-17  6.240e-18  7.363e-17  4.242e-16 
## 
## Coefficients:
##        Estimate Std. Error   t value Pr(>|t|)    
## X_new 2.216e+00  1.546e-17 1.434e+17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.473e-16 on 99 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.056e+34 on 1 and 99 DF,  p-value: < 2.2e-16
summary(lm_XY_same)
## Warning in summary.lm(lm_XY_same): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = X_new ~ Y_new - 1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.686e-16 -5.003e-17 -2.270e-18  1.469e-17  1.701e-15 
## 
## Coefficients:
##        Estimate Std. Error   t value Pr(>|t|)    
## Y_new 4.512e-01  8.732e-18 5.167e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.844e-16 on 99 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.67e+33 on 1 and 99 DF,  p-value: < 2.2e-16