| Feature | KNN Classifier | KNN Regression |
|---|---|---|
| Task | Classification | Regression |
| Output Type | Categorical (labels) | Continuous (numerical) |
| Decision Rule | Majority Vote | Mean/Weighted Mean |
| Example | Identifying animals in images | Predicting house prices |
Here’s a basic example using the class package for KNN classification and
the FNN package for KNN regression:
library(class)
# Sample data (iris dataset): train on 70% of the rows, test on the rest.
data(iris)
set.seed(42)
# seq_len() stays correct for a zero-row data frame, and floor() makes the
# integer sample size explicit instead of relying on sample() truncating it.
train_idx <- sample(seq_len(nrow(iris)), floor(0.7 * nrow(iris)))
# Select predictors and labels by name rather than by column position.
feature_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
train_data <- iris[train_idx, feature_cols]
train_labels <- iris$Species[train_idx]
test_data <- iris[-train_idx, feature_cols]
# Namespace-qualified because library(FNN), loaded later in this file, masks
# knn(); this keeps the call bound to the class implementation.
y_pred <- class::knn(
  train = train_data,
  test = test_data,
  cl = train_labels,
  k = 3
)
# Confusion matrix: predicted species (rows) against true species (columns).
table(y_pred, iris[-train_idx, "Species"])
##
## y_pred setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 15 2
## virginica 0 0 16
library(FNN)
##
## Attaching package: 'FNN'
## The following objects are masked from 'package:class':
##
## knn, knn.cv
# Sample regression dataset (cars dataset): predict stopping distance from
# speed, training on 70% of the rows.
data(cars)
# No new seed is set here, so the split continues the current RNG stream.
# seq_len()/floor() replace 1:nrow() and the implicit size truncation.
train_idx <- sample(seq_len(nrow(cars)), floor(0.7 * nrow(cars)))
# Columns are picked by name; drop = FALSE keeps a one-column data frame so
# as.matrix() yields a one-column matrix rather than a bare vector.
train_x <- as.matrix(cars[train_idx, "speed", drop = FALSE])
train_y <- cars$dist[train_idx]
test_x <- as.matrix(cars[-train_idx, "speed", drop = FALSE])
# Namespace-qualified so the call does not depend on package load order.
knn_model <- FNN::knn.reg(train = train_x, test = test_x, y = train_y, k = 3)
# All observations first, then the KNN predictions for the held-out speeds
# overlaid in red.
plot(cars, pch = 16)
points(test_x, knn_model$pred, col = "red", pch = 16)
This comparison shows how KNN handles classification (labels) and regression (numerical predictions).
library(ISLR)
library(car)
## Loading required package: carData
library(GGally) # Load GGally package for ggpairs
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Load the dataset
data(Auto)
# Drop the qualitative `name` column by name (not by the hard-coded position
# 9) and do it once, so both steps below use the same numeric columns.
auto_numeric <- Auto[, names(Auto) != "name"]
# (a) Scatterplot matrix
pairs(auto_numeric)
# (b) Compute correlation matrix (excluding 'name')
cor_matrix <- cor(auto_numeric)
print(cor_matrix)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# (c) Multiple linear regression of mpg on every other column except the
# qualitative `name`.
lm_model <- lm(
  formula = mpg ~ . - name,
  data = Auto
)
summary(lm_model)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# (d) Diagnostic plots
# Lay the lm diagnostic plots out on one 2 x 2 page, then restore the previous
# graphics settings so later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(lm_model)
par(old_par)
# (e) Interaction effects: all main effects plus every pairwise interaction
# among the predictors other than `name`.
lm_interaction <- lm(mpg ~ (. - name)^2, data = Auto)
summary(lm_interaction)
##
## Call:
## lm(formula = mpg ~ (. - name)^2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
# (f) Transformations: add log, square-root, and squared versions of
# horsepower, weight, and displacement to the full main-effects model.
lm_log <- lm(
  mpg ~ log(horsepower) + log(weight) + log(displacement) + . - name,
  data = Auto
)
lm_sqrt <- lm(
  mpg ~ sqrt(horsepower) + sqrt(weight) + sqrt(displacement) + . - name,
  data = Auto
)
lm_squared <- lm(
  mpg ~ I(horsepower^2) + I(weight^2) + I(displacement^2) + . - name,
  data = Auto
)
summary(lm_log)
##
## Call:
## lm(formula = mpg ~ log(horsepower) + log(weight) + log(displacement) +
## . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3771 -1.5128 -0.1639 1.4299 12.0439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.413301 39.817860 4.556 7.03e-06 ***
## log(horsepower) -17.314001 3.664225 -4.725 3.24e-06 ***
## log(weight) -17.516336 7.193802 -2.435 0.01535 *
## log(displacement) -2.477384 2.839507 -0.872 0.38350
## cylinders -0.132894 0.298216 -0.446 0.65612
## displacement 0.010675 0.014195 0.752 0.45247
## horsepower 0.101199 0.030912 3.274 0.00116 **
## weight 0.001833 0.002202 0.832 0.40578
## acceleration -0.212557 0.100006 -2.125 0.03419 *
## year 0.770245 0.045241 17.026 < 2e-16 ***
## origin 0.628231 0.268398 2.341 0.01976 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.913 on 381 degrees of freedom
## Multiple R-squared: 0.8643, Adjusted R-squared: 0.8607
## F-statistic: 242.7 on 10 and 381 DF, p-value: < 2.2e-16
# Fit summary for the model that adds the square-root terms.
summary(lm_sqrt)
##
## Call:
## lm(formula = mpg ~ sqrt(horsepower) + sqrt(weight) + sqrt(displacement) +
## . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3995 -1.4843 -0.1259 1.4221 11.9602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.002781 10.785960 5.563 5.00e-08 ***
## sqrt(horsepower) -6.473917 1.429856 -4.528 7.99e-06 ***
## sqrt(weight) -1.264104 0.522144 -2.421 0.015945 *
## sqrt(displacement) -1.010947 0.851007 -1.188 0.235596
## cylinders 0.010899 0.304322 0.036 0.971450
## displacement 0.031851 0.028317 1.125 0.261378
## horsepower 0.245778 0.063431 3.875 0.000126 ***
## weight 0.007516 0.004480 1.678 0.094246 .
## acceleration -0.208077 0.100265 -2.075 0.038632 *
## year 0.769576 0.045142 17.048 < 2e-16 ***
## origin 0.604507 0.268150 2.254 0.024741 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.905 on 381 degrees of freedom
## Multiple R-squared: 0.865, Adjusted R-squared: 0.8614
## F-statistic: 244.1 on 10 and 381 DF, p-value: < 2.2e-16
# Fit summary for the model that adds the squared terms.
summary(lm_squared)
##
## Call:
## lm(formula = mpg ~ I(horsepower^2) + I(weight^2) + I(displacement^2) +
## . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2232 -1.5534 -0.0931 1.4304 11.9162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.509e+00 4.789e+00 0.733 0.464247
## I(horsepower^2) 5.268e-04 1.384e-04 3.807 0.000164 ***
## I(weight^2) 1.047e-06 3.488e-07 3.002 0.002862 **
## I(displacement^2) 6.324e-05 3.463e-05 1.826 0.068661 .
## cylinders 4.113e-01 3.275e-01 1.256 0.209886
## displacement -3.513e-02 2.005e-02 -1.752 0.080556 .
## horsepower -1.915e-01 4.096e-02 -4.675 4.09e-06 ***
## weight -1.067e-02 2.590e-03 -4.122 4.62e-05 ***
## acceleration -1.735e-01 1.004e-01 -1.728 0.084870 .
## year 7.692e-01 4.512e-02 17.048 < 2e-16 ***
## origin 5.788e-01 2.668e-01 2.170 0.030643 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.902 on 381 degrees of freedom
## Multiple R-squared: 0.8653, Adjusted R-squared: 0.8618
## F-statistic: 244.8 on 10 and 381 DF, p-value: < 2.2e-16
10. This question should be answered using the Carseats data set.
(a) Fit a multiple regression model to predict Sales using Price, Urban, and US.
library(ISLR)
# Load the data set explicitly instead of attach()-ing it: the models fitted
# below all pass `data = Carseats`, and attach() would put the columns on the
# search path where they can silently mask, or be masked by, other objects.
data(Carseats)
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
# Per-column overview: quantiles for numeric columns, level counts for factors.
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age Education
## Min. : 10.0 Min. : 24.0 Bad : 96 Min. :25.00 Min. :10.0
## 1st Qu.:139.0 1st Qu.:100.0 Good : 85 1st Qu.:39.75 1st Qu.:12.0
## Median :272.0 Median :117.0 Medium:219 Median :54.50 Median :14.0
## Mean :264.8 Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :509.0 Max. :191.0 Max. :80.00 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
# Regress Sales on Price plus the two Yes/No factors Urban and US.
fit <- lm(
  formula = Sales ~ Price + Urban + US,
  data = Carseats
)
summary(fit)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# Extract the Price slope by name so the lookup does not depend on the order
# of terms in the model.
coef(fit)["Price"]
## Price
## -0.05445885
(b) Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!
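The coefficient for Price is -0.054459, which means that, holding Urban and
US fixed, every one-dollar increase in the price of the car seat is associated
with an average decrease in sales of about 54 units (Sales is recorded in
thousands of units, not in dollars).
The coefficient for Urban == Yes is -0.021916, which means that, on average,
urban stores sell about 22 fewer units than rural stores with the same price
and US status; this difference is not statistically significant (p = 0.936).
The coefficient for US == Yes is 1.200573, which means that,
on average, US stores sell about 1,200 more car seats than stores
outside the US, holding Price and Urban fixed.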
(c) Write out the model in equation form, being careful to handle the qualitative variables properly.
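\(\widehat{Sales} = 13.04 - 0.054 \times Price - 0.022 \times Urban_{Yes} + 1.20 \times US_{Yes}\),
where \(Urban_{Yes}\) is 1 for a store in an urban location and 0 otherwise, and \(US_{Yes}\) is 1 for a store in the US and 0 otherwise.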
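
| Term | Estimate | Std. Error | t value | p-value | Signif. |
|---|---|---|---|---|---|
| (Intercept) | 13.043469 | 0.651012 | 20.036 | < 2e-16 | `***` |
| Price | -0.054459 | 0.005242 | -10.389 | < 2e-16 | `***` |
| UrbanYes | -0.021916 | 0.271650 | -0.081 | 0.936 | |
| USYes | 1.200573 | 0.259042 | 4.635 | 4.86e-06 | `***` |
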
Based on the p-values:

- Price and US are significant predictors (p-values < 0.05).
- Urban is not significant (p = 0.936).
# Refit with Price and US only, leaving Urban out.
fit_reduced <- lm(
  formula = Sales ~ Price + US,
  data = Carseats
)
summary(fit_reduced)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# 95% confidence intervals for the reduced model's coefficients.
confint(fit_reduced, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# Diagnostic plots for the reduced model on one 2 x 2 page; the previous
# graphics settings are restored afterwards so later plots are unaffected.
old_par <- par(mfrow = c(2, 2))
plot(fit_reduced)
par(old_par)
- Residual plots suggest some outliers but no severe leverage points.
- Cook’s distance indicates no highly influential observations.
## Question 12:
``` r
# Simulate n points with Y roughly twice X plus standard normal noise.
set.seed(123)
n <- 100
X <- rnorm(n)
Y <- 2 * X + rnorm(n)
# (a) Condition for equality
# Both fits below omit the intercept, so the slope of Y onto X is
# sum(X * Y) / sum(X^2) and the slope of X onto Y is sum(X * Y) / sum(Y^2).
# The two estimates are therefore the same exactly when
# sum(X^2) == sum(Y^2) (equal sums of squares, not merely equal variances).
# (b) Example where coefficients differ
# `- 1` removes the intercept from each regression.
lm_YX <- lm(Y ~ X - 1)
lm_XY <- lm(X ~ Y - 1)
summary(lm_YX)
##
## Call:
## lm(formula = Y ~ X - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0010 -0.7901 -0.1800 0.4693 3.1762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X 1.9364 0.1064 18.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9713 on 99 degrees of freedom
## Multiple R-squared: 0.7698, Adjusted R-squared: 0.7675
## F-statistic: 331.1 on 1 and 99 DF, p-value: < 2.2e-16
# Reverse regression (X onto Y, no intercept) for comparison with the fit above.
summary(lm_XY)
##
## Call:
## lm(formula = X ~ Y - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49720 -0.18013 0.07056 0.35235 1.04653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Y 0.39757 0.02185 18.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4401 on 99 degrees of freedom
## Multiple R-squared: 0.7698, Adjusted R-squared: 0.7675
## F-statistic: 331.1 on 1 and 99 DF, p-value: < 2.2e-16
# (c) Example where coefficients are the same
# Without an intercept the two slopes are sum(x * y) / sum(x^2) and
# sum(x * y) / sum(y^2), so they coincide exactly when sum(x^2) == sum(y^2).
# Rescaling X_new by a constant c does not achieve this: it gives a perfect
# fit with slopes c and 1 / c. A random permutation of X_new has the same sum
# of squares by construction, so both regressions share one slope.
# NOTE(review): re-knit after this change; the rendered output that follows
# this code was produced by the earlier rescaling version.
X_new <- rnorm(n, mean = 0, sd = 1)
Y_new <- sample(X_new)
lm_YX_same <- lm(Y_new ~ X_new - 1)
lm_XY_same <- lm(X_new ~ Y_new - 1)
summary(lm_YX_same)
## Warning in summary.lm(lm_YX_same): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = Y_new ~ X_new - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.446e-16 -3.155e-17 6.240e-18 7.363e-17 4.242e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X_new 2.216e+00 1.546e-17 1.434e+17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.473e-16 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.056e+34 on 1 and 99 DF, p-value: < 2.2e-16
# Reverse regression (X_new onto Y_new, no intercept) for comparison with the
# fit above.
summary(lm_XY_same)
## Warning in summary.lm(lm_XY_same): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = X_new ~ Y_new - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.686e-16 -5.003e-17 -2.270e-18 1.469e-17 1.701e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Y_new 4.512e-01 8.732e-18 5.167e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.844e-16 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.67e+33 on 1 and 99 DF, p-value: < 2.2e-16