# KNN classification predicts a categorical label for a new data point by finding its K nearest training points and assigning the label that appears most frequently among them. KNN regression predicts a continuous numeric value by averaging the response values of the K nearest neighbors.
# (a)
library(ISLR)
data(Auto)
# Keep every column except `name`. A logical mask is used instead of
# `-which(...)`: if no column matched, `-which(...)` would evaluate to
# `-integer(0)` and silently select zero columns instead of all of them.
auto_no_name <- Auto[, names(Auto) != "name", drop = FALSE]
# Scatterplot matrix of the remaining columns.
pairs(auto_no_name)
# (b)
# Pairwise correlation matrix of the same columns.
cor(auto_no_name)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# (c)
# Multiple linear regression of mpg on every other column of Auto except
# `name` (`.` expands to all remaining columns; `- name` removes that term).
lm_fit <- lm(formula = mpg ~ . - name, data = Auto)
# Show the residual summary, coefficient table and overall fit statistics.
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# (d)
# Draw the diagnostic plots for lm_fit on a single 2 x 2 page. par() returns
# the previous settings, which are restored afterwards so the 2 x 2 layout
# does not leak into any plot drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(lm_fit)
par(old_par)
# (e)
# Regress mpg on the seven listed predictors plus all of their pairwise
# interactions: `( ... )^2` in a formula expands to the main effects and
# every two-way interaction term.
lm_interact <- lm(
  formula = mpg ~ (cylinders + displacement + horsepower + weight +
    acceleration + year + origin)^2,
  data = Auto
)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ (cylinders + displacement + horsepower + weight +
## acceleration + year + origin)^2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
# (f)
# Refit mpg with transformed predictors: log of horsepower, square root of
# weight and the square of year (I() protects `^` from formula syntax);
# the remaining predictors enter untransformed.
lm_trans <- lm(
  formula = mpg ~ log(horsepower) + sqrt(weight) + I(year^2) +
    cylinders + displacement + acceleration + origin,
  data = Auto
)
summary(lm_trans)
##
## Call:
## lm(formula = mpg ~ log(horsepower) + sqrt(weight) + I(year^2) +
## cylinders + displacement + acceleration + origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.1542 -1.8515 -0.1238 1.7169 12.5832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.4881772 6.6860947 9.047 < 2e-16 ***
## log(horsepower) -7.5792964 1.5446639 -4.907 1.37e-06 ***
## sqrt(weight) -0.5808214 0.0750906 -7.735 9.20e-14 ***
## I(year^2) 0.0048422 0.0003108 15.581 < 2e-16 ***
## cylinders -0.4267968 0.2974801 -1.435 0.1522
## displacement 0.0213509 0.0065425 3.263 0.0012 **
## acceleration -0.1802562 0.1027472 -1.754 0.0802 .
## origin 1.3182159 0.2529896 5.211 3.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.083 on 384 degrees of freedom
## Multiple R-squared: 0.8467, Adjusted R-squared: 0.8439
## F-statistic: 303.1 on 7 and 384 DF, p-value: < 2.2e-16
# (a)
# Load the Carseats data and regress Sales on Price, Urban and US.
# Note: this assigns to `lm_fit` again, replacing the earlier Auto model.
data(Carseats)
lm_fit <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(lm_fit)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# (b) The coefficient for Price is -0.0545: holding Urban and US fixed, each one-unit increase in Price is associated with a decrease in Sales of about 0.0545 units. The coefficient for UrbanYes is -0.022: Urban stores have slightly lower predicted Sales than non-Urban stores, but this difference is not statistically significant (p = 0.936). The coefficient for USYes is 1.20: holding the other predictors fixed, stores located in the US tend to have Sales about 1.20 units higher than stores outside the US. The intercept is 13.04: when Price is zero and the store is in a non-Urban, non-US location, the predicted Sales are about 13.04 units.
# (c) Sales = 13.04 − 0.0545 × Price − 0.022 × UrbanYes + 1.20 × USYes
# (d) We reject the null hypothesis (H₀: β = 0) for Price and USYes because their p-values are much less than 0.05, indicating they have a statistically significant relationship with Sales. We fail to reject the null hypothesis for UrbanYes since its p-value is very large (0.936).
# (e)
# Reduced model: the same regression of Sales with the Urban term left out,
# keeping only Price and US.
lm_fit2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(lm_fit2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# (f)
# Compare the adjusted R-squared of the model with Urban (lm_fit) and the
# model without it (lm_fit2); `[[` extracts the component by exact name.
summary(lm_fit)[["adj.r.squared"]]
## [1] 0.2335123
summary(lm_fit2)[["adj.r.squared"]]
## [1] 0.2354305
# (g)
# 95% confidence intervals (2.5% and 97.5% bounds) for the coefficients of
# the reduced model.
confint(object = lm_fit2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# (h)
# Draw the diagnostic plots for lm_fit2 on a single 2 x 2 page. par() returns
# the previous settings, which are restored afterwards so the 2 x 2 layout
# does not leak into any plot drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(lm_fit2)
par(old_par)
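# (a) For regression without an intercept, the coefficient estimate for the regression of X onto Y is sum(x * y) / sum(y^2) and the estimate for the regression of Y onto X is sum(x * y) / sum(x^2). The two estimates are therefore the same when the sum of squares of X equals the sum of squares of Y, i.e. sum(x^2) = sum(y^2).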
# (b)
# Example in which the two no-intercept slopes differ: y is 2 * x plus
# standard normal noise, so sum(y^2) is not equal to sum(x^2).
set.seed(1)
x <- rnorm(n = 100)
noise <- rnorm(n = 100)
y <- 2 * x + noise
# Both slopes share the numerator sum(x * y); only the denominator differs.
cross_xy <- sum(x * y)
beta_y_on_x <- cross_xy / sum(x^2)
beta_x_on_y <- cross_xy / sum(y^2)
beta_y_on_x
## [1] 1.993876
beta_x_on_y
## [1] 0.3911145
# (c)
# Example in which the two no-intercept slopes coincide. Both slopes share
# the numerator sum(x * y) and differ only in the denominator, so they are
# equal exactly when sum(x^2) equals sum(y^2). Taking y as a random
# permutation of x guarantees that. (A rescaling such as y <- 3 * x does NOT
# work: it gives slopes 3 and 1/3.)
set.seed(2)
x <- rnorm(100)
y <- sample(x)
cross_xy <- sum(x * y)
beta_y_on_x <- cross_xy / sum(x^2)
beta_x_on_y <- cross_xy / sum(y^2)
# The two sums of squares add the same terms in a different order, so the
# slopes are compared with a numerical tolerance rather than `==`.
isTRUE(all.equal(beta_y_on_x, beta_x_on_y))
## [1] TRUE