# - K-nearest neighbor (KNN) is a non-parametric method that can be used for classification or regression. In the case of classification, a K-nearest neighbor classifier identifies the K points nearest to the observation. Then, it estimates the probability of the observation belonging to a specific class. Thus, a K-nearest neighbor classifier provides a qualitative response. KNN regression is similar to classification in that it also uses the K points nearest to the observation. However, unlike the K-nearest neighbor classifier, it provides a quantitative prediction.
#Beginning Question9
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
#(A.)
# Scatterplot matrix of the first eight columns of Auto.
pairs(Auto[, 1:8])

#(B.)
# Pairwise correlation matrix of the same eight columns.
cor(Auto[, 1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
#(C.)
# Multiple linear regression of mpg on every other column except name.
mpg_all <- lm(mpg ~ . - name, data = Auto)
summary(mpg_all)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
#i. p-value: < 2.2e-16 for an F-statistic of 252.4. This shows significant evidence of a relationship between the predictors and mpg.
#ii. Displacement, weight, year and origin are statistically significant as their p-values are below 0.05 or near zero.
#iii. The coefficient for the 'year' predictor is 0.75, which suggests that, with the other predictors held fixed, increasing it by one year will mean a vehicle's predicted mpg will be 0.75 mpg higher.
#(D.)
# Diagnostic plots for the full model, drawn on a 2x2 grid. par() returns the
# previous settings; they are restored afterwards so the grid layout does not
# leak into plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(mpg_all)
par(old_par)

# Observations whose studentized residual exceeds 3 (possible outliers).
# The residuals are computed once and reused for the test and the subset.
# Only the upper tail is checked, as in the original analysis.
stud_res <- rstudent(mpg_all)
stud_res[which(stud_res > 3)]
## 245 323 326 327
## 3.390068 4.029537 3.494823 3.690246
library(car)
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.6.3
# Variance inflation factors of the full model's predictors (collinearity check).
car::vif(mpg_all)
## cylinders displacement horsepower weight acceleration year
## 10.737535 21.836792 9.943693 10.831260 2.625806 1.244952
## origin
## 1.772386
#(E.)
# Full model plus two interaction terms: year x cylinders and
# acceleration x horsepower.
mpg_interaction <- lm(
  mpg ~ . - name + year:cylinders + acceleration:horsepower,
  data = Auto
)
summary(mpg_interaction)
##
## Call:
## lm(formula = mpg ~ . - name + year:cylinders + acceleration:horsepower,
## data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0203 -1.7318 -0.1015 1.5639 11.9559
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.862e+01 1.212e+01 -7.311 1.57e-12 ***
## cylinders 1.181e+01 2.349e+00 5.029 7.61e-07 ***
## displacement -8.775e-03 7.916e-03 -1.108 0.26838
## horsepower 9.151e-02 2.502e-02 3.658 0.00029 ***
## weight -4.269e-03 6.967e-04 -6.127 2.23e-09 ***
## acceleration 8.439e-01 1.590e-01 5.306 1.90e-07 ***
## year 1.521e+00 1.590e-01 9.569 < 2e-16 ***
## origin 1.070e+00 2.609e-01 4.102 5.00e-05 ***
## cylinders:year -1.520e-01 3.017e-02 -5.037 7.32e-07 ***
## horsepower:acceleration -9.837e-03 1.778e-03 -5.533 5.84e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.049 on 382 degrees of freedom
## Multiple R-squared: 0.8509, Adjusted R-squared: 0.8474
## F-statistic: 242.2 on 9 and 382 DF, p-value: < 2.2e-16
# cylinders:year and horsepower:acceleration are statistically significant. The R^2 has increased from 0.82 to 0.85.
# Full model plus the year:cylinders interaction and quadratic terms for
# horsepower and acceleration.
mpg_poly <- lm(
  mpg ~ . - name + year:cylinders + I(horsepower^2) + I(acceleration^2),
  data = Auto
)
summary(mpg_poly)
##
## Call:
## lm(formula = mpg ~ . - name + year:cylinders + I(horsepower^2) +
## I(acceleration^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9986 -1.5525 -0.1194 1.4348 11.7722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.394e+01 1.430e+01 -2.374 0.018075 *
## cylinders 8.481e+00 2.340e+00 3.624 0.000329 ***
## displacement -1.106e-02 7.330e-03 -1.509 0.132051
## horsepower -2.720e-01 3.531e-02 -7.703 1.16e-13 ***
## weight -3.338e-03 6.812e-04 -4.900 1.42e-06 ***
## acceleration -1.378e+00 5.421e-01 -2.542 0.011403 *
## year 1.272e+00 1.594e-01 7.982 1.71e-14 ***
## origin 1.027e+00 2.493e-01 4.121 4.63e-05 ***
## I(horsepower^2) 8.040e-04 1.140e-04 7.054 8.22e-12 ***
## I(acceleration^2) 3.351e-02 1.578e-02 2.124 0.034303 *
## cylinders:year -1.056e-01 3.023e-02 -3.493 0.000533 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.935 on 381 degrees of freedom
## Multiple R-squared: 0.8622, Adjusted R-squared: 0.8586
## F-statistic: 238.3 on 10 and 381 DF, p-value: < 2.2e-16
# Drops cylinders and adds log(weight), log(acceleration) and
# sqrt(displacement) alongside the untransformed predictors.
mpg_poly2 <- lm(
  mpg ~ . - name - cylinders + log(weight) + log(acceleration) +
    sqrt(displacement),
  data = Auto
)
summary(mpg_poly2)
##
## Call:
## lm(formula = mpg ~ . - name - cylinders + log(weight) + log(acceleration) +
## sqrt(displacement), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2104 -1.6665 -0.1085 1.5977 12.5231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 290.113140 49.599189 5.849 1.06e-08 ***
## displacement 0.032477 0.028592 1.136 0.256720
## horsepower -0.043782 0.013065 -3.351 0.000885 ***
## weight 0.006923 0.002226 3.110 0.002013 **
## acceleration 2.001283 0.468834 4.269 2.48e-05 ***
## year 0.801707 0.044950 17.836 < 2e-16 ***
## origin 0.502973 0.262462 1.916 0.056064 .
## log(weight) -34.848861 6.862200 -5.078 5.96e-07 ***
## log(acceleration) -33.152402 7.671145 -4.322 1.98e-05 ***
## sqrt(displacement) -1.043089 0.820337 -1.272 0.204311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.914 on 382 degrees of freedom
## Multiple R-squared: 0.8639, Adjusted R-squared: 0.8607
## F-statistic: 269.3 on 9 and 382 DF, p-value: < 2.2e-16
#End Question9
#Beginning Question10
library(ISLR)
#(A.)
# Regress Sales on Price and the two Yes/No factors Urban and US.
carseats_lm <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(carseats_lm)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#(B.)
# The intercept represents the average number of car seats sold when all predictors are at their baseline (Price = 0, Urban = No, US = No).
# The Price coefficient is negative, so sales will fall by roughly 54 seats (0.054 x 1000) for every unit ($1) increase in price.
# The Urban=Yes coefficient is not statistically significant. The US=Yes coefficient is 1.2, and this means an average increase in car seat sales of 1200 units when US=Yes.
#(C.)
# $\text{Sales} = 13.04 - 0.05 \cdot \text{Price} - 0.02 \cdot \text{Urban} + 1.20 \cdot \text{US}$, where Urban and US are indicator variables (Yes: 1, No: 0).
#(D.)
# Regress Sales on every other column of Carseats.
carseats_all_lm <- lm(Sales ~ ., data = Carseats)
summary(carseats_all_lm)
##
## Call:
## lm(formula = Sales ~ ., data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8692 -0.6908 0.0211 0.6636 3.4115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6606231 0.6034487 9.380 < 2e-16 ***
## CompPrice 0.0928153 0.0041477 22.378 < 2e-16 ***
## Income 0.0158028 0.0018451 8.565 2.58e-16 ***
## Advertising 0.1230951 0.0111237 11.066 < 2e-16 ***
## Population 0.0002079 0.0003705 0.561 0.575
## Price -0.0953579 0.0026711 -35.700 < 2e-16 ***
## ShelveLocGood 4.8501827 0.1531100 31.678 < 2e-16 ***
## ShelveLocMedium 1.9567148 0.1261056 15.516 < 2e-16 ***
## Age -0.0460452 0.0031817 -14.472 < 2e-16 ***
## Education -0.0211018 0.0197205 -1.070 0.285
## UrbanYes 0.1228864 0.1129761 1.088 0.277
## USYes -0.1840928 0.1498423 -1.229 0.220
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 388 degrees of freedom
## Multiple R-squared: 0.8734, Adjusted R-squared: 0.8698
## F-statistic: 243.4 on 11 and 388 DF, p-value: < 2.2e-16
# Null hypothesis can be rejected for CompPrice, Income, Advertising, Price, ShelveLocGood, ShelveLocMedium and Age.
#(E.)
# Reduced model: all columns except Education, Urban, US and Population.
carseats_all_lm2 <- lm(
  Sales ~ . - Education - Urban - US - Population,
  data = Carseats
)
summary(carseats_all_lm2)
##
## Call:
## lm(formula = Sales ~ . - Education - Urban - US - Population,
## data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7728 -0.6954 0.0282 0.6732 3.3292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.475226 0.505005 10.84 <2e-16 ***
## CompPrice 0.092571 0.004123 22.45 <2e-16 ***
## Income 0.015785 0.001838 8.59 <2e-16 ***
## Advertising 0.115903 0.007724 15.01 <2e-16 ***
## Price -0.095319 0.002670 -35.70 <2e-16 ***
## ShelveLocGood 4.835675 0.152499 31.71 <2e-16 ***
## ShelveLocMedium 1.951993 0.125375 15.57 <2e-16 ***
## Age -0.046128 0.003177 -14.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 392 degrees of freedom
## Multiple R-squared: 0.872, Adjusted R-squared: 0.8697
## F-statistic: 381.4 on 7 and 392 DF, p-value: < 2.2e-16
#(F.)
# The RSE goes down from 2.47 in model (a) to 1.02 in model (e). The R^2 statistic goes up from 0.24 (a) to 0.872 (e) and the F-statistic goes up from 41.52 to 381.4.
# The statistical evidence clearly shows that (e) is a much better fit.
#(G.)
# 95% confidence intervals for the coefficients of the reduced model.
confint(carseats_all_lm2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.48236820 6.46808427
## CompPrice 0.08446498 0.10067795
## Income 0.01217210 0.01939784
## Advertising 0.10071856 0.13108825
## Price -0.10056844 -0.09006946
## ShelveLocGood 4.53585700 5.13549250
## ShelveLocMedium 1.70550103 2.19848429
## Age -0.05237301 -0.03988204
#(H.)
# Diagnostic plots for the reduced model on a 2x2 grid. par() returns the
# previous settings; they are restored afterwards so the grid layout does not
# leak into plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(carseats_all_lm2)
par(old_par)

# The residuals vs. fitted values chart doesn't show any distinct shape, so the model appears to be a good fit to the data.
#End Question10
#Beginning Question12
#(A.)
# For regression of y onto x: $\hat{\beta} = \sum_{i=1}^{n} x_i y_i / \sum_{i'=1}^{n} x_{i'}^2$, and for regression of x onto y: $\hat{\beta}' = \sum_{i=1}^{n} x_i y_i / \sum_{i'=1}^{n} y_{i'}^2$.
# The coefficients are equal when the denominators are the same: $\sum_{i'=1}^{n} x_{i'}^2 = \sum_{i'=1}^{n} y_{i'}^2$.
#(B.)
# Simulate 100 points with y roughly 2 * x plus standard normal noise, then
# regress y onto x without an intercept (the "+ 0" term removes it).
set.seed(1)
x <- rnorm(100)
y <- 2 * x + rnorm(100)
lm.fit <- lm(y ~ x + 0)
summary(lm.fit)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9154 -0.6472 -0.1771 0.5056 2.3109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.9939 0.1065 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
# Reverse regression: x onto y, again without an intercept.
lm.fit2 <- lm(x ~ y + 0)
summary(lm.fit2)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699 -0.2368 0.1030 0.2858 0.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.39111 0.02089 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
#(C.)
# y1 is a random permutation of x1, so both vectors have the same sum of
# squares; the two no-intercept slopes below therefore coincide.
set.seed(1)
x1 <- rnorm(100)
y1 <- sample(x1)
lm.fit4 <- lm(x1 ~ y1 + 0)
lm.fit5 <- lm(y1 ~ x1 + 0)
# Slope of x1 onto y1.
coef(lm.fit4)[[1]]
## [1] -0.07767695
# Slope of y1 onto x1.
coef(lm.fit5)[[1]]
## [1] -0.07767695
#End Question12