Assignment 2

K-nearest neighbor is a non-parametric method that can be used for classification or regression. In the case of classification, a K-nearest neighbor classifier identifies the K points nearest to the observation. Then, it estimates the probability of the observation belonging to a specific class. Thus, the K-nearest neighbor classifier provides a qualitative response. KNN regression is similar to classification in that it also uses the K points nearest to the observation. However, unlike the K-nearest neighbor classifier, it provides a quantitative prediction.

#Beginning Question9

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.6.3

#(A.)
# Scatterplot matrix of the first eight columns of Auto.
pairs(Auto[, seq_len(8)])

#(B.)
# Pairwise correlation matrix of the first eight columns of Auto.
cor(Auto[, seq_len(8)])

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

#(C.)
# Regress mpg on every column of Auto except name, then show the fit summary.
mpg_all <- lm(mpg ~ . - name, data = Auto)
summary(mpg_all)

## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

#i. The F-statistic is 252.4 with a p-value < 2.2e-16. This shows significant evidence of a relationship between the predictors and mpg.

#ii. Displacement, weight, year and origin are statistically significant, as their p-values are below 0.05 or near zero.

#iii. The coefficient for the 'year' predictor is 0.75, which suggests that, holding the other predictors fixed, each additional model year raises a vehicle's predicted mpg by 0.75.

#(D.)
# Use a 2 x 2 plotting grid, then draw the plots for the mpg_all fit.
par(mfrow = rep(2, 2))
plot(mpg_all)

# List observations whose studentized residual exceeds 3 in absolute value.
# Using abs() flags outliers in both directions (the one-sided `> 3` test
# would miss large negative residuals); the residuals are computed once and
# reused. which() keeps any NA residuals out of the result.
mpg_rstudent <- rstudent(mpg_all)
mpg_rstudent[which(abs(mpg_rstudent) > 3)]

##      245      323      326      327 
## 3.390068 4.029537 3.494823 3.690246

library(car)

## Warning: package 'car' was built under R version 3.6.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.6.3

# Variance inflation factor for each predictor in mpg_all (car package).
car::vif(mpg_all)

##    cylinders displacement   horsepower       weight acceleration         year 
##    10.737535    21.836792     9.943693    10.831260     2.625806     1.244952 
##       origin 
##     1.772386

#(E.)
# Same predictors as the full model, plus the year:cylinders and
# acceleration:horsepower interaction terms.
mpg_interaction <- lm(
  mpg ~ . - name + year:cylinders + acceleration:horsepower,
  data = Auto
)
summary(mpg_interaction)

## 
## Call:
## lm(formula = mpg ~ . - name + year:cylinders + acceleration:horsepower, 
##     data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0203 -1.7318 -0.1015  1.5639 11.9559 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -8.862e+01  1.212e+01  -7.311 1.57e-12 ***
## cylinders                1.181e+01  2.349e+00   5.029 7.61e-07 ***
## displacement            -8.775e-03  7.916e-03  -1.108  0.26838    
## horsepower               9.151e-02  2.502e-02   3.658  0.00029 ***
## weight                  -4.269e-03  6.967e-04  -6.127 2.23e-09 ***
## acceleration             8.439e-01  1.590e-01   5.306 1.90e-07 ***
## year                     1.521e+00  1.590e-01   9.569  < 2e-16 ***
## origin                   1.070e+00  2.609e-01   4.102 5.00e-05 ***
## cylinders:year          -1.520e-01  3.017e-02  -5.037 7.32e-07 ***
## horsepower:acceleration -9.837e-03  1.778e-03  -5.533 5.84e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.049 on 382 degrees of freedom
## Multiple R-squared:  0.8509, Adjusted R-squared:  0.8474 
## F-statistic: 242.2 on 9 and 382 DF,  p-value: < 2.2e-16

# cylinders:year and horsepower:acceleration are statistically significant. The R^2 has increased from 0.82 to 0.85.

# Full model plus the year:cylinders interaction and squared terms for
# horsepower and acceleration.
mpg_poly <- lm(
  mpg ~ . - name + year:cylinders + I(horsepower^2) + I(acceleration^2),
  data = Auto
)
summary(mpg_poly)

## 
## Call:
## lm(formula = mpg ~ . - name + year:cylinders + I(horsepower^2) + 
##     I(acceleration^2), data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9986 -1.5525 -0.1194  1.4348 11.7722 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -3.394e+01  1.430e+01  -2.374 0.018075 *  
## cylinders          8.481e+00  2.340e+00   3.624 0.000329 ***
## displacement      -1.106e-02  7.330e-03  -1.509 0.132051    
## horsepower        -2.720e-01  3.531e-02  -7.703 1.16e-13 ***
## weight            -3.338e-03  6.812e-04  -4.900 1.42e-06 ***
## acceleration      -1.378e+00  5.421e-01  -2.542 0.011403 *  
## year               1.272e+00  1.594e-01   7.982 1.71e-14 ***
## origin             1.027e+00  2.493e-01   4.121 4.63e-05 ***
## I(horsepower^2)    8.040e-04  1.140e-04   7.054 8.22e-12 ***
## I(acceleration^2)  3.351e-02  1.578e-02   2.124 0.034303 *  
## cylinders:year    -1.056e-01  3.023e-02  -3.493 0.000533 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.935 on 381 degrees of freedom
## Multiple R-squared:  0.8622, Adjusted R-squared:  0.8586 
## F-statistic: 238.3 on 10 and 381 DF,  p-value: < 2.2e-16

# Drop name and cylinders, and add log(weight), log(acceleration) and
# sqrt(displacement) as extra transformed predictors.
mpg_poly2 <- lm(
  mpg ~ . - name - cylinders + log(weight) + log(acceleration) +
    sqrt(displacement),
  data = Auto
)
summary(mpg_poly2)

## 
## Call:
## lm(formula = mpg ~ . - name - cylinders + log(weight) + log(acceleration) + 
##     sqrt(displacement), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2104  -1.6665  -0.1085   1.5977  12.5231 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        290.113140  49.599189   5.849 1.06e-08 ***
## displacement         0.032477   0.028592   1.136 0.256720    
## horsepower          -0.043782   0.013065  -3.351 0.000885 ***
## weight               0.006923   0.002226   3.110 0.002013 ** 
## acceleration         2.001283   0.468834   4.269 2.48e-05 ***
## year                 0.801707   0.044950  17.836  < 2e-16 ***
## origin               0.502973   0.262462   1.916 0.056064 .  
## log(weight)        -34.848861   6.862200  -5.078 5.96e-07 ***
## log(acceleration)  -33.152402   7.671145  -4.322 1.98e-05 ***
## sqrt(displacement)  -1.043089   0.820337  -1.272 0.204311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.914 on 382 degrees of freedom
## Multiple R-squared:  0.8639, Adjusted R-squared:  0.8607 
## F-statistic: 269.3 on 9 and 382 DF,  p-value: < 2.2e-16

#End Question9

#Beginning Question10

library(ISLR)

#(A.)
# Regress Sales on Price, Urban and US using the Carseats data.
carseats_lm <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(carseats_lm)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

#(B.)

# The intercept (13.04) is the predicted average sales, in thousands of car seats, when Price is 0 and the store is neither urban (Urban=No) nor in the US (US=No).
# The Price coefficient is negative, so sales will fall by roughly 54 seats (0.054 x 1000) for every unit ($1) increase in price.
# The Urban=Yes coefficient is not statistically significant. The US=Yes coefficient is 1.2, which means an average increase in car seat sales of 1200 units when US=Yes compared with US=No.

#(C.)

# $\widehat{Sales} = 13.04 - 0.05 \cdot Price - 0.02 \cdot Urban + 1.20 \cdot US$, where Urban and US are 1 for Yes and 0 for No.

#(D.)
# Regress Sales on all remaining columns of Carseats.
carseats_all_lm <- lm(Sales ~ ., data = Carseats)
summary(carseats_all_lm)

## 
## Call:
## lm(formula = Sales ~ ., data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8692 -0.6908  0.0211  0.6636  3.4115 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.6606231  0.6034487   9.380  < 2e-16 ***
## CompPrice        0.0928153  0.0041477  22.378  < 2e-16 ***
## Income           0.0158028  0.0018451   8.565 2.58e-16 ***
## Advertising      0.1230951  0.0111237  11.066  < 2e-16 ***
## Population       0.0002079  0.0003705   0.561    0.575    
## Price           -0.0953579  0.0026711 -35.700  < 2e-16 ***
## ShelveLocGood    4.8501827  0.1531100  31.678  < 2e-16 ***
## ShelveLocMedium  1.9567148  0.1261056  15.516  < 2e-16 ***
## Age             -0.0460452  0.0031817 -14.472  < 2e-16 ***
## Education       -0.0211018  0.0197205  -1.070    0.285    
## UrbanYes         0.1228864  0.1129761   1.088    0.277    
## USYes           -0.1840928  0.1498423  -1.229    0.220    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.019 on 388 degrees of freedom
## Multiple R-squared:  0.8734, Adjusted R-squared:  0.8698 
## F-statistic: 243.4 on 11 and 388 DF,  p-value: < 2.2e-16

# The null hypothesis can be rejected for CompPrice, Income, Advertising, Price, ShelveLocGood, ShelveLocMedium and Age.

#(E.)

# Refit the full Carseats model without Education, Urban, US and Population.
carseats_all_lm2 <- lm(
  Sales ~ . - Education - Urban - US - Population,
  data = Carseats
)
summary(carseats_all_lm2)

## 
## Call:
## lm(formula = Sales ~ . - Education - Urban - US - Population, 
##     data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7728 -0.6954  0.0282  0.6732  3.3292 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.475226   0.505005   10.84   <2e-16 ***
## CompPrice        0.092571   0.004123   22.45   <2e-16 ***
## Income           0.015785   0.001838    8.59   <2e-16 ***
## Advertising      0.115903   0.007724   15.01   <2e-16 ***
## Price           -0.095319   0.002670  -35.70   <2e-16 ***
## ShelveLocGood    4.835675   0.152499   31.71   <2e-16 ***
## ShelveLocMedium  1.951993   0.125375   15.57   <2e-16 ***
## Age             -0.046128   0.003177  -14.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.019 on 392 degrees of freedom
## Multiple R-squared:  0.872,  Adjusted R-squared:  0.8697 
## F-statistic: 381.4 on 7 and 392 DF,  p-value: < 2.2e-16

#(F.)

# The RSE goes down from 2.47 in model (a) to 1.02 in model (e). The R^2 statistic goes up from 0.24 (a) to 0.872 (e), and the F-statistic goes up from 41.52 to 381.4.
# The statistical evidence clearly shows that (e) is a much better fit.

#(G.)
# 95% confidence intervals for the coefficients of the reduced model.
confint(carseats_all_lm2, level = 0.95)

##                       2.5 %      97.5 %
## (Intercept)      4.48236820  6.46808427
## CompPrice        0.08446498  0.10067795
## Income           0.01217210  0.01939784
## Advertising      0.10071856  0.13108825
## Price           -0.10056844 -0.09006946
## ShelveLocGood    4.53585700  5.13549250
## ShelveLocMedium  1.70550103  2.19848429
## Age             -0.05237301 -0.03988204

#(H.)
# Use a 2 x 2 plotting grid, then draw the plots for the reduced model.
par(mfrow = rep(2, 2))
plot(carseats_all_lm2)

# The residuals vs. fitted values plot doesn't show any distinct shape, so the model appears to be a good fit to the data.

#End Question10

#Beginning Question12

#(A.)
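# For the regression of y onto x (no intercept): $\hat{\beta} = \sum_{i=1}^{n} x_i y_i \,/\, \sum_{i'=1}^{n} x_{i'}^2$,
# and for the regression of x onto y: $\hat{\beta}' = \sum_{i=1}^{n} x_i y_i \,/\, \sum_{i'=1}^{n} y_{i'}^2$.
# The numerators are identical, so the coefficients are equal when the denominators are the same:
# $\sum_{i'=1}^{n} x_{i'}^2 = \sum_{i'=1}^{n} y_{i'}^2$.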

#(B.)
# Simulate 100 standard-normal x values and y = 2x + standard-normal noise.
# The seed is fixed so the draws are reproducible.
set.seed(1)
x <- rnorm(100)
y <- 2 * x + rnorm(100)

# Regress y onto x with the intercept removed (the `+ 0` term).
lm.fit <- lm(y ~ x + 0)
summary(lm.fit)

## 
## Call:
## lm(formula = y ~ x + 0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9154 -0.6472 -0.1771  0.5056  2.3109 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## x   1.9939     0.1065   18.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared:  0.7798, Adjusted R-squared:  0.7776 
## F-statistic: 350.7 on 1 and 99 DF,  p-value: < 2.2e-16

# Reverse regression: x onto y, again with the intercept removed.
lm.fit2 <- lm(x ~ y + 0)
summary(lm.fit2)

## 
## Call:
## lm(formula = x ~ y + 0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8699 -0.2368  0.1030  0.2858  0.8938 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## y  0.39111    0.02089   18.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared:  0.7798, Adjusted R-squared:  0.7776 
## F-statistic: 350.7 on 1 and 99 DF,  p-value: < 2.2e-16

#(C.)
# y1 is a random permutation of x1, so both vectors contain the same values
# and therefore have the same sum of squares.
set.seed(1)
x1 <- rnorm(100)
y1 <- sample(x1)

# No-intercept regressions in both directions.
lm.fit4 <- lm(x1 ~ y1 + 0)
lm.fit5 <- lm(y1 ~ x1 + 0)

# Slope estimate of x1 onto y1.
coef(summary(lm.fit4))[1, "Estimate"]

## [1] -0.07767695

# Slope estimate of y1 onto x1.
coef(summary(lm.fit5))[1, "Estimate"]

## [1] -0.07767695

#End Question12

Assignment 2

Josh Gauntt

1/29/2021