#Question 2: Carefully explain the differences between the KNN classifier and KNN regression methods.

# KNN (K-nearest neighbors) identifies the K points in the training data that are closest to a query point x of our choice. KNN classification then predicts the class to which the observation belongs, typically by majority vote among those K neighbors. It is used when the response is qualitative.

# On the other hand, KNN regression is a non-parametric method that predicts the value of the output variable by averaging the responses of the observations in the same neighborhood. KNN regression is used when the dependent variable is quantitative.
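# A minimal sketch of both methods on made-up data (an illustration, not
# part of the original answer; assumes the `class` and `FNN` packages are
# installed):
set.seed(1)
train_x  <- matrix(rnorm(40), ncol = 2)  # 20 training points, 2 features
test_x   <- matrix(rnorm(10), ncol = 2)  # 5 query points
train_cl <- factor(sample(c("A", "B"), 20, replace = TRUE))
train_y  <- rnorm(20)

# Classification: majority vote among the k = 3 nearest neighbors
class::knn(train_x, test_x, cl = train_cl, k = 3)

# Regression: average the k = 3 nearest neighbors' responses
FNN::knn.reg(train_x, test_x, y = train_y, k = 3)$pred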

#Question 9: This question involves the use of multiple linear regression on the Auto data set.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(ISLR)
library(corrplot)
## corrplot 0.92 loaded
Auto <- read_csv("Auto.csv")
## Rows: 397 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): horsepower, name
## dbl (7): mpg, cylinders, displacement, weight, acceleration, year, origin
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Auto)

#Take a look at the data
str(Auto)
## spc_tbl_ [397 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ mpg         : num [1:397] 18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num [1:397] 8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num [1:397] 307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : chr [1:397] "130" "165" "150" "150" ...
##  $ weight      : num [1:397] 3504 3693 3436 3433 3449 ...
##  $ acceleration: num [1:397] 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num [1:397] 70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num [1:397] 1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr [1:397] "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   mpg = col_double(),
##   ..   cylinders = col_double(),
##   ..   displacement = col_double(),
##   ..   horsepower = col_character(),
##   ..   weight = col_double(),
##   ..   acceleration = col_double(),
##   ..   year = col_double(),
##   ..   origin = col_double(),
##   ..   name = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
#Change horsepower into numeric (it was read as character because missing values are coded "?")
na.omit(Auto$horsepower)
##   [1] "130" "165" "150" "150" "140" "198" "220" "215" "225" "190" "170" "160"
##  [13] "150" "225" "95"  "95"  "97"  "85"  "88"  "46"  "87"  "90"  "95"  "113"
##  [25] "90"  "215" "200" "210" "193" "88"  "90"  "95"  "?"   "100" "105" "100"
##  [37] "88"  "100" "165" "175" "153" "150" "180" "170" "175" "110" "72"  "100"
##  [49] "88"  "86"  "90"  "70"  "76"  "65"  "69"  "60"  "70"  "95"  "80"  "54" 
##  [61] "90"  "86"  "165" "175" "150" "153" "150" "208" "155" "160" "190" "97" 
##  [73] "150" "130" "140" "150" "112" "76"  "87"  "69"  "86"  "92"  "97"  "80" 
##  [85] "88"  "175" "150" "145" "137" "150" "198" "150" "158" "150" "215" "225"
##  [97] "175" "105" "100" "100" "88"  "95"  "46"  "150" "167" "170" "180" "100"
## [109] "88"  "72"  "94"  "90"  "85"  "107" "90"  "145" "230" "49"  "75"  "91" 
## [121] "112" "150" "110" "122" "180" "95"  "?"   "100" "100" "67"  "80"  "65" 
## [133] "75"  "100" "110" "105" "140" "150" "150" "140" "150" "83"  "67"  "78" 
## [145] "52"  "61"  "75"  "75"  "75"  "97"  "93"  "67"  "95"  "105" "72"  "72" 
## [157] "170" "145" "150" "148" "110" "105" "110" "95"  "110" "110" "129" "75" 
## [169] "83"  "100" "78"  "96"  "71"  "97"  "97"  "70"  "90"  "95"  "88"  "98" 
## [181] "115" "53"  "86"  "81"  "92"  "79"  "83"  "140" "150" "120" "152" "100"
## [193] "105" "81"  "90"  "52"  "60"  "70"  "53"  "100" "78"  "110" "95"  "71" 
## [205] "70"  "75"  "72"  "102" "150" "88"  "108" "120" "180" "145" "130" "150"
## [217] "68"  "80"  "58"  "96"  "70"  "145" "110" "145" "130" "110" "105" "100"
## [229] "98"  "180" "170" "190" "149" "78"  "88"  "75"  "89"  "63"  "83"  "67" 
## [241] "78"  "97"  "110" "110" "48"  "66"  "52"  "70"  "60"  "110" "140" "139"
## [253] "105" "95"  "85"  "88"  "100" "90"  "105" "85"  "110" "120" "145" "165"
## [265] "139" "140" "68"  "95"  "97"  "75"  "95"  "105" "85"  "97"  "103" "125"
## [277] "115" "133" "71"  "68"  "115" "85"  "88"  "90"  "110" "130" "129" "138"
## [289] "135" "155" "142" "125" "150" "71"  "65"  "80"  "80"  "77"  "125" "71" 
## [301] "90"  "70"  "70"  "65"  "69"  "90"  "115" "115" "90"  "76"  "60"  "70" 
## [313] "65"  "90"  "88"  "90"  "90"  "78"  "90"  "75"  "92"  "75"  "65"  "105"
## [325] "65"  "48"  "48"  "67"  "67"  "67"  "?"   "67"  "62"  "132" "100" "88" 
## [337] "?"   "72"  "84"  "84"  "92"  "110" "84"  "58"  "64"  "60"  "67"  "65" 
## [349] "62"  "68"  "63"  "65"  "65"  "74"  "?"   "75"  "75"  "100" "74"  "80" 
## [361] "76"  "116" "120" "110" "105" "88"  "85"  "88"  "88"  "88"  "85"  "84" 
## [373] "90"  "92"  "74"  "68"  "68"  "63"  "70"  "88"  "75"  "70"  "67"  "67" 
## [385] "67"  "110" "85"  "92"  "112" "96"  "84"  "90"  "86"  "52"  "84"  "79" 
## [397] "82"
Auto$horsepower <- as.numeric(Auto$horsepower)
## Warning: NAs introduced by coercion
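# A cleaner alternative (a sketch, not what was run above): tell read_csv up
# front that "?" codes a missing value, so horsepower is parsed as numeric:
# Auto <- read_csv("Auto.csv", na = c("", "NA", "?"))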
str(Auto)
## spc_tbl_ [397 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ mpg         : num [1:397] 18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num [1:397] 8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num [1:397] 307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num [1:397] 130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num [1:397] 3504 3693 3436 3433 3449 ...
##  $ acceleration: num [1:397] 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num [1:397] 70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num [1:397] 1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr [1:397] "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   mpg = col_double(),
##   ..   cylinders = col_double(),
##   ..   displacement = col_double(),
##   ..   horsepower = col_character(),
##   ..   weight = col_double(),
##   ..   acceleration = col_double(),
##   ..   year = col_double(),
##   ..   origin = col_double(),
##   ..   name = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

(a) Produce a scatterplot matrix which includes all of the variables in the data set.

plot(Auto)
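# Note: plot() on a data frame dispatches to pairs(), so pairs(Auto) gives
# the same scatterplot matrix.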

(b) Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.

#create a numeric data set from Auto
Auto_num <- select_if(Auto, is.numeric)

#Check for correlation
M = cor(Auto_num) 
corrplot(M, method = "number")

(c) Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Use the summary() function to print the results. Comment on the output. For instance:

lm.fit = lm(mpg ~ . - name, data = Auto) #regress mpg on all variables except name
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

i. Is there a relationship between the predictors and the response?

# Since the p-value of the F-statistic is far below 0.05 (p-value < 2.2e-16), we can reject the null hypothesis and conclude that there is a relationship between the predictors and the response, mpg.

ii. Which predictors appear to have a statistically significant relationship to the response?

# Displacement, weight, year, and origin have a statistically significant relationship to mpg (their p-values are very small)

iii. What does the coefficient for the year variable suggest?

# It suggests a positive relationship between year and mpg: holding the other variables constant, mpg increases by about 0.75 per model year (roughly 7.5 mpg over a decade)

(d) Use the plot() function to produce diagnostic plots of the linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers? Does the leverage plot identify any observations with unusually high leverage?

par(mfrow=c(2,2))
plot(lm.fit)

###The residual plot does not follow a straight line and has a U-shape, which suggests the relationship is not linear. It also suggests heteroscedasticity, since the variance of the residuals is not constant around the line.

###In the normal Q-Q plot, the residuals deviate from the line at the tails, so they are not perfectly normally distributed.

###In the Scale-Location plot, all values lie between 0 and 2, so there appear to be no outliers (standardized residuals would need to exceed about 3 in absolute value). Leverage is easier to judge numerically; see the sketch below.
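# A quick numeric check of the same diagnostics (a sketch using the usual
# rules of thumb, not part of the assignment output):
sum(abs(rstudent(lm.fit)) > 3)                    # observations with |studentized residual| > 3
head(sort(hatvalues(lm.fit), decreasing = TRUE))  # the largest leverage values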

(e) Use the * and : symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?

y_inter <- lm(mpg ~ . - name + displacement*horsepower + origin*horsepower, data = Auto)
summary(y_inter)
## 
## Call:
## lm(formula = mpg ~ . - name + displacement * horsepower + origin * 
##     horsepower, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7222 -1.5251 -0.0968  1.3553 12.8419 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -4.706e+00  4.686e+00  -1.004   0.3159    
## cylinders                5.142e-01  3.139e-01   1.638   0.1022    
## displacement            -6.970e-02  1.143e-02  -6.098 2.63e-09 ***
## horsepower              -1.540e-01  3.547e-02  -4.342 1.81e-05 ***
## weight                  -3.084e-03  6.478e-04  -4.761 2.73e-06 ***
## acceleration            -2.276e-01  9.099e-02  -2.501   0.0128 *  
## year                     7.349e-01  4.460e-02  16.478  < 2e-16 ***
## origin                   2.281e+00  1.090e+00   2.092   0.0371 *  
## displacement:horsepower  4.665e-04  6.127e-05   7.614 2.10e-13 ***
## horsepower:origin       -1.918e-02  1.278e-02  -1.500   0.1343    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.908 on 382 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8644, Adjusted R-squared:  0.8612 
## F-statistic: 270.6 on 9 and 382 DF,  p-value: < 2.2e-16
###After trying many interaction effects, only the displacement:horsepower interaction is statistically significant (p = 2.10e-13); horsepower:origin is not (p = 0.134). See the nested-model comparison below.
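# One way to confirm that the interaction terms add explanatory power is a
# nested-model comparison (a sketch; both models drop the same 5 rows with
# missing horsepower):
anova(lm.fit, y_inter)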

(f) Try a few different transformations of the variables, such as log(X), √X, X^2. Comment on your findings.

# Try a log transformation for horsepower
summary(lm(mpg ~ . -name + log(horsepower), data=Auto))
## 
## Call:
## lm(formula = mpg ~ . - name + log(horsepower), data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5777 -1.6623 -0.1213  1.4913 12.0230 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.674e+01  1.106e+01   7.839 4.54e-14 ***
## cylinders       -5.530e-02  2.907e-01  -0.190 0.849230    
## displacement    -4.607e-03  7.108e-03  -0.648 0.517291    
## horsepower       1.764e-01  2.269e-02   7.775 7.05e-14 ***
## weight          -3.366e-03  6.561e-04  -5.130 4.62e-07 ***
## acceleration    -3.277e-01  9.670e-02  -3.388 0.000776 ***
## year             7.421e-01  4.534e-02  16.368  < 2e-16 ***
## origin           8.976e-01  2.528e-01   3.551 0.000432 ***
## log(horsepower) -2.685e+01  2.652e+00 -10.127  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.959 on 383 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8592, Adjusted R-squared:  0.8562 
## F-statistic: 292.1 on 8 and 383 DF,  p-value: < 2.2e-16
### log(horsepower) is highly significant in the model (note that the formula still includes horsepower itself through the "."). Other transformations are sketched below.
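# Other transformations from the question can be tried the same way (a
# sketch, output omitted); I() protects arithmetic inside a formula:
summary(lm(mpg ~ . - name + sqrt(horsepower), data = Auto))
summary(lm(mpg ~ . - name + I(horsepower^2), data = Auto))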

#Question 10: This question should be answered using the Carseats data set.

#Look at the structure of the data and check the numeric variables for multicollinearity
str(Carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : num  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : num  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: num  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : num  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : num  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : num  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : num  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
Carseats_num <- select_if(Carseats, is.numeric)
C = cor(Carseats_num) 
corrplot(C, method = "number")
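# Variance inflation factors are a more direct multicollinearity check (a
# sketch; assumes the `car` package is installed):
# car::vif(lm(Sales ~ ., data = Carseats_num))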

(a) Fit a multiple regression model to predict Sales using Price, Urban, and US.

#Linear regression model
lm_Carseat=lm(Sales~Price+Urban+US,data=Carseats)
summary(lm_Carseat)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

(b) Provide an interpretation of each coefficient in the model. Be careful—some of the variables in the model are qualitative!

### Price is negatively correlated with Sales: a one-unit increase in Price decreases Sales by about 0.054 units, holding the other predictors fixed

### Sales are not significantly affected by whether or not the store is in an urban location, since UrbanYes is not significant (p-value > 0.05)

### A store in the US sells about 1.2 more units than a comparable store outside the US

(c) Write out the model in equation form, being careful to handle the qualitative variables properly.

###Sales = 13.043469 - 0.054459*(Price) - 0.021916*(Urban) + 1.200573*(US)

### Urban: 1 if Urban is Yes, 0 if No
### US: 1 if US is Yes, 0 if No
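# To see the 0/1 dummy coding R actually uses for Urban and US (a sketch):
head(model.matrix(lm_Carseat))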

(d) For which of the predictors can you reject the null hypothesis H0 : βj = 0?

### We can reject the null hypothesis for the Price and US predictors, since their p-values are well below 0.05

(e) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.

#Linear regression model for Price and US
lm_Carseat2=lm(Sales~Price+US,data=Carseats) 
summary(lm_Carseat2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

(f) How well do the models in (a) and (e) fit the data?

### The models in (a) and (e) have Adjusted R-squared values of 0.2335 and 0.2354, respectively. These are low: only about 23% of the variance in Sales is explained by the predictors, so neither model fits the data well

(g) Using the model from (e), obtain 95 % confidence intervals for the coefficient(s).

confint(lm_Carseat2)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
### We are 95% confident that the coefficient on Price lies in (-0.065, -0.044) and that the coefficient on USYes lies in (0.6915, 1.7078)
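# Related sketch: a 95% confidence interval for mean Sales at an
# illustrative Price/US combination (the values here are made up):
predict(lm_Carseat2, newdata = data.frame(Price = 120, US = "Yes"),
        interval = "confidence")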

(h) Is there evidence of outliers or high leverage observations in the model from (e)?

par(mfrow=c(2,2))
plot(lm_Carseat2)

### Based on the Normal Q-Q plot, there is no evidence of outliers. There is also no evidence of high-leverage observations in the Residuals vs Leverage plot

#Question 12: This problem involves simple linear regression without an intercept.

#(a) Recall that the coefficient estimate β̂ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?

# From equation (3.38), the regression of Y onto X gives β̂ = sum(xi*yi) / sum(xi^2), while the regression of X onto Y gives sum(xi*yi) / sum(yi^2). The two coefficient estimates are therefore the same exactly when sum(xi^2) = sum(yi^2)

(b) Generate an example in R with n = 100 observations in which the coefficient estimate for the regression of X onto Y is different from the coefficient estimate for the regression of Y onto X.

x <- c(1:100)
y <- rbinom(100, 1, 0.05) # random 0/1 draws, so sum(y^2) != sum(x^2); no seed was set, so a rerun gives different numbers
eq1 <- lm(y ~ x + 0)
eq2 <- lm(x ~ y + 0)
summary(eq1)
## 
## Call:
## lm(formula = y ~ x + 0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.07950 -0.05824 -0.03856 -0.01729  0.99046 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)  
## x 0.000795   0.000378   2.103    0.038 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2199 on 99 degrees of freedom
## Multiple R-squared:  0.04277,    Adjusted R-squared:  0.0331 
## F-statistic: 4.424 on 1 and 99 DF,  p-value: 0.03798
summary(eq2)
## 
## Call:
## lm(formula = x ~ y + 0)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.80  23.75  48.50  73.25 100.00 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)  
## y    53.80      25.58   2.103    0.038 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.2 on 99 degrees of freedom
## Multiple R-squared:  0.04277,    Adjusted R-squared:  0.0331 
## F-statistic: 4.424 on 1 and 99 DF,  p-value: 0.03798
#We can see that the coefficients for the two regressions are different
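#The closed-form estimate from (3.38) reproduces lm()'s slopes:
sum(x * y) / sum(x^2)  # matches coef(eq1)
sum(x * y) / sum(y^2)  # matches coef(eq2)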

(c) Generate an example in R with n = 100 observations in which the coefficient estimate for the regression of X onto Y is the same as the coefficient estimate for the regression of Y onto X.

x2 <- c(1:100)
y2 <- c(100:1) # the same values in reverse order, so sum(x2^2) == sum(y2^2)
eq3 <- lm(y2 ~ x2 + 0)
eq4 <- lm(x2 ~ y2 + 0)
summary(eq3)
## 
## Call:
## lm(formula = y2 ~ x2 + 0)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.75 -12.44  24.87  62.18  99.49 
## 
## Coefficients:
##    Estimate Std. Error t value Pr(>|t|)    
## x2   0.5075     0.0866    5.86 6.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared:  0.2575, Adjusted R-squared:   0.25 
## F-statistic: 34.34 on 1 and 99 DF,  p-value: 6.094e-08
summary(eq4)
## 
## Call:
## lm(formula = x2 ~ y2 + 0)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49.75 -12.44  24.87  62.18  99.49 
## 
## Coefficients:
##    Estimate Std. Error t value Pr(>|t|)    
## y2   0.5075     0.0866    5.86 6.09e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared:  0.2575, Adjusted R-squared:   0.25 
## F-statistic: 34.34 on 1 and 99 DF,  p-value: 6.094e-08
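#Check: the condition from (a) holds for this example, so the slopes coincide
sum(x2^2) == sum(y2^2)  # TRUE
coef(eq3); coef(eq4)    # both 0.5075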