#Question 2: Carefully explain the differences between the KNN classifier and KNN regression methods.
# The KNN (K-nearest neighbors) method identifies the K points in the training data set that are closest to a chosen point x. The KNN classifier then predicts the class of the output variable at x by assigning the class that is most common among those K neighbors. It is used when the response is qualitative.
# On the other hand, KNN regression is a non-parametric method that tries to predict the value of the output variable by averaging the responses of the observations in the same neighborhood. KNN regression is used when the dependent variable is quantitative.
#Question 9: This question involves the use of multiple linear regression on the Auto data set.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(ISLR)
library(corrplot)
## corrplot 0.92 loaded
# Load the Auto data from the CSV file in the working directory
Auto <- read_csv(file = "Auto.csv")
## Rows: 397 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): horsepower, name
## dbl (7): mpg, cylinders, displacement, weight, acceleration, year, origin
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session; it serves
# no purpose when the script is knitted or run non-interactively.
if (interactive()) {
  View(Auto)
}
#Take a look at the data
str(Auto)
## spc_tbl_ [397 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ mpg : num [1:397] 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num [1:397] 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num [1:397] 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : chr [1:397] "130" "165" "150" "150" ...
## $ weight : num [1:397] 3504 3693 3436 3433 3449 ...
## $ acceleration: num [1:397] 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num [1:397] 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num [1:397] 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr [1:397] "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "spec")=
## .. cols(
## .. mpg = col_double(),
## .. cylinders = col_double(),
## .. displacement = col_double(),
## .. horsepower = col_character(),
## .. weight = col_double(),
## .. acceleration = col_double(),
## .. year = col_double(),
## .. origin = col_double(),
## .. name = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Inspect the raw horsepower values before changing the column into numeric.
# horsepower was read as character because missing entries are recorded as the
# string "?" rather than NA, so na.omit() would not drop anything here.
print(Auto$horsepower)
## [1] "130" "165" "150" "150" "140" "198" "220" "215" "225" "190" "170" "160"
## [13] "150" "225" "95" "95" "97" "85" "88" "46" "87" "90" "95" "113"
## [25] "90" "215" "200" "210" "193" "88" "90" "95" "?" "100" "105" "100"
## [37] "88" "100" "165" "175" "153" "150" "180" "170" "175" "110" "72" "100"
## [49] "88" "86" "90" "70" "76" "65" "69" "60" "70" "95" "80" "54"
## [61] "90" "86" "165" "175" "150" "153" "150" "208" "155" "160" "190" "97"
## [73] "150" "130" "140" "150" "112" "76" "87" "69" "86" "92" "97" "80"
## [85] "88" "175" "150" "145" "137" "150" "198" "150" "158" "150" "215" "225"
## [97] "175" "105" "100" "100" "88" "95" "46" "150" "167" "170" "180" "100"
## [109] "88" "72" "94" "90" "85" "107" "90" "145" "230" "49" "75" "91"
## [121] "112" "150" "110" "122" "180" "95" "?" "100" "100" "67" "80" "65"
## [133] "75" "100" "110" "105" "140" "150" "150" "140" "150" "83" "67" "78"
## [145] "52" "61" "75" "75" "75" "97" "93" "67" "95" "105" "72" "72"
## [157] "170" "145" "150" "148" "110" "105" "110" "95" "110" "110" "129" "75"
## [169] "83" "100" "78" "96" "71" "97" "97" "70" "90" "95" "88" "98"
## [181] "115" "53" "86" "81" "92" "79" "83" "140" "150" "120" "152" "100"
## [193] "105" "81" "90" "52" "60" "70" "53" "100" "78" "110" "95" "71"
## [205] "70" "75" "72" "102" "150" "88" "108" "120" "180" "145" "130" "150"
## [217] "68" "80" "58" "96" "70" "145" "110" "145" "130" "110" "105" "100"
## [229] "98" "180" "170" "190" "149" "78" "88" "75" "89" "63" "83" "67"
## [241] "78" "97" "110" "110" "48" "66" "52" "70" "60" "110" "140" "139"
## [253] "105" "95" "85" "88" "100" "90" "105" "85" "110" "120" "145" "165"
## [265] "139" "140" "68" "95" "97" "75" "95" "105" "85" "97" "103" "125"
## [277] "115" "133" "71" "68" "115" "85" "88" "90" "110" "130" "129" "138"
## [289] "135" "155" "142" "125" "150" "71" "65" "80" "80" "77" "125" "71"
## [301] "90" "70" "70" "65" "69" "90" "115" "115" "90" "76" "60" "70"
## [313] "65" "90" "88" "90" "90" "78" "90" "75" "92" "75" "65" "105"
## [325] "65" "48" "48" "67" "67" "67" "?" "67" "62" "132" "100" "88"
## [337] "?" "72" "84" "84" "92" "110" "84" "58" "64" "60" "67" "65"
## [349] "62" "68" "63" "65" "65" "74" "?" "75" "75" "100" "74" "80"
## [361] "76" "116" "120" "110" "105" "88" "85" "88" "88" "88" "85" "84"
## [373] "90" "92" "74" "68" "68" "63" "70" "88" "75" "70" "67" "67"
## [385] "67" "110" "85" "92" "112" "96" "84" "90" "86" "52" "84" "79"
## [397] "82"
# Missing horsepower values appear as "?" in the raw data. Turn them into NA
# explicitly so that as.numeric() converts the column cleanly instead of
# relying on a "NAs introduced by coercion" warning.
Auto$horsepower[Auto$horsepower == "?"] <- NA
Auto$horsepower <- as.numeric(Auto$horsepower)
# Confirm that horsepower is now numeric
str(Auto)
## spc_tbl_ [397 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ mpg : num [1:397] 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num [1:397] 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num [1:397] 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num [1:397] 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num [1:397] 3504 3693 3436 3433 3449 ...
## $ acceleration: num [1:397] 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num [1:397] 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num [1:397] 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr [1:397] "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "spec")=
## .. cols(
## .. mpg = col_double(),
## .. cylinders = col_double(),
## .. displacement = col_double(),
## .. horsepower = col_character(),
## .. weight = col_double(),
## .. acceleration = col_double(),
## .. year = col_double(),
## .. origin = col_double(),
## .. name = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Scatterplot matrix of every pair of columns in Auto
plot(Auto)
# Create a numeric data set from Auto (this drops the character column name).
# select() is namespaced because MASS, attached after the tidyverse, masks it.
Auto_num <- dplyr::select(Auto, where(is.numeric))
# Check for correlation. horsepower contains NA after the numeric conversion,
# so use complete rows only; with the default use = "everything" every
# correlation involving horsepower would be NA.
M <- cor(Auto_num, use = "complete.obs")
corrplot(M, method = "number")
# Multiple linear regression of mpg on every other column except name
lm.fit <- lm(mpg ~ . - name, data = Auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# Since the p-value of the F-statistic is less than 0.05 (p-value < 2.2e-16), we can reject the null hypothesis and say that there is a relationship between the response variable, mpg, and the predictor variables.
# Displacement, weight, year, and origin have a significant relationship to mpg (with very small p-value)
# It suggests that there is a positive relationship between year and mpg. If the other variables are held constant, mpg increases by about 0.75 for each additional model year.
# Draw the four lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so that later plots are not squeezed into one grid cell.
old_par <- par(mfrow = c(2, 2))
plot(lm.fit)
par(old_par)
###The residual plot doesn't follow a straight line and has a U-shape, which suggests that the relationship is not linear. It also suggests heteroscedasticity, since the spread of the points around the line is not constant.
###For the normal Q-Q plot, the residuals are not normally distributed since they scattered at the end of the line.
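###The Scale-Location plot has all values between 0 and 2. Note that this plot shows the square root of the absolute standardized residuals, so the usual outlier threshold of 3 corresponds to about 1.73 on this scale, and points above that level are potential outliers. (The largest raw residual, 13.06, is about 3.9 residual standard errors.)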
# Refit the full model with two interaction terms added:
# displacement x horsepower and origin x horsepower.
y_inter <- lm(
  mpg ~ . - name + displacement * horsepower + origin * horsepower,
  data = Auto
)
summary(y_inter)
##
## Call:
## lm(formula = mpg ~ . - name + displacement * horsepower + origin *
## horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7222 -1.5251 -0.0968 1.3553 12.8419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.706e+00 4.686e+00 -1.004 0.3159
## cylinders 5.142e-01 3.139e-01 1.638 0.1022
## displacement -6.970e-02 1.143e-02 -6.098 2.63e-09 ***
## horsepower -1.540e-01 3.547e-02 -4.342 1.81e-05 ***
## weight -3.084e-03 6.478e-04 -4.761 2.73e-06 ***
## acceleration -2.276e-01 9.099e-02 -2.501 0.0128 *
## year 7.349e-01 4.460e-02 16.478 < 2e-16 ***
## origin 2.281e+00 1.090e+00 2.092 0.0371 *
## displacement:horsepower 4.665e-04 6.127e-05 7.614 2.10e-13 ***
## horsepower:origin -1.918e-02 1.278e-02 -1.500 0.1343
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.908 on 382 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8644, Adjusted R-squared: 0.8612
## F-statistic: 270.6 on 9 and 382 DF, p-value: < 2.2e-16
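###After trying many interaction effects, the displacement*horsepower and origin*horsepower interactions were kept. In this fit only displacement:horsepower is significant (p = 2.10e-13); horsepower:origin is not significant at the 0.05 level (p = 0.1343).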
# Add log(horsepower) to the full model as a non-linear transformation
summary(
  lm(mpg ~ . - name + log(horsepower), data = Auto)
)
##
## Call:
## lm(formula = mpg ~ . - name + log(horsepower), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5777 -1.6623 -0.1213 1.4913 12.0230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.674e+01 1.106e+01 7.839 4.54e-14 ***
## cylinders -5.530e-02 2.907e-01 -0.190 0.849230
## displacement -4.607e-03 7.108e-03 -0.648 0.517291
## horsepower 1.764e-01 2.269e-02 7.775 7.05e-14 ***
## weight -3.366e-03 6.561e-04 -5.130 4.62e-07 ***
## acceleration -3.277e-01 9.670e-02 -3.388 0.000776 ***
## year 7.421e-01 4.534e-02 16.368 < 2e-16 ***
## origin 8.976e-01 2.528e-01 3.551 0.000432 ***
## log(horsepower) -2.685e+01 2.652e+00 -10.127 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.959 on 383 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8592, Adjusted R-squared: 0.8562
## F-statistic: 292.1 on 8 and 383 DF, p-value: < 2.2e-16
### log(horsepower) is also significant to model
#Question 10: This question should be answered using the Carseats data set.
# Inspect the structure of Carseats first; the correlation check for
# multicollinearity is done on its numeric columns further down.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Keep only the numeric columns of Carseats, then display their pairwise
# correlations as numbers.
Carseats_num <- Carseats[vapply(Carseats, is.numeric, logical(1))]
C <- cor(Carseats_num)
corrplot(C, method = "number")
# Linear regression of Sales on Price, Urban and US
lm_Carseat <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(lm_Carseat)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
### Price is negatively associated with Sales: holding the other predictors fixed, a one-unit increase in Price decreases Sales by about 0.054 units.
### Sales do not appear to depend on whether the store is in an urban location, since Urban is not significant (p-value = 0.936 > 0.05).
### Holding Price and Urban fixed, a store in the US sells about 1.2 units more than a store outside the US.
###Sales=13.043469 - 0.054459*(Price) - (0.021916)*(Urban) + 1.200573 (US)
### Urban: 1 if Urban is Yes, 0 if No
### US: 1 if US is Yes, 0 if No
### We can reject the null hypothesis with Price and US predictors since their p-value is significant
# Smaller model: regress Sales on the two significant predictors, Price and US
lm_Carseat2 <- lm(Sales ~ Price + US, data = Carseats)
summary(lm_Carseat2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
### Model in (a) and (e) have Adjusted R-squared of 0.2335 and 0.2354, respectively. This is a very low score, only 23% of y variable would be explained by the independent variables. Therefore, the model is not good enough
# 95% confidence intervals for the coefficients of the smaller model
confint(lm_Carseat2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
### We are 95% confident that the true coefficient of Price lies in (-0.065, -0.044) and that the true coefficient of USYes (the difference in Sales between US and non-US stores, holding Price fixed) lies in (0.6915, 1.7078)
# Draw the four lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so that later plots get the full device again.
old_par <- par(mfrow = c(2, 2))
plot(lm_Carseat2)
par(old_par)
### Based on the Normal Q-Q plot, there is no evidence of outliers. There is also no evidence of high-leverage observations in the Residuals vs Leverage plot
#Question 12: This problem involves simple linear regression without an intercept.
#(a) Recall that the coefficient estimate βˆ for the linear regression of Y onto X without an intercept is given by (3.38). Under what circumstance is the coefficient estimate for the regression of X onto Y the same as the coefficient estimate for the regression of Y onto X?
# From equation (3.38), the estimate for the regression of Y onto X is sum(xi*yi) / sum(xi^2), and for the regression of X onto Y it is sum(xi*yi) / sum(yi^2). The two coefficient estimates are therefore the same when: sum of xi^2 = sum of yi^2
# Counter-example: x is 1..100 and y is a random 0/1 vector, so in general
# sum(x^2) != sum(y^2) and the two no-intercept slopes differ.
# Fix the seed so the random draw, and therefore the fit, is reproducible.
# The output recorded in this file came from an earlier unseeded draw, so its
# exact numbers will differ from a seeded run.
set.seed(1)
x <- 1:100
y <- rbinom(100, 1, 0.05)
eq1 <- lm(y ~ x + 0) # regression of y onto x, no intercept
eq2 <- lm(x ~ y + 0) # regression of x onto y, no intercept
summary(eq1)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07950 -0.05824 -0.03856 -0.01729 0.99046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 0.000795 0.000378 2.103 0.038 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2199 on 99 degrees of freedom
## Multiple R-squared: 0.04277, Adjusted R-squared: 0.0331
## F-statistic: 4.424 on 1 and 99 DF, p-value: 0.03798
# Fit summary for the reverse regression (x onto y, no intercept)
summary(eq2)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.80 23.75 48.50 73.25 100.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 53.80 25.58 2.103 0.038 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.2 on 99 degrees of freedom
## Multiple R-squared: 0.04277, Adjusted R-squared: 0.0331
## F-statistic: 4.424 on 1 and 99 DF, p-value: 0.03798
#We can see that the coefficient for both cases are different
# Second example: y2 is x2 in reverse order, so both vectors hold the same
# values and sum(x2^2) equals sum(y2^2); the two no-intercept fits should
# then give the same coefficient.
x2 <- seq_len(100)
y2 <- rev(x2)
eq3 <- lm(y2 ~ x2 + 0) # regression of y2 onto x2, no intercept
eq4 <- lm(x2 ~ y2 + 0) # regression of x2 onto y2, no intercept
summary(eq3)
##
## Call:
## lm(formula = y2 ~ x2 + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x2 0.5075 0.0866 5.86 6.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 6.094e-08
# Fit summary for the reverse regression (x2 onto y2, no intercept)
summary(eq4)
##
## Call:
## lm(formula = x2 ~ y2 + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y2 0.5075 0.0866 5.86 6.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 6.094e-08