Problem 2 Carefully explain the differences between the KNN classifier and KNN regression methods. The KNN classifier uses the k nearest neighbors to output a class label (think... type/thing/noun: East|West of the Charles River, 0|1, Yes|No, Pass|Fail). KNN regression uses the k nearest neighbors to output a numerical value (think... stock prices, miles per gallon, number of widgets). Both are machine learning methods that use the k nearest neighbors of a point to build a model for prediction purposes.
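A minimal sketch of the contrast, assuming the class and FNN packages are available (neither is used elsewhere in this write-up); the toy data here are made up for illustration.
# Hypothetical illustration: class::knn() for classification, FNN::knn.reg() for regression
set.seed(1)
train_X <- matrix(rnorm(40), ncol = 2)              # 20 training points, 2 features
test_X  <- matrix(rnorm(4),  ncol = 2)              # 2 new points to predict
cls <- factor(rep(c("Pass", "Fail"), each = 10))    # qualitative response
num <- rowSums(train_X) + rnorm(20, sd = 0.1)       # quantitative response
class::knn(train_X, test_X, cl = cls, k = 3)        # classifier: outputs "Pass"/"Fail" labels
FNN::knn.reg(train_X, test_X, y = num, k = 3)$pred  # regression: outputs numeric predictions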
options(repos = list(CRAN="http://cran.rstudio.com/"))
install.packages("ISLR2")
install.packages("ISLR")
install.packages("GGally")
install.packages("ggplot2")
install.packages("tidyverse")
install.packages("rsconnect")
install.packages("knitr")
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rsconnect)
library(knitr)
data("Auto")
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
pairs(Auto, main = "Scatterplot Auto Matrix", col="orange", pch = 19)
auto <- as_tibble(Auto)
autoNoName <- select(Auto, -name)
cor(autoNoName)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
NineC <- lm(mpg ~ . -name, data = Auto)
summary(NineC)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(NineC)
interactionPlot <- lm(mpg ~ weight*displacement + . -name, data = Auto)
summary(interactionPlot)
##
## Call:
## lm(formula = mpg ~ weight * displacement + . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9027 -1.8092 -0.0946 1.5549 12.1687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.389e+00 4.301e+00 -1.253 0.2109
## weight -1.064e-02 7.136e-04 -14.915 < 2e-16 ***
## displacement -6.837e-02 1.104e-02 -6.193 1.52e-09 ***
## cylinders 1.175e-01 2.943e-01 0.399 0.6899
## horsepower -3.280e-02 1.238e-02 -2.649 0.0084 **
## acceleration 6.724e-02 8.805e-02 0.764 0.4455
## year 7.852e-01 4.553e-02 17.246 < 2e-16 ***
## origin 5.610e-01 2.622e-01 2.139 0.0331 *
## weight:displacement 2.269e-05 2.257e-06 10.054 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.964 on 383 degrees of freedom
## Multiple R-squared: 0.8588, Adjusted R-squared: 0.8558
## F-statistic: 291.1 on 8 and 383 DF, p-value: < 2.2e-16
interactionPlot2 <- lm(mpg ~ year*origin + . -name, data = Auto)
summary(interactionPlot2)
##
## Call:
## lm(formula = mpg ~ year * origin + . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6072 -2.0439 -0.0596 1.7121 12.3368
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.492e+00 9.044e+00 0.939 0.348353
## year 4.189e-01 1.125e-01 3.723 0.000226 ***
## origin -1.405e+01 4.699e+00 -2.989 0.002978 **
## cylinders -5.042e-01 3.192e-01 -1.579 0.115082
## displacement 1.567e-02 7.530e-03 2.081 0.038060 *
## horsepower -1.399e-02 1.364e-02 -1.025 0.305786
## weight -6.352e-03 6.449e-04 -9.851 < 2e-16 ***
## acceleration 9.185e-02 9.766e-02 0.941 0.347546
## year:origin 1.989e-01 6.030e-02 3.298 0.001064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.286 on 383 degrees of freedom
## Multiple R-squared: 0.8264, Adjusted R-squared: 0.8228
## F-statistic: 227.9 on 8 and 383 DF, p-value: < 2.2e-16
interactionPlot3 <- lm(mpg ~ log(year*origin) + . -name, data = Auto)
summary(interactionPlot3)
##
## Call:
## lm(formula = mpg ~ log(year * origin) + . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0421 -2.0758 -0.0992 1.9924 13.3481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.142e+01 1.141e+01 -3.631 0.000321 ***
## log(year * origin) 7.966e+00 3.434e+00 2.320 0.020872 *
## cylinders -4.916e-01 3.215e-01 -1.529 0.127026
## displacement 2.384e-02 7.663e-03 3.110 0.002008 **
## horsepower -1.800e-02 1.372e-02 -1.312 0.190233
## weight -6.707e-03 6.561e-04 -10.223 < 2e-16 ***
## acceleration 7.950e-02 9.829e-02 0.809 0.419116
## year 6.710e-01 6.125e-02 10.955 < 2e-16 ***
## origin -2.949e+00 1.906e+00 -1.547 0.122641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.309 on 383 degrees of freedom
## Multiple R-squared: 0.824, Adjusted R-squared: 0.8203
## F-statistic: 224.1 on 8 and 383 DF, p-value: < 2.2e-16
data("Carseats")
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
carseat_Sales <- lm(Sales ~ Price + Urban + US, data = Carseats)
carseat_Sales
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Coefficients:
## (Intercept) Price UrbanYes USYes
## 13.04347 -0.05446 -0.02192 1.20057
set.seed(997)
carseat_AOV_Sales <- anova(carseat_Sales)
print(carseat_AOV_Sales)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.0603 < 2.2e-16 ***
## Urban 1 0.10 0.10 0.0158 0.9001
## US 1 131.31 131.31 21.4802 4.86e-06 ***
## Residuals 396 2420.83 6.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For each one-dollar increase in the price of the car seat, predicted sales fall by about 0.054 thousand units (roughly 54 car seats), holding the other predictors fixed. If the store is in an urban area, predicted sales are about 0.022 thousand units (22 car seats) lower, although this coefficient is not statistically significant (possible causes being greater use of mass transportation/fewer personal vehicles, or fewer young children). If the store is located in the US, predicted sales are about 1.2 thousand units (1,200 car seats) higher.
#Sales = 13.04347 - 0.05446*(Price) - 0.02192*(UrbanYes: 0|1) + 1.20057*(USYes: 0|1)
You can reject the null hypothesis for the statistically significant Price and US factors in the ANOVA table; the Urban factor is not significant.
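As a quick check on that fitted equation, a minimal sketch plugging a made-up store into predict():
# Predicted sales (in thousands of units) for a hypothetical non-urban US store
# charging $120; the store values are made up for illustration
newstore <- data.frame(Price = 120, Urban = "No", US = "Yes")
predict(carseat_Sales, newdata = newstore)
# should match 13.04347 - 0.05446*120 - 0.02192*0 + 1.20057*1 computed by hand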
smaller_Carseats <- lm(Sales ~ Price + US, data = Carseats)
smaller_Carseats
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Coefficients:
## (Intercept) Price USYes
## 13.03079 -0.05448 1.19964
smaller_CarseatsAOV <- anova(smaller_Carseats)
smaller_CarseatsAOV
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## Price 1 630.03 630.03 103.319 < 2.2e-16 ***
## US 1 131.37 131.37 21.543 4.707e-06 ***
## Residuals 397 2420.87 6.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
reduced_Carseats <- anova(smaller_Carseats, carseat_Sales)
reduced_Carseats
## Analysis of Variance Table
##
## Model 1: Sales ~ Price + US
## Model 2: Sales ~ Price + Urban + US
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 397 2420.9
## 2 396 2420.8 1 0.03979 0.0065 0.9357
summary(smaller_Carseats)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
AIC(smaller_Carseats)
## [1] 1863.319
AIC(carseat_Sales)
## [1] 1865.312
BIC(smaller_Carseats)
## [1] 1879.285
BIC(carseat_Sales)
## [1] 1885.269
Neither model fits the data especially well: Price and US explain only about 24% of the variance in Sales (R-squared and adjusted R-squared of roughly 0.24), and the AIC and BIC values above only slightly favor the smaller Price + US model. I also noticed that neither model includes a demand-side variable such as the number of children under 4 (or under 1) in each market; that information is not in the Carseats data, but it would be worth pursuing instead of Urban.
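As one exploratory follow-up (not part of the assignment), a minimal sketch fitting all of the predictors that are actually in Carseats, to see how much more variance they can explain:
# Exploratory: regress Sales on every available predictor in Carseats
fullCarseats <- lm(Sales ~ ., data = Carseats)
summary(fullCarseats)$adj.r.squared  # compare with the ~0.235 of the Price + US model
AIC(fullCarseats)                    # compare with AIC(smaller_Carseats) above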
confint(smaller_Carseats, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
par(mfrow = c(2,2))
plot(smaller_Carseats)
residuals <- resid(smaller_Carseats)
outliers <- which(abs(residuals) > 2*sd(residuals))
print(outliers)
## 18 26 29 31 50 51 58 63 69 83 99 107 144 210 259 298 299 305 317 329
## 18 26 29 31 50 51 58 63 69 83 99 107 144 210 259 298 299 305 317 329
## 353 377 396
## 353 377 396
# Above are the observation numbers whose residuals exceed 2 standard deviations.
# Calculate Cook's Distance
cooks_dist <- cooks.distance(smaller_Carseats)
# Identify influential points
outliers2 <- which(cooks_dist > 0.75)
print(outliers2)
## named integer(0)
No observation has a Cook's distance greater than 0.75, so none of the large-residual points has enough influence to materially change the fitted model. (A cutoff of 0.75 is fairly generous; a stricter common rule of thumb is sketched below.)
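A minimal sketch of that stricter cutoff (the 4/n rule of thumb is my own addition, not part of the assignment):
# Flag observations whose Cook's distance exceeds 4/n
n <- nrow(Carseats)
which(cooks_dist > 4 / n)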
The coefficient estimate for the regression of X onto Y equals the one for Y onto X only when sum(x^2) equals sum(y^2), because the no-intercept slope of y ~ x is sum(x*y)/sum(x^2) while that of x ~ y is sum(x*y)/sum(y^2). The simplest such case is no error term and a 1:1 relationship between X and Y, which is exactly the y = x case simulated further below.
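A minimal sketch checking those two formulas by hand against lm() (the simulated vectors here are made up for illustration):
set.seed(1)
x_chk <- rnorm(50)
y_chk <- 2 * x_chk + rnorm(50)
sum(x_chk * y_chk) / sum(x_chk^2)  # by-hand no-intercept slope of y ~ x
coef(lm(y_chk ~ x_chk + 0))        # matches lm()
sum(x_chk * y_chk) / sum(y_chk^2)  # by-hand no-intercept slope of x ~ y
coef(lm(x_chk ~ y_chk + 0))        # differs unless sum(x^2) == sum(y^2)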
set.seed(997)
x = rnorm(100)
y = 1.5*x + rnorm(100)
modelx_y <- lm(x~y+0)
coefx_y <- coef(modelx_y)
modely_x <- lm(y~x+0)
coefy_x <- coef(modely_x)
summary(modelx_y)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27251 -0.28998 0.01147 0.36325 1.04423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.5001 0.0276 18.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5044 on 99 degrees of freedom
## Multiple R-squared: 0.7683, Adjusted R-squared: 0.766
## F-statistic: 328.3 on 1 and 99 DF, p-value: < 2.2e-16
summary(modely_x)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8367 -0.5946 0.0378 0.5903 2.0377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.53631 0.08479 18.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.884 on 99 degrees of freedom
## Multiple R-squared: 0.7683, Adjusted R-squared: 0.766
## F-statistic: 328.3 on 1 and 99 DF, p-value: < 2.2e-16
print(coefx_y)
## y
## 0.5001115
print(coefy_x)
## x
## 1.536306
set.seed(997)
x = rnorm(100)
y = x
model2y_x <- lm(y ~ x + 0)
model2x_y <- lm(x ~ y + 0)
coef2y_x <- coef(model2y_x)
coef2x_y <- coef(model2x_y)
summary(model2y_x)
## Warning in summary.lm(model2y_x): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.091e-16 -2.670e-17 1.060e-17 5.000e-17 3.650e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.000e+00 3.608e-17 2.771e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.762e-16 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.68e+32 on 1 and 99 DF, p-value: < 2.2e-16
summary(model2x_y)
## Warning in summary.lm(model2x_y): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.091e-16 -2.670e-17 1.060e-17 5.000e-17 3.650e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 1.000e+00 3.608e-17 2.771e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.762e-16 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.68e+32 on 1 and 99 DF, p-value: < 2.2e-16
print(coef2y_x)
## x
## 1
print(coef2x_y)
## y
## 1