# The difference between the KNN classifier and KNN regression methods is the type of response (dependent) variable: the classifier predicts a categorical response, while KNN regression predicts a continuous response.
## Number 9
library(ISLR2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Auto data set and draw a scatterplot matrix of all its columns.
data(Auto)
pairs(Auto)
## Part B
# Pairwise correlations among the first eight columns (mpg through origin).
cor(Auto[, 1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# A large scipen penalty makes R print numbers in fixed notation instead of
# scientific notation.
options(scipen = 999)
# Multiple regression of mpg on every other column of Auto[, 1:8].
linearmodel <- lm(formula = mpg ~ ., data = Auto[, 1:8])
summary(linearmodel)
##
## Call:
## lm(formula = mpg ~ ., data = Auto[, 1:8])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 0.0000000000000002 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 0.0000000000000002 ***
## origin 1.426141 0.278136 5.127 0.000000467 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 0.00000000000000022
# i. There is a relationship between the predictors and the response variable, mpg: the overall F-statistic (252.4, p-value < 2.2e-16) rejects the hypothesis that all coefficients are zero. The R-squared of about 0.82 tells us the model explains roughly 82% of the variance in mpg, which is a reasonably good fit.
# ii. From the linear model, the p-values show a statistically significant relationship between mpg and several predictors: displacement, weight, year, and origin. However, origin is stored as a number here and may need to be changed to a factor variable.
# iii. The coefficient for the year variable (about 0.75) is positive: holding the other predictors fixed, mpg increases by roughly 0.75 per model year, so newer cars tend to be more fuel-efficient.
# Use a 2 x 2 plotting grid so the four lm diagnostic panels share one page.
par(mfrow = c(2, 2))
# which = c(1, 2, 3, 5) is plot.lm's default panel selection, spelled out.
plot(linearmodel, which = c(1, 2, 3, 5))
# The residuals-vs-fitted plot suggests there are a few outliers (observations 323, 326, 327), but not many. The residuals-vs-leverage plot shows some points with high leverage, but nearly all of them lie within the Cook's distance contours.
# Refit mpg with interaction effects (this overwrites the earlier
# `linearmodel`).
# `horsepower * year` expands to horsepower + year + horsepower:year, and
# `horsepower * weight` likewise contributes the weight:horsepower term.
# NOTE: writing `cylinders * cylinders` or `weight * weight` in an R formula
# does NOT create a squared term -- a variable crossed with itself collapses to
# the plain main effect -- so both are written as main effects here. The fitted
# model (terms, coefficients, residuals) is the same as with the self-products;
# only the echoed Call differs. A genuine quadratic term would need
# I(cylinders^2) or I(weight^2).
linearmodel <- lm(
  formula = mpg ~ cylinders + weight + horsepower * year + horsepower * weight,
  data = Auto[, 1:8]
)
summary(linearmodel)
##
## Call:
## lm(formula = mpg ~ cylinders * cylinders + weight * weight +
## horsepower * year + horsepower * weight, data = Auto[, 1:8])
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3390 -1.7208 -0.0183 1.3665 11.9425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -51.007282504 10.674370551 -4.778 0.000002516645302063 ***
## cylinders 0.158219457 0.203707623 0.777 0.43781
## weight -0.010459504 0.000703948 -14.858 < 0.0000000000000002 ***
## horsepower 0.336483446 0.104778631 3.211 0.00143 **
## year 1.449502772 0.130802655 11.082 < 0.0000000000000002 ***
## horsepower:year -0.007114799 0.001302777 -5.461 0.000000084966691387 ***
## weight:horsepower 0.000045693 0.000005365 8.517 0.000000000000000373 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.861 on 385 degrees of freedom
## Multiple R-squared: 0.8677, Adjusted R-squared: 0.8656
## F-statistic: 420.7 on 6 and 385 DF, p-value: < 0.00000000000000022
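# Both interaction terms (horsepower:year and weight:horsepower) are statistically significant. The model contains no squared terms: in an R formula `x * x` collapses to just `x` (a quadratic term needs `I(x^2)`), so the one non-significant coefficient is the main effect of cylinders, not cylinders^2.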
# Two panels side by side: mpg against squared acceleration and against
# log horsepower. The axis labels are given explicitly and match the labels
# R would derive from the plotted expressions.
par(mfrow = c(1, 2))
plot(
  x = Auto$acceleration^2, y = Auto$mpg,
  xlab = "(Auto$acceleration)^2", ylab = "Auto$mpg"
)
plot(
  x = log(Auto$horsepower), y = Auto$mpg,
  xlab = "log(Auto$horsepower)", ylab = "Auto$mpg"
)
# Of the two plots, mpg against log(horsepower) shows the more linear-looking relationship.
# Load the Carseats data set and show the type of each column.
data(Carseats)
str(object = Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Regress Sales on Price plus the two Yes/No factors Urban and US.
model <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(model)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 0.0000000000000002 ***
## Price -0.054459 0.005242 -10.389 < 0.0000000000000002 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 0.00000486 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 0.00000000000000022
# The overall F-test shows that at least one coefficient is nonzero. The low p-values for the Price and USYes coefficients show that they are significant. The UrbanYes coefficient is not significant: there is no evidence that sales differ between Urban = Yes and Urban = No. Our R-squared is low (about 0.24), which tells us that the model will not predict very well as it stands.
# Sales = 13.04 - 0.054 * Price - 0.022 * UrbanYes + 1.20 * USYes, where UrbanYes and USYes equal 1 for "Yes" and 0 for "No".
# Smaller model: drop Urban and keep only Price and US as predictors of Sales
# (this overwrites the previous `model`).
model <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(model)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 0.0000000000000002 ***
## Price -0.05448 0.00523 -10.416 < 0.0000000000000002 ***
## USYes 1.19964 0.25846 4.641 0.00000471 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 0.00000000000000022
# 95% confidence intervals for the coefficients (0.95 is confint's default level).
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# Show the four lm diagnostic panels for `model` on a 2 x 2 grid.
par(mfrow = c(2, 2))
# which = c(1, 2, 3, 5) is plot.lm's default panel selection, spelled out.
plot(model, which = c(1, 2, 3, 5))
# Based on the residuals-vs-fitted plot, the residuals are spread fairly evenly for fitted values between 6 and 10. The normal Q-Q plot looks good. However, there is high leverage between 0 and .01 on the residuals-vs-leverage plot.
##Part A
# Simulate y = 2x plus small Gaussian noise, then fit both through-the-origin
# regressions (y onto x, and x onto y).
x <- 1:100
sum(x^2)
## [1] 338350
# Fix the RNG seed so the noise -- and every number derived from it -- is
# reproducible. The outputs recorded in this file come from an earlier
# unseeded run, so the noisy values printed now will differ slightly from them.
set.seed(1)
# One noise draw per element of x (length(x) rather than a hard-coded 100).
y <- 2 * x + rnorm(length(x), sd = 0.01)
sum(y^2)
## [1] 1353461
# fitx: y regressed onto x; fity: x regressed onto y (both without intercept).
fitx <- lm(y ~ x + 0)
fity <- lm(x ~ y + 0)
summary(fity)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.013628 -0.003679 -0.000283 0.002634 0.015427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.499988722 0.000004447 112428 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.005174 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.264e+10 on 1 and 99 DF, p-value: < 0.00000000000000022
# Summary of the no-intercept regression of y onto x.
summary(object = fitx)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.030853 -0.005267 0.000567 0.007359 0.027258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 2.00004510 0.00001779 112428 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01035 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.264e+10 on 1 and 99 DF, p-value: < 0.00000000000000022
# Second example: y is x in reverse order, so sum(x^2) equals sum(y^2) and the
# two through-the-origin slopes coincide.
x <- seq_len(100)
sum(x^2)
## [1] 338350
y <- rev(x)
sum(y^2)
## [1] 338350
# fitx: y regressed onto x; fity: x regressed onto y (both without intercept).
fitx <- lm(y ~ x + 0)
fity <- lm(x ~ y + 0)
summary(fity)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.5075 0.0866 5.86 0.0000000609 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 0.00000006094
# Summary of the no-intercept regression of y onto x.
summary(object = fitx)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.75 -12.44 24.87 62.18 99.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 0.5075 0.0866 5.86 0.0000000609 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.37 on 99 degrees of freedom
## Multiple R-squared: 0.2575, Adjusted R-squared: 0.25
## F-statistic: 34.34 on 1 and 99 DF, p-value: 0.00000006094