Auto <- read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
header=TRUE,
na.strings = "?")
library(tidyverse)
## ── Attaching packages ──────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
p <-2
mod <- lm(horsepower~mpg, data = Auto)
mod
##
## Call:
## lm(formula = horsepower ~ mpg, data = Auto)
##
## Coefficients:
## (Intercept) mpg
## 194.476 -3.839
summary(mod)
##
## Call:
## lm(formula = horsepower ~ mpg, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.892 -15.716 -2.094 13.108 96.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 194.4756 3.8732 50.21 <2e-16 ***
## mpg -3.8389 0.1568 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.19 on 390 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
confint(mod)
## 2.5 % 97.5 %
## (Intercept) 186.860756 202.09053
## mpg -4.147086 -3.53069
new.df <- data.frame(mpg = c(0,98), horsepower = c(0,98))
predict(mod, new.df,
interval = "predict")
## fit lwr upr
## 1 194.4756 146.3045 242.6468
## 2 -181.7354 -234.6145 -128.8562
ggplot(Auto, aes(x=mpg, y=horsepower))+
geom_point()+
geom_abline(slope=coefficients(mod)[2], intercept =coefficients(mod)[1],
color="blue", lty=2, lwd=1)+
theme_bw()
## Warning: Removed 5 rows containing missing values (geom_point).
plot(mod)
###### The fit for the line on most of the graphs appears to be within reason, with the exception of scale/location plot which has a lot of noise around the line of best fit
auto <- Auto[,-c(8:9)]
pairs(auto)
##### b.
cor(auto)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7762599 -0.8044430 NA -0.8317389
## cylinders -0.7762599 1.0000000 0.9509199 NA 0.8970169
## displacement -0.8044430 0.9509199 1.0000000 NA 0.9331044
## horsepower NA NA NA 1 NA
## weight -0.8317389 0.8970169 0.9331044 NA 1.0000000
## acceleration 0.4222974 -0.5040606 -0.5441618 NA -0.4195023
## year 0.5814695 -0.3467172 -0.3698041 NA -0.3079004
## acceleration year
## mpg 0.4222974 0.5814695
## cylinders -0.5040606 -0.3467172
## displacement -0.5441618 -0.3698041
## horsepower NA NA
## weight -0.4195023 -0.3079004
## acceleration 1.0000000 0.2829009
## year 0.2829009 1.0000000
mod_mpg <- lm(mpg~ cylinders+displacement+horsepower+weight+acceleration+year, data = auto)
summary(mod_mpg)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6927 -2.3864 -0.0801 2.0291 14.3607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.454e+01 4.764e+00 -3.051 0.00244 **
## cylinders -3.299e-01 3.321e-01 -0.993 0.32122
## displacement 7.678e-03 7.358e-03 1.044 0.29733
## horsepower -3.914e-04 1.384e-02 -0.028 0.97745
## weight -6.795e-03 6.700e-04 -10.141 < 2e-16 ***
## acceleration 8.527e-02 1.020e-01 0.836 0.40383
## year 7.534e-01 5.262e-02 14.318 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.435 on 385 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8093, Adjusted R-squared: 0.8063
## F-statistic: 272.2 on 6 and 385 DF, p-value: < 2.2e-16
# response vector
Y <- as.matrix(auto$mpg)
head(Y)
## [,1]
## [1,] 18
## [2,] 15
## [3,] 18
## [4,] 16
## [5,] 17
## [6,] 15
dim(Y)
## [1] 397 1
n <- dim(Y)[1]
# design matrix
X <- matrix(c(rep(1,n),
auto$cylinders,
auto$displacement,
auto$horsepower,
auto$weight,
auto$acceleration,
auto$year),
ncol = 7,
byrow = FALSE)
dim(X)
## [1] 397 7
# Beta-hat estimate
betaHat<-solve(t(Y)%*%Y)%*%t(Y)%*%X
betaHat
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.03829416 0.1922358 6.343801 NA 104.7789 0.6104535 2.937438
## SE beta-hat estimate:
## find projection & idenity matrcies
P<-Y%*%solve(t(Y)%*%Y)%*%t(Y)
dim(P)
## [1] 397 397
I<-diag(n)
dim(I)
## [1] 397 397
## verify residual values
res<-(I-P)%*%Y
## sum of square residuals
ss_res<-sum(t(res)%*%res)
ss_res
## [1] 8.64469e-26
## then MS(res)
ms_res<-ss_res/(n-p-1)
ms_res
## [1] 2.194084e-28
## variance of the beta-hat
var_betaHat<-ms_res*solve(t(Y)%*%Y)
var_betaHat
## [,1]
## [1,] 8.999827e-34
## SE beta-hat
se_betaHat<-sqrt(diag(var_betaHat))
se_betaHat
## [1] 2.999971e-17
plot(mod_mpg)
##### The line of fit in these graphs is greatly affected by noise within the data set, including some outliers that are shown specifically in the residuals / leverage graphs