library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Problem 1

intercept_tval<- -17.5791/6.7587
intercept_tval
## [1] -2.600959
speed_tval<- 3.9324/.4155
speed_tval
## [1] 9.46426
pt(speed_tval, 48, lower.tail=FALSE)
## [1] 7.442476e-13

48 degrees of freedom

F-statistic: 89.56562 on 1 and 48 DF p-value: 1.490228e-12

speed_MS<- 21186/1
speed_MS
## [1] 21186
residual_MS<- 11354/48
residual_MS
## [1] 236.5417
fval<- speed_MS/residual_MS
fval
## [1] 89.56562
pf(fval, 1, 48, lower.tail=FALSE)
## [1] 1.490228e-12

Problem 2

  1. Use the lm() function to perform a simple linear regression with mpg as the response and horsepower as the predictor. Use the summary() function to print the results.
auto<-read.csv("Auto (2).csv", header=TRUE, na.strings = "?")
head(auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
Auto<-na.omit(auto)
attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
plot(horsepower, mpg)

mod<-lm(mpg~horsepower, data=Auto)
summary(mod)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

There seems to be a linear, negative, moderately strong relationship between the response (mpg) and the predictor (horsepower).

  1. Plot the response and the predictor. Use the abline() function to display the least squares regression line.
ggplot(Auto, aes(x=horsepower, y=mpg))+
  geom_point()+
  geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
              color="blue", lty=2, lwd=1)+
  theme_bw()

  1. Use the plot() function to produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.
mpg.residual<-resid(mod)
plot(Auto$horsepower, mpg.residual)
abline(0,0)

While the residual plot seems pretty random, it is concerning to see a little bit of a curve in the residual data that could be a pattern that signals an inappropriate fit for the model. Data points above a horsepower of 150 consistently haev positive residual values, actual is consistently greater than the predicted.

Problem 3

auto<-read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
header=TRUE,
na.strings = "?")

auto<-auto[,-c(8:9)]
auto2<-na.omit(auto)
  1. Produce a scatter plot matrix which includes all of the variables in the dataset.
pairs(auto2)

  1. Compute the matrix of correlations between the variables using the function cor().
cor(auto2)
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
##              acceleration       year
## mpg             0.4233285  0.5805410
## cylinders      -0.5046834 -0.3456474
## displacement   -0.5438005 -0.3698552
## horsepower     -0.6891955 -0.4163615
## weight         -0.4168392 -0.3091199
## acceleration    1.0000000  0.2903161
## year            0.2903161  1.0000000
  1. Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables (except origin and name) as predictors. Use the summary() function to print the results.
mlr_mod<-lm(mpg~ horsepower+year+cylinders+displacement+acceleration+weight, data=auto2)
summary(mlr_mod)
## 
## Call:
## lm(formula = mpg ~ horsepower + year + cylinders + displacement + 
##     acceleration + weight, data = auto2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6927 -2.3864 -0.0801  2.0291 14.3607 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.454e+01  4.764e+00  -3.051  0.00244 ** 
## horsepower   -3.914e-04  1.384e-02  -0.028  0.97745    
## year          7.534e-01  5.262e-02  14.318  < 2e-16 ***
## cylinders    -3.299e-01  3.321e-01  -0.993  0.32122    
## displacement  7.678e-03  7.358e-03   1.044  0.29733    
## acceleration  8.527e-02  1.020e-01   0.836  0.40383    
## weight       -6.795e-03  6.700e-04 -10.141  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.435 on 385 degrees of freedom
## Multiple R-squared:  0.8093, Adjusted R-squared:  0.8063 
## F-statistic: 272.2 on 6 and 385 DF,  p-value: < 2.2e-16
p<-6
  1. Ther does appear to be a relationship between the response (mpg) and the predictors (horsepower, year, cylinders, displacement, acceleration, weight).

  2. The predictors of year and weight appear to have a statistically significant relationship to the response of mpg.

  3. For every 1 increase in year, you would predict a 0.7534 increase in mpg. (slope interpretation)

  1. Write code using matrix algebra to produce the summary output (ie. B vector and SE(B), you’ll need to find the response vector and design matrix to do this).
Y<-as.matrix(auto2$mpg)
head(Y)
##      [,1]
## [1,]   18
## [2,]   15
## [3,]   18
## [4,]   16
## [5,]   17
## [6,]   15
n<-dim(Y)[1]


X<-matrix(c(rep(1, n),
            auto2$cylinders, auto2$displacement, auto2$horsepower, auto2$weight, auto2$acceleration, auto2$year), 
          ncol=7, 
          byrow=FALSE)
head(X)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]    1    8  307  130 3504 12.0   70
## [2,]    1    8  350  165 3693 11.5   70
## [3,]    1    8  318  150 3436 11.0   70
## [4,]    1    8  304  150 3433 12.0   70
## [5,]    1    8  302  140 3449 10.5   70
## [6,]    1    8  429  198 4341 10.0   70
dim(X)
## [1] 392   7
betaHat<-solve(t(X)%*%X)%*%t(X)%*%Y
betaHat
##               [,1]
## [1,] -1.453525e+01
## [2,] -3.298591e-01
## [3,]  7.678430e-03
## [4,] -3.913556e-04
## [5,] -6.794618e-03
## [6,]  8.527325e-02
## [7,]  7.533672e-01
P<-X%*%solve(t(X)%*%X)%*%t(X)
dim(P)
## [1] 392 392
fit<-P%*%Y
head(fit)
##          [,1]
## [1,] 15.08292
## [2,] 14.07257
## [3,] 15.53632
## [4,] 15.53447
## [5,] 15.28641
## [6,] 10.13543
head(mlr_mod$fitted)
##        1        2        3        4        5        6 
## 15.08292 14.07257 15.53632 15.53447 15.28641 10.13543
I<-diag(n)
dim(I)
## [1] 392 392
res<-(I-P)%*%Y
head(res)
##           [,1]
## [1,] 2.9170810
## [2,] 0.9274253
## [3,] 2.4636846
## [4,] 0.4655255
## [5,] 1.7135925
## [6,] 4.8645663
head(mlr_mod$residuals)
##         1         2         3         4         5         6 
## 2.9170810 0.9274253 2.4636846 0.4655255 1.7135925 4.8645663
ss_res<-sum(t(res)%*%res)
ss_res
## [1] 4543.347
ms_res<-ss_res/(n-p-1)
ms_res
## [1] 11.8009
var_betaHat<-ms_res*solve(t(X)%*%X)
var_betaHat
##               [,1]          [,2]          [,3]          [,4]          [,5]
## [1,] 22.6945706995 -3.009494e-01  3.969964e-03 -3.197896e-02  6.246273e-04
## [2,] -0.3009493595  1.102932e-01 -1.637297e-03  4.719516e-04 -2.646884e-05
## [3,]  0.0039699638 -1.637297e-03  5.413628e-05 -2.418195e-05 -2.173088e-06
## [4,] -0.0319789568  4.719516e-04 -2.418195e-05  1.914493e-04 -4.139916e-06
## [5,]  0.0006246273 -2.646884e-05 -2.173088e-06 -4.139916e-06  4.489304e-07
## [6,] -0.2143654463  1.688654e-03  8.326532e-05  8.941480e-04 -3.298516e-05
## [7,] -0.2234420177  2.501586e-04  2.852420e-05  1.648842e-04 -5.908396e-06
##               [,6]          [,7]
## [1,] -2.143654e-01 -2.234420e-01
## [2,]  1.688654e-03  2.501586e-04
## [3,]  8.326532e-05  2.852420e-05
## [4,]  8.941480e-04  1.648842e-04
## [5,] -3.298516e-05 -5.908396e-06
## [6,]  1.041126e-02  4.203459e-04
## [7,]  4.203459e-04  2.768669e-03
se_betaHat<-sqrt(diag(var_betaHat))
se_betaHat
## [1] 4.7638818940 0.3321041332 0.0073577361 0.0138365215 0.0006700227
## [6] 0.1020355670 0.0526181480
  1. Use the plot function in base R or use ggplot to produce diagnostic plots of linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers (in the y direction)?
mlr.resid<-resid(mod)
plot(year, mlr.resid)

plot(cylinders, mlr.resid)

plot(weight, mlr.resid)

plot(displacement, mlr.resid)

plot(acceleration, mlr.resid)