Problem Set #4

library(tidyverse)

## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Problem 1

intercept_tval<- -17.5791/6.7587
intercept_tval

## [1] -2.600959

speed_tval<- 3.9324/.4155
speed_tval

## [1] 9.46426

pt(speed_tval, 48, lower.tail=FALSE)

## [1] 7.442476e-13

48 degrees of freedom

F-statistic: 89.56562 on 1 and 48 DF p-value: 1.490228e-12

speed_MS<- 21186/1
speed_MS

## [1] 21186

residual_MS<- 11354/48
residual_MS

## [1] 236.5417

fval<- speed_MS/residual_MS
fval

## [1] 89.56562

pf(fval, 1, 48, lower.tail=FALSE)

## [1] 1.490228e-12

Problem 2

Use the lm() function to perform a simple linear regression with mpg as the response and horsepower as the predictor. Use the summary() function to print the results.

auto<-read.csv("Auto (2).csv", header=TRUE, na.strings = "?")
head(auto)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

Auto<-na.omit(auto)
attach(Auto)

## The following object is masked from package:ggplot2:
## 
##     mpg

plot(horsepower, mpg)

mod<-lm(mpg~horsepower, data=Auto)
summary(mod)

## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

There seems to be a linear, negative, moderately strong relationship between the response (mpg) and the predictor (horsepower).

Plot the response and the predictor. Use the abline() function to display the least squares regression line.

ggplot(Auto, aes(x=horsepower, y=mpg))+
  geom_point()+
  geom_abline(slope=mod$coefficients[2], intercept=mod$coefficients[1],
              color="blue", lty=2, lwd=1)+
  theme_bw()

Use the plot() function to produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.

mpg.residual<-resid(mod)
plot(Auto$horsepower, mpg.residual)
abline(0,0)

While the residual plot seems pretty random, it is concerning to see a little bit of a curve in the residual data that could be a pattern that signals an inappropriate fit for the model. Data points above a horsepower of 150 consistently haev positive residual values, actual is consistently greater than the predicted.

Problem 3

auto<-read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
header=TRUE,
na.strings = "?")

auto<-auto[,-c(8:9)]
auto2<-na.omit(auto)

Produce a scatter plot matrix which includes all of the variables in the dataset.

pairs(auto2)

Compute the matrix of correlations between the variables using the function cor().

cor(auto2)

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
##              acceleration       year
## mpg             0.4233285  0.5805410
## cylinders      -0.5046834 -0.3456474
## displacement   -0.5438005 -0.3698552
## horsepower     -0.6891955 -0.4163615
## weight         -0.4168392 -0.3091199
## acceleration    1.0000000  0.2903161
## year            0.2903161  1.0000000

Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables (except origin and name) as predictors. Use the summary() function to print the results.

mlr_mod<-lm(mpg~ horsepower+year+cylinders+displacement+acceleration+weight, data=auto2)
summary(mlr_mod)

## 
## Call:
## lm(formula = mpg ~ horsepower + year + cylinders + displacement + 
##     acceleration + weight, data = auto2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6927 -2.3864 -0.0801  2.0291 14.3607 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.454e+01  4.764e+00  -3.051  0.00244 ** 
## horsepower   -3.914e-04  1.384e-02  -0.028  0.97745    
## year          7.534e-01  5.262e-02  14.318  < 2e-16 ***
## cylinders    -3.299e-01  3.321e-01  -0.993  0.32122    
## displacement  7.678e-03  7.358e-03   1.044  0.29733    
## acceleration  8.527e-02  1.020e-01   0.836  0.40383    
## weight       -6.795e-03  6.700e-04 -10.141  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.435 on 385 degrees of freedom
## Multiple R-squared:  0.8093, Adjusted R-squared:  0.8063 
## F-statistic: 272.2 on 6 and 385 DF,  p-value: < 2.2e-16

p<-6

Ther does appear to be a relationship between the response (mpg) and the predictors (horsepower, year, cylinders, displacement, acceleration, weight).
The predictors of year and weight appear to have a statistically significant relationship to the response of mpg.
For every 1 increase in year, you would predict a 0.7534 increase in mpg. (slope interpretation)

Write code using matrix algebra to produce the summary output (ie. B vector and SE(B), you’ll need to find the response vector and design matrix to do this).

Y<-as.matrix(auto2$mpg)
head(Y)

##      [,1]
## [1,]   18
## [2,]   15
## [3,]   18
## [4,]   16
## [5,]   17
## [6,]   15

n<-dim(Y)[1]


X<-matrix(c(rep(1, n),
            auto2$cylinders, auto2$displacement, auto2$horsepower, auto2$weight, auto2$acceleration, auto2$year), 
          ncol=7, 
          byrow=FALSE)
head(X)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]    1    8  307  130 3504 12.0   70
## [2,]    1    8  350  165 3693 11.5   70
## [3,]    1    8  318  150 3436 11.0   70
## [4,]    1    8  304  150 3433 12.0   70
## [5,]    1    8  302  140 3449 10.5   70
## [6,]    1    8  429  198 4341 10.0   70

dim(X)

## [1] 392   7

betaHat<-solve(t(X)%*%X)%*%t(X)%*%Y
betaHat

##               [,1]
## [1,] -1.453525e+01
## [2,] -3.298591e-01
## [3,]  7.678430e-03
## [4,] -3.913556e-04
## [5,] -6.794618e-03
## [6,]  8.527325e-02
## [7,]  7.533672e-01

P<-X%*%solve(t(X)%*%X)%*%t(X)
dim(P)

## [1] 392 392

fit<-P%*%Y
head(fit)

##          [,1]
## [1,] 15.08292
## [2,] 14.07257
## [3,] 15.53632
## [4,] 15.53447
## [5,] 15.28641
## [6,] 10.13543

head(mlr_mod$fitted)

##        1        2        3        4        5        6 
## 15.08292 14.07257 15.53632 15.53447 15.28641 10.13543

I<-diag(n)
dim(I)

## [1] 392 392

res<-(I-P)%*%Y
head(res)

##           [,1]
## [1,] 2.9170810
## [2,] 0.9274253
## [3,] 2.4636846
## [4,] 0.4655255
## [5,] 1.7135925
## [6,] 4.8645663

head(mlr_mod$residuals)

##         1         2         3         4         5         6 
## 2.9170810 0.9274253 2.4636846 0.4655255 1.7135925 4.8645663

ss_res<-sum(t(res)%*%res)
ss_res

## [1] 4543.347

ms_res<-ss_res/(n-p-1)
ms_res

## [1] 11.8009

var_betaHat<-ms_res*solve(t(X)%*%X)
var_betaHat

##               [,1]          [,2]          [,3]          [,4]          [,5]
## [1,] 22.6945706995 -3.009494e-01  3.969964e-03 -3.197896e-02  6.246273e-04
## [2,] -0.3009493595  1.102932e-01 -1.637297e-03  4.719516e-04 -2.646884e-05
## [3,]  0.0039699638 -1.637297e-03  5.413628e-05 -2.418195e-05 -2.173088e-06
## [4,] -0.0319789568  4.719516e-04 -2.418195e-05  1.914493e-04 -4.139916e-06
## [5,]  0.0006246273 -2.646884e-05 -2.173088e-06 -4.139916e-06  4.489304e-07
## [6,] -0.2143654463  1.688654e-03  8.326532e-05  8.941480e-04 -3.298516e-05
## [7,] -0.2234420177  2.501586e-04  2.852420e-05  1.648842e-04 -5.908396e-06
##               [,6]          [,7]
## [1,] -2.143654e-01 -2.234420e-01
## [2,]  1.688654e-03  2.501586e-04
## [3,]  8.326532e-05  2.852420e-05
## [4,]  8.941480e-04  1.648842e-04
## [5,] -3.298516e-05 -5.908396e-06
## [6,]  1.041126e-02  4.203459e-04
## [7,]  4.203459e-04  2.768669e-03

se_betaHat<-sqrt(diag(var_betaHat))
se_betaHat

## [1] 4.7638818940 0.3321041332 0.0073577361 0.0138365215 0.0006700227
## [6] 0.1020355670 0.0526181480

Use the plot function in base R or use ggplot to produce diagnostic plots of linear regression fit. Comment on any problems you see with the fit. Do the residual plots suggest any unusually large outliers (in the y direction)?

mlr.resid<-resid(mod)
plot(year, mlr.resid)

plot(cylinders, mlr.resid)

plot(weight, mlr.resid)

plot(displacement, mlr.resid)

plot(acceleration, mlr.resid)

Problem Set #4

Jack Boydell

10/4/2019

Problem 1

Problem 2

Problem 3