Problem set 4

Auto <- read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
header=TRUE,
na.strings = "?")
library(tidyverse)

## ── Attaching packages ──────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ─────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

p <-2

Problem 1

1. -2.60107 2. 9.46425 3. p<0.001 4. 48 5. 0.6408 6.

Problem 2

a.

mod <- lm(horsepower~mpg, data = Auto)
mod

## 
## Call:
## lm(formula = horsepower ~ mpg, data = Auto)
## 
## Coefficients:
## (Intercept)          mpg  
##     194.476       -3.839

summary(mod)

## 
## Call:
## lm(formula = horsepower ~ mpg, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.892 -15.716  -2.094  13.108  96.947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 194.4756     3.8732   50.21   <2e-16 ***
## mpg          -3.8389     0.1568  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.19 on 390 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

i&ii. Based on the r squared value, 0.6059, this suggests a relatively strong relationship between the predictor and the response

iii. the relationship between the predictor and the response is negative

iv.

confint(mod)

##                  2.5 %    97.5 %
## (Intercept) 186.860756 202.09053
## mpg          -4.147086  -3.53069

new.df <- data.frame(mpg = c(0,98), horsepower = c(0,98))
predict(mod, new.df,
        interval = "predict")

##         fit       lwr       upr
## 1  194.4756  146.3045  242.6468
## 2 -181.7354 -234.6145 -128.8562

b.

ggplot(Auto, aes(x=mpg, y=horsepower))+
  geom_point()+
  geom_abline(slope=coefficients(mod)[2], intercept =coefficients(mod)[1],
              color="blue", lty=2, lwd=1)+
  theme_bw()

## Warning: Removed 5 rows containing missing values (geom_point).

c.

plot(mod)

###### The fit for the line on most of the graphs appears to be within reason, with the exception of scale/location plot which has a lot of noise around the line of best fit

Problem 3

auto <- Auto[,-c(8:9)]

a.

pairs(auto)

##### b.

cor(auto)

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7762599   -0.8044430         NA -0.8317389
## cylinders    -0.7762599  1.0000000    0.9509199         NA  0.8970169
## displacement -0.8044430  0.9509199    1.0000000         NA  0.9331044
## horsepower           NA         NA           NA          1         NA
## weight       -0.8317389  0.8970169    0.9331044         NA  1.0000000
## acceleration  0.4222974 -0.5040606   -0.5441618         NA -0.4195023
## year          0.5814695 -0.3467172   -0.3698041         NA -0.3079004
##              acceleration       year
## mpg             0.4222974  0.5814695
## cylinders      -0.5040606 -0.3467172
## displacement   -0.5441618 -0.3698041
## horsepower             NA         NA
## weight         -0.4195023 -0.3079004
## acceleration    1.0000000  0.2829009
## year            0.2829009  1.0000000

c.

mod_mpg <- lm(mpg~ cylinders+displacement+horsepower+weight+acceleration+year, data = auto)

summary(mod_mpg)

## 
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight + 
##     acceleration + year, data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6927 -2.3864 -0.0801  2.0291 14.3607 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.454e+01  4.764e+00  -3.051  0.00244 ** 
## cylinders    -3.299e-01  3.321e-01  -0.993  0.32122    
## displacement  7.678e-03  7.358e-03   1.044  0.29733    
## horsepower   -3.914e-04  1.384e-02  -0.028  0.97745    
## weight       -6.795e-03  6.700e-04 -10.141  < 2e-16 ***
## acceleration  8.527e-02  1.020e-01   0.836  0.40383    
## year          7.534e-01  5.262e-02  14.318  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.435 on 385 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8093, Adjusted R-squared:  0.8063 
## F-statistic: 272.2 on 6 and 385 DF,  p-value: < 2.2e-16

1. There appears to be a strong relationship between the predictors and the response

2. Year and weight of the car appear to be the most significant predictors of mpg in this model based on their large negative t values

3. The coefficient for the year variable (estimated 7.534e-01) would suggest that it has a statistically significant relationship to the response output.

d.

# response vector
Y <- as.matrix(auto$mpg)
head(Y)

##      [,1]
## [1,]   18
## [2,]   15
## [3,]   18
## [4,]   16
## [5,]   17
## [6,]   15

dim(Y)

## [1] 397   1

n <- dim(Y)[1]

# design matrix
X <- matrix(c(rep(1,n), 
              auto$cylinders,
              auto$displacement,
              auto$horsepower,
              auto$weight,
              auto$acceleration,
              auto$year),
            ncol = 7,
            byrow = FALSE)
dim(X)

## [1] 397   7

# Beta-hat estimate
betaHat<-solve(t(Y)%*%Y)%*%t(Y)%*%X
betaHat

##            [,1]      [,2]     [,3] [,4]     [,5]      [,6]     [,7]
## [1,] 0.03829416 0.1922358 6.343801   NA 104.7789 0.6104535 2.937438

## SE beta-hat estimate:
## find projection & idenity matrcies
P<-Y%*%solve(t(Y)%*%Y)%*%t(Y)
dim(P)

## [1] 397 397

I<-diag(n)
dim(I)

## [1] 397 397

## verify residual values
res<-(I-P)%*%Y
## sum of square residuals
ss_res<-sum(t(res)%*%res)
ss_res

## [1] 8.64469e-26

## then MS(res)
ms_res<-ss_res/(n-p-1)
ms_res

## [1] 2.194084e-28

## variance of the beta-hat
var_betaHat<-ms_res*solve(t(Y)%*%Y)
var_betaHat

##              [,1]
## [1,] 8.999827e-34

## SE beta-hat
se_betaHat<-sqrt(diag(var_betaHat))
se_betaHat

## [1] 2.999971e-17

e.

plot(mod_mpg)

##### The line of fit in these graphs is greatly affected by noise within the data set, including some outliers that are shown specifically in the residuals / leverage graphs