asst 5

Problem 1

auto <- read.table("Data/Auto.data", 
                   header=TRUE,
                   na.strings = "?",)

auto <- na.omit(auto)
auto <- auto[,-c(8:9)]
attach(auto)

## The following object is masked from package:ggplot2:
## 
##     mpg

plot(auto)

model <- lm(mpg ~ displacement*horsepower*weight, data=auto)
summary(model)

## 
## Call:
## lm(formula = mpg ~ displacement * horsepower * weight, data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2534  -2.1937  -0.3389   1.7998  16.7662 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.130e+01  5.319e+00  11.524  < 2e-16 ***
## displacement                   -1.051e-01  3.736e-02  -2.812  0.00517 ** 
## horsepower                     -2.267e-01  7.046e-02  -3.218  0.00140 ** 
## weight                         -5.976e-03  2.249e-03  -2.657  0.00820 ** 
## displacement:horsepower         6.248e-04  2.593e-04   2.409  0.01646 *  
## displacement:weight             1.390e-05  9.715e-06   1.430  0.15341    
## horsepower:weight               1.736e-05  2.368e-05   0.733  0.46398    
## displacement:horsepower:weight -6.838e-08  6.488e-08  -1.054  0.29253    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.874 on 384 degrees of freedom
## Multiple R-squared:  0.7581, Adjusted R-squared:  0.7536 
## F-statistic: 171.9 on 7 and 384 DF,  p-value: < 2.2e-16

model2 <- lm(mpg ~ displacement*horsepower, data=auto)
summary(model2)

## 
## Call:
## lm(formula = mpg ~ displacement * horsepower, data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9391  -2.3373  -0.5816   2.1698  17.5771 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              5.305e+01  1.526e+00   34.77   <2e-16 ***
## displacement            -9.805e-02  6.682e-03  -14.67   <2e-16 ***
## horsepower              -2.343e-01  1.959e-02  -11.96   <2e-16 ***
## displacement:horsepower  5.828e-04  5.193e-05   11.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.944 on 388 degrees of freedom
## Multiple R-squared:  0.7466, Adjusted R-squared:  0.7446 
## F-statistic:   381 on 3 and 388 DF,  p-value: < 2.2e-16

model3 <- lm(mpg ~ displacement:horsepower, data = auto)
summary(model3)

## 
## Call:
## lm(formula = mpg ~ displacement:horsepower, data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1917  -3.9460  -0.9919   3.0108  18.2170 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.989e+01  3.901e-01   76.62   <2e-16 ***
## displacement:horsepower -2.694e-04  1.209e-05  -22.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.184 on 390 degrees of freedom
## Multiple R-squared:   0.56,  Adjusted R-squared:  0.5589 
## F-statistic: 496.4 on 1 and 390 DF,  p-value: < 2.2e-16

The interaction between displacement and horsepower, even looking at it in a larger model, seems to be significant, so I isolated it in a smaller model a few different ways.

ggplot(data=auto, aes(x=horsepower, y=mpg))+ geom_point()

model4 <- lm(mpg ~ displacement:horsepower+I(horsepower^2), data = auto)
summary(model4)

## 
## Call:
## lm(formula = mpg ~ displacement:horsepower + I(horsepower^2), 
##     data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7443  -3.7544  -0.9565   3.0978  18.2383 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.958e+01  4.398e-01  67.255  < 2e-16 ***
## I(horsepower^2)          1.610e-04  1.072e-04   1.502    0.134    
## displacement:horsepower -3.400e-04  4.854e-05  -7.005 1.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.175 on 389 degrees of freedom
## Multiple R-squared:  0.5626, Adjusted R-squared:  0.5603 
## F-statistic: 250.1 on 2 and 389 DF,  p-value: < 2.2e-16

plot(model4)

Hmm, that didn’t seem to accomplish too much…

model5 <- lm(mpg ~ horsepower+I(sqrt(horsepower)), data = auto)
summary(model4)

## 
## Call:
## lm(formula = mpg ~ displacement:horsepower + I(horsepower^2), 
##     data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7443  -3.7544  -0.9565   3.0978  18.2383 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.958e+01  4.398e-01  67.255  < 2e-16 ***
## I(horsepower^2)          1.610e-04  1.072e-04   1.502    0.134    
## displacement:horsepower -3.400e-04  4.854e-05  -7.005 1.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.175 on 389 degrees of freedom
## Multiple R-squared:  0.5626, Adjusted R-squared:  0.5603 
## F-statistic: 250.1 on 2 and 389 DF,  p-value: < 2.2e-16

model6 <- lm(mpg ~ horsepower + I(log(horsepower)), data = auto)
summary(model6)

## 
## Call:
## lm(formula = mpg ~ horsepower + I(log(horsepower)), data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5118  -2.5018  -0.2533   2.4446  15.3102 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        156.04057   12.08267  12.914  < 2e-16 ***
## horsepower           0.11846    0.02929   4.044 6.34e-05 ***
## I(log(horsepower)) -31.59815    3.28363  -9.623  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.415 on 389 degrees of freedom
## Multiple R-squared:  0.6817, Adjusted R-squared:  0.6801 
## F-statistic: 416.6 on 2 and 389 DF,  p-value: < 2.2e-16

plot(model6)

detach(auto)

Aha, using a log term for horsepower makes the residual vs fitted graph flat a lot straighter, which means we have a better fit.

Problem 2

#library(MASS)
#install.packages("ISLR")
library(ISLR)
Carseat <- Carseats
attach(Carseat)
mod <- lm(Sales ~ Price + Urban + US, data = Carseat)
summary(mod)

## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16

Sales fall by roughly -0.054 for each ditional dollar of price. If the area is urban the average sales are -0.022 lower If it’s in the US, the average sales are 1.2005 higher.

with true = 1 and false = 0: sales = (13.04347 - 0.02192 * UrbanYes + 1.20057 * USYes) -0.05446 * Price + epsilon

Price and USYes seem to be significant predictors that are unlikely to have been so predictive due to chance as shown by their low p-values, but UrbanYes has a p-value of 0.936 and so could easily have happened by chance even if the null hypothesis were true.

mod2 <- lm(Sales ~ Price + US, data = Carseat)
summary(mod2)

## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16

plot(mod2)

These seem like not particularly well fitted models given that the adjusted r-squared is only 0.2354, implying that only about 23 percent of the response variable variation can be explained by the model. However, the fitted plot looks decent so there’s that at least.

ggplot(data=Carseat,aes(y=Sales,x=Price)) +
  geom_point() +
  geom_abline(intercept=13.03079,slope=-0.05448,linetype="dashed",color="red",size=1.5)

Looking at this plot, maybe this is the bast we can get with this data. (This doesn’t include the separation from the US categories, but I haven’t quite figured out how to do that with ggplot yet.)

confint(mod2)

##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632