auto <- read.table("Data/Auto.data",
header=TRUE,
na.strings = "?",)
auto <- na.omit(auto)
auto <- auto[,-c(8:9)]
attach(auto)
## The following object is masked from package:ggplot2:
##
## mpg
plot(auto)
model <- lm(mpg ~ displacement*horsepower*weight, data=auto)
summary(model)
##
## Call:
## lm(formula = mpg ~ displacement * horsepower * weight, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2534 -2.1937 -0.3389 1.7998 16.7662
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.130e+01 5.319e+00 11.524 < 2e-16 ***
## displacement -1.051e-01 3.736e-02 -2.812 0.00517 **
## horsepower -2.267e-01 7.046e-02 -3.218 0.00140 **
## weight -5.976e-03 2.249e-03 -2.657 0.00820 **
## displacement:horsepower 6.248e-04 2.593e-04 2.409 0.01646 *
## displacement:weight 1.390e-05 9.715e-06 1.430 0.15341
## horsepower:weight 1.736e-05 2.368e-05 0.733 0.46398
## displacement:horsepower:weight -6.838e-08 6.488e-08 -1.054 0.29253
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.874 on 384 degrees of freedom
## Multiple R-squared: 0.7581, Adjusted R-squared: 0.7536
## F-statistic: 171.9 on 7 and 384 DF, p-value: < 2.2e-16
model2 <- lm(mpg ~ displacement*horsepower, data=auto)
summary(model2)
##
## Call:
## lm(formula = mpg ~ displacement * horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9391 -2.3373 -0.5816 2.1698 17.5771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.305e+01 1.526e+00 34.77 <2e-16 ***
## displacement -9.805e-02 6.682e-03 -14.67 <2e-16 ***
## horsepower -2.343e-01 1.959e-02 -11.96 <2e-16 ***
## displacement:horsepower 5.828e-04 5.193e-05 11.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.944 on 388 degrees of freedom
## Multiple R-squared: 0.7466, Adjusted R-squared: 0.7446
## F-statistic: 381 on 3 and 388 DF, p-value: < 2.2e-16
model3 <- lm(mpg ~ displacement:horsepower, data = auto)
summary(model3)
##
## Call:
## lm(formula = mpg ~ displacement:horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1917 -3.9460 -0.9919 3.0108 18.2170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.989e+01 3.901e-01 76.62 <2e-16 ***
## displacement:horsepower -2.694e-04 1.209e-05 -22.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.184 on 390 degrees of freedom
## Multiple R-squared: 0.56, Adjusted R-squared: 0.5589
## F-statistic: 496.4 on 1 and 390 DF, p-value: < 2.2e-16
The interaction between displacement and horsepower, even looking at it in a larger model, seems to be significant, so I isolated it in a smaller model a few different ways.
ggplot(data=auto, aes(x=horsepower, y=mpg))+ geom_point()
model4 <- lm(mpg ~ displacement:horsepower+I(horsepower^2), data = auto)
summary(model4)
##
## Call:
## lm(formula = mpg ~ displacement:horsepower + I(horsepower^2),
## data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7443 -3.7544 -0.9565 3.0978 18.2383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.958e+01 4.398e-01 67.255 < 2e-16 ***
## I(horsepower^2) 1.610e-04 1.072e-04 1.502 0.134
## displacement:horsepower -3.400e-04 4.854e-05 -7.005 1.09e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.175 on 389 degrees of freedom
## Multiple R-squared: 0.5626, Adjusted R-squared: 0.5603
## F-statistic: 250.1 on 2 and 389 DF, p-value: < 2.2e-16
plot(model4)
Hmm, that didn’t seem to accomplish too much…
model5 <- lm(mpg ~ horsepower+I(sqrt(horsepower)), data = auto)
summary(model4)
##
## Call:
## lm(formula = mpg ~ displacement:horsepower + I(horsepower^2),
## data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7443 -3.7544 -0.9565 3.0978 18.2383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.958e+01 4.398e-01 67.255 < 2e-16 ***
## I(horsepower^2) 1.610e-04 1.072e-04 1.502 0.134
## displacement:horsepower -3.400e-04 4.854e-05 -7.005 1.09e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.175 on 389 degrees of freedom
## Multiple R-squared: 0.5626, Adjusted R-squared: 0.5603
## F-statistic: 250.1 on 2 and 389 DF, p-value: < 2.2e-16
model6 <- lm(mpg ~ horsepower + I(log(horsepower)), data = auto)
summary(model6)
##
## Call:
## lm(formula = mpg ~ horsepower + I(log(horsepower)), data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5118 -2.5018 -0.2533 2.4446 15.3102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.04057 12.08267 12.914 < 2e-16 ***
## horsepower 0.11846 0.02929 4.044 6.34e-05 ***
## I(log(horsepower)) -31.59815 3.28363 -9.623 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.415 on 389 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6801
## F-statistic: 416.6 on 2 and 389 DF, p-value: < 2.2e-16
plot(model6)
detach(auto)
Aha, using a log term for horsepower makes the residual vs fitted graph flat a lot straighter, which means we have a better fit.
#library(MASS)
#install.packages("ISLR")
library(ISLR)
Carseat <- Carseats
attach(Carseat)
mod <- lm(Sales ~ Price + Urban + US, data = Carseat)
summary(mod)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
Sales fall by roughly -0.054 for each ditional dollar of price. If the area is urban the average sales are -0.022 lower If it’s in the US, the average sales are 1.2005 higher.
with true = 1 and false = 0: sales = (13.04347 - 0.02192 * UrbanYes + 1.20057 * USYes) -0.05446 * Price + epsilon
Price and USYes seem to be significant predictors that are unlikely to have been so predictive due to chance as shown by their low p-values, but UrbanYes has a p-value of 0.936 and so could easily have happened by chance even if the null hypothesis were true.
mod2 <- lm(Sales ~ Price + US, data = Carseat)
summary(mod2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
plot(mod2)
These seem like not particularly well fitted models given that the adjusted r-squared is only 0.2354, implying that only about 23 percent of the response variable variation can be explained by the model. However, the fitted plot looks decent so there’s that at least.
ggplot(data=Carseat,aes(y=Sales,x=Price)) +
geom_point() +
geom_abline(intercept=13.03079,slope=-0.05448,linetype="dashed",color="red",size=1.5)
Looking at this plot, maybe this is the bast we can get with this data. (This doesn’t include the separation from the US categories, but I haven’t quite figured out how to do that with ggplot yet.)
confint(mod2)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632