#Load the Auto data, treating "?" as missing; drop incomplete rows and the
#qualitative columns (origin and name)
auto <- read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
                 header = TRUE,
                 na.strings = "?")
auto <- na.omit(auto)
auto <- auto[, -c(8:9)]
summary(auto)
##       mpg          cylinders      displacement     horsepower
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0
##      weight      acceleration        year
##  Min.   :1613   Min.   : 8.00   Min.   :70.00
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00
##  Median :2804   Median :15.50   Median :76.00
##  Mean   :2978   Mean   :15.54   Mean   :75.98
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00
##  Max.   :5140   Max.   :24.80   Max.   :82.00
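#Sanity check (a sketch, not in the original script): na.omit() should leave 392
#complete rows, which matches the 388 residual degrees of freedom reported below
#(392 observations minus 4 estimated coefficients in the interaction models).
dim(auto)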
#1A) Use the * and : symbols to fit linear regression models with interaction effects. Do any interactions appear to be statistically significant?
mod1 <- lm(mpg~acceleration*displacement, auto)
summary(mod1)
##
## Call:
## lm(formula = mpg ~ acceleration * displacement, data = auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -12.1540  -2.2872  -0.2687   2.0308  20.4099
##
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)               23.0532678  2.9221224   7.889 3.13e-14 ***
## acceleration               0.8303377  0.1815300   4.574 6.44e-06 ***
## displacement               0.0031393  0.0113352   0.277    0.782
## acceleration:displacement -0.0045805  0.0007899  -5.799 1.38e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.456 on 388 degrees of freedom
## Multiple R-squared: 0.6766, Adjusted R-squared: 0.6741
## F-statistic: 270.5 on 3 and 388 DF, p-value: < 2.2e-16
plot(mod1$fitted.values, mod1$residuals)  # residuals vs. fitted values: check for non-constant variance
mod2 <- lm(mpg~horsepower*weight, auto)
summary(mod2)
##
## Call:
## lm(formula = mpg ~ horsepower * weight, data = auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -10.7725  -2.2074  -0.2708   1.9973  14.7314
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        6.356e+01  2.343e+00  27.127  < 2e-16 ***
## horsepower        -2.508e-01  2.728e-02  -9.195  < 2e-16 ***
## weight            -1.077e-02  7.738e-04 -13.921  < 2e-16 ***
## horsepower:weight  5.355e-05  6.649e-06   8.054 9.93e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.93 on 388 degrees of freedom
## Multiple R-squared: 0.7484, Adjusted R-squared: 0.7465
## F-statistic: 384.8 on 3 and 388 DF, p-value: < 2.2e-16
plot(mod2$fitted.values, mod2$residuals)  # residuals vs. fitted values for mod2
#The first model regresses mpg on acceleration, displacement, and their interaction. The interaction between acceleration and displacement is significant, meaning the effect of acceleration on mpg changes depending on the displacement. There is a significant main effect of acceleration but no main effect of displacement. However, the residual plot shows the residuals spreading out as the fitted values increase, indicating non-constant variance.
#The second model regresses mpg on horsepower, weight, and their interaction. The interaction between horsepower and weight is significant, meaning the effect of horsepower on mpg changes depending on the weight. There are significant main effects for both horsepower and weight. The residual plot is slightly fan-shaped, again suggesting some non-constant variance.
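#To make the mod1 interaction concrete: the fitted slope of acceleration is
#0.830 - 0.00458*displacement. A quick sketch (not part of the original
#assignment) evaluating that slope at the displacement quartiles shows it
#shrinking and turning negative for large engines:
slope_accel <- function(disp) {
  coef(mod1)["acceleration"] + coef(mod1)["acceleration:displacement"] * disp
}
slope_accel(quantile(auto$displacement, c(0.25, 0.5, 0.75)))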
#1B) Try a few transformations of the variables such as log(x), sqrt(x), or x^2. Comment on your findings.
mod11 <- lm(mpg~weight+I(horsepower^2), auto)
summary(mod11)
##
## Call:
## lm(formula = mpg ~ weight + I(horsepower^2), data = auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -11.6755  -2.8081  -0.3665   2.2985  16.5202
##
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)      4.497e+01  9.977e-01  45.075   <2e-16 ***
## weight          -6.899e-03  4.441e-04 -15.534   <2e-16 ***
## I(horsepower^2) -7.943e-05  3.844e-05  -2.066   0.0395 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.315 on 389 degrees of freedom
## Multiple R-squared: 0.696, Adjusted R-squared: 0.6944
## F-statistic: 445.2 on 2 and 389 DF, p-value: < 2.2e-16
plot(mod11$fitted.values, mod11$residuals)  # residuals vs. fitted values for mod11
mod12 <- lm(mpg~log10(acceleration), auto)
summary(mod12)
##
## Call:
## lm(formula = mpg ~ log10(acceleration), data = auto)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -18.0234  -5.6231  -0.9787   4.5943  23.0872
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          -27.834      5.373  -5.180 3.56e-07 ***
## log10(acceleration)   43.291      4.526   9.565  < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.033 on 390 degrees of freedom
## Multiple R-squared: 0.19, Adjusted R-squared: 0.1879
## F-statistic: 91.49 on 1 and 390 DF, p-value: < 2.2e-16
plot(mod12$fitted.values, mod12$residuals)  # residuals vs. fitted values for mod12
#Mod11 models mpg as a function of weight and squared horsepower. Weight is highly significant, but squared horsepower is only marginally significant (p = 0.0395), so the squared term weakens horsepower as a predictor of mpg. The residual plot shows a clear trend in the residuals, indicating that this may not be the best model for predicting mpg from horsepower and weight.
#Mod12 models mpg as a function of log10(acceleration), which is a significant predictor of mpg, though the model explains only about 19% of the variance. The residual plot of this model shows fairly constant variance.
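#As a further check (a sketch beyond the models fit above), a few candidate
#transformations of horsepower can be compared by adjusted R-squared:
fits <- list(linear = lm(mpg~horsepower, auto),
             log    = lm(mpg~log(horsepower), auto),
             sqrt   = lm(mpg~sqrt(horsepower), auto),
             square = lm(mpg~horsepower+I(horsepower^2), auto))
sapply(fits, function(m) summary(m)$adj.r.squared)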
library(ISLR)
data(Carseats)
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
#2A) Fit a multiple regression model to predict Sales using Price, Urban and US.
carMod1 <- lm(Sales~Price+Urban+US, Carseats)
summary(carMod1)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -6.9206 -1.6220 -0.0564  1.5786  7.0581
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
#2B) Provide an interpretation of each coefficient in the model. Be careful-some of the variables are qualitative.
#Price has a significant relationship with Sales: for every one-unit increase in Price, Sales decrease by about 0.05 units (Sales are recorded in thousands of units sold). Urban has a non-significant relationship with Sales: stores in urban areas sell about 0.02 units less, all else equal. US has a significant relationship with Sales: stores in the US sell about 1.2 units more.
#2C) Write out the model in equation form.
#x1 = Price
#x2 = Urban: 1 if urban, 0 otherwise
#x3 = US: 1 if US, 0 otherwise
#Sales = 13.0435 - 0.0545(x1) - 0.0219(x2) + 1.2006(x3)
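#The 0/1 coding assumed above can be confirmed from the factor contrasts
#(assuming Urban and US are factors, as in the ISLR version of Carseats):
contrasts(Carseats$Urban)
contrasts(Carseats$US)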
#2D) For which of the predictors can you reject the null hypothesis H0:Bj=0?
#You can reject the null hypothesis that Bj = 0 for Price and for US. Rejecting the null for Price says that its slope coefficient is significantly different from 0. Rejecting the null for US says that Sales differ significantly between US and non-US stores.
#2E) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome.
carMod2 <- lm(Sales~Price+US, Carseats)
summary(carMod2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -6.9269 -1.6286 -0.0574  1.5766  7.0515
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
#2F) The model used in A) shows that we can explain about 23.93% of the variance in Sales by knowing Price, Urban, and US. The model used in E) also explains about 23.93% of the variance in Sales, but by knowing only Price and US. This shows that Urban did not help to predict Sales and therefore does not contribute to explaining its variance. Also, the adjusted R-squared is higher for the model used in E) (23.54% vs. 23.35%) because it uses fewer predictors.
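#Because the model in E) is nested within the model in A), dropping Urban can
#also be tested with a partial F-test; a one-line sketch:
anova(carMod2, carMod1) # a large p-value for the Urban term supports the smaller model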
#2G) Using the model from E), obtain the 95% confidence interval for the coefficients.
confint(carMod2)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632
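#Each interval is estimate +/- t(0.975, 397) * SE; verifying the Price interval
#by hand (a sketch):
pr <- coef(summary(carMod2))["Price", ]
pr["Estimate"] + c(-1, 1) * qt(0.975, df.residual(carMod2)) * pr["Std. Error"]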