# Load the Auto data set; "?" entries in the CSV are read as NA.
auto <- read.csv(
  "http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.csv",
  header = TRUE,
  na.strings = "?"
)
# Keep complete rows only, then drop columns 8 and 9 by position
# (presumably the non-predictor columns -- confirm against the CSV header).
auto <- na.omit(auto)[, -c(8, 9)]
# Expose the remaining columns by bare name to the code that follows.
attach(auto)
# Baseline fit: regress mpg on every remaining column of `auto`.
pre_mod <- lm(mpg ~ ., data = auto)
summary(pre_mod)
##
## Call:
## lm(formula = mpg ~ ., data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6927 -2.3864 -0.0801 2.0291 14.3607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.454e+01 4.764e+00 -3.051 0.00244 **
## cylinders -3.299e-01 3.321e-01 -0.993 0.32122
## displacement 7.678e-03 7.358e-03 1.044 0.29733
## horsepower -3.914e-04 1.384e-02 -0.028 0.97745
## weight -6.795e-03 6.700e-04 -10.141 < 2e-16 ***
## acceleration 8.527e-02 1.020e-01 0.836 0.40383
## year 7.534e-01 5.262e-02 14.318 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.435 on 385 degrees of freedom
## Multiple R-squared: 0.8093, Adjusted R-squared: 0.8063
## F-statistic: 272.2 on 6 and 385 DF, p-value: < 2.2e-16
Since only weight and year are statistically significant as predictors, we will examine some interaction effects between those two and other predictors in the dataset.
# Choose some predictors to observe interactions.
# Interaction-only model: mpg on the year:weight product term, no main effects.
# `data = auto` is passed explicitly (as pre_mod does) so the formula variables
# come from the data frame and not from the attach()-ed search path, where
# `mpg` is masked as soon as ggplot2 is loaded.
mod1 <- lm(mpg ~ year:weight, data = auto)
summary(mod1)
##
## Call:
## lm(formula = mpg ~ year:weight, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3849 -3.3041 -0.5901 2.6158 17.5737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571e+01 9.581e-01 47.71 <2e-16 ***
## year:weight -9.882e-05 4.105e-06 -24.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.957 on 390 degrees of freedom
## Multiple R-squared: 0.5977, Adjusted R-squared: 0.5967
## F-statistic: 579.4 on 1 and 390 DF, p-value: < 2.2e-16
We see that the year:weight interaction term on its own is statistically significant (p < 2e-16). Its negative coefficient means that mpg decreases as the product of year and weight increases, and the R-squared value (0.598) indicates a moderately strong fit. The F-statistic p-value confirms that this model fits better than an intercept-only model. We can therefore be confident that the coefficient of the year:weight interaction is nonzero.
# Full two-way model: year * weight expands to year + weight + year:weight.
# `data = auto` is passed explicitly (as pre_mod does) so the formula variables
# come from the data frame and not from the attach()-ed search path, where
# `mpg` is masked as soon as ggplot2 is loaded.
mod2 <- lm(mpg ~ year * weight, data = auto)
summary(mod2)
##
## Call:
## lm(formula = mpg ~ year * weight, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0397 -1.9956 -0.0983 1.6525 12.9896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.105e+02 1.295e+01 -8.531 3.30e-16 ***
## year 2.040e+00 1.718e-01 11.876 < 2e-16 ***
## weight 2.755e-02 4.413e-03 6.242 1.14e-09 ***
## year:weight -4.579e-04 5.907e-05 -7.752 8.02e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.193 on 388 degrees of freedom
## Multiple R-squared: 0.8339, Adjusted R-squared: 0.8326
## F-statistic: 649.3 on 3 and 388 DF, p-value: < 2.2e-16
Here, we notice that every coefficient is statistically significant, and the overall F-statistic is significant as well. Therefore, we are confident that year, weight, and their interaction each have a nonzero association with mpg.
# Full three-way model: all main effects, all two-way interactions and the
# three-way interaction of year, weight and horsepower.
# `data = auto` is passed explicitly (as pre_mod does) so the formula variables
# come from the data frame and not from the attach()-ed search path, where
# `mpg` is masked as soon as ggplot2 is loaded.
mod3 <- lm(mpg ~ year * weight * horsepower, data = auto)
summary(mod3)
##
## Call:
## lm(formula = mpg ~ year * weight * horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8796 -1.6800 -0.0639 1.2556 11.5534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.577e+02 3.694e+01 -4.270 2.47e-05 ***
## year 2.886e+00 4.906e-01 5.884 8.75e-09 ***
## weight 1.889e-02 1.286e-02 1.469 0.14273
## horsepower 1.696e+00 4.208e-01 4.029 6.74e-05 ***
## year:weight -3.943e-04 1.712e-04 -2.303 0.02180 *
## year:horsepower -2.540e-02 5.652e-03 -4.494 9.27e-06 ***
## weight:horsepower -3.218e-04 1.112e-04 -2.893 0.00403 **
## year:weight:horsepower 4.972e-06 1.500e-06 3.314 0.00101 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.826 on 384 degrees of freedom
## Multiple R-squared: 0.8712, Adjusted R-squared: 0.8689
## F-statistic: 371.2 on 7 and 384 DF, p-value: < 2.2e-16
Here, all the coefficients are statistically significant except for the main effect of weight. However, since weight:horsepower, year:weight, and year:weight:horsepower are all statistically significant, we cannot drop weight from the model, even though its main effect alone is not significant (by the hierarchical principle, a predictor that appears in a significant interaction keeps its main effect). Furthermore, the significant F-statistic and the large R-squared value (0.871) indicate that the coefficients are not all zero and that the model explains most of the variation in mpg.
# Build the response vector and the design matrix by hand. Columns are read
# directly from `auto` rather than through the attach()-ed search path, where
# `mpg` is masked once ggplot2 is loaded.
Y <- as.matrix(auto$mpg)
predictors <- c(
  "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
# The leading column of ones is the intercept term; unname() drops the
# dimnames so X prints with positional [,j] headers.
X <- unname(cbind(1, as.matrix(auto[, predictors])))
# Element-wise transformations of the whole design matrix.
log_X <- log(X)
root_X <- sqrt(X)
square_X <- X^2
# Show the first ten rows of the untransformed matrix.
X[1:10, ]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1 8 307 130 3504 12.0 70
## [2,] 1 8 350 165 3693 11.5 70
## [3,] 1 8 318 150 3436 11.0 70
## [4,] 1 8 304 150 3433 12.0 70
## [5,] 1 8 302 140 3449 10.5 70
## [6,] 1 8 429 198 4341 10.0 70
## [7,] 1 8 454 220 4354 9.0 70
## [8,] 1 8 440 215 4312 8.5 70
## [9,] 1 8 455 225 4425 10.0 70
## [10,] 1 8 390 190 3850 8.5 70
# First ten rows of the log-transformed design matrix.
log_X[seq_len(10), ]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0 2.079442 5.726848 4.867534 8.161660 2.484907 4.248495
## [2,] 0 2.079442 5.857933 5.105945 8.214194 2.442347 4.248495
## [3,] 0 2.079442 5.762051 5.010635 8.142063 2.397895 4.248495
## [4,] 0 2.079442 5.717028 5.010635 8.141190 2.484907 4.248495
## [5,] 0 2.079442 5.710427 4.941642 8.145840 2.351375 4.248495
## [6,] 0 2.079442 6.061457 5.288267 8.375860 2.302585 4.248495
## [7,] 0 2.079442 6.118097 5.393628 8.378850 2.197225 4.248495
## [8,] 0 2.079442 6.086775 5.370638 8.369157 2.140066 4.248495
## [9,] 0 2.079442 6.120297 5.416100 8.395026 2.302585 4.248495
## [10,] 0 2.079442 5.966147 5.247024 8.255828 2.140066 4.248495
# First ten rows of the square-root-transformed design matrix.
root_X[seq_len(10), ]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1 2.828427 17.52142 11.40175 59.19459 3.464102 8.3666
## [2,] 1 2.828427 18.70829 12.84523 60.77006 3.391165 8.3666
## [3,] 1 2.828427 17.83255 12.24745 58.61740 3.316625 8.3666
## [4,] 1 2.828427 17.43560 12.24745 58.59181 3.464102 8.3666
## [5,] 1 2.828427 17.37815 11.83216 58.72819 3.240370 8.3666
## [6,] 1 2.828427 20.71232 14.07125 65.88627 3.162278 8.3666
## [7,] 1 2.828427 21.30728 14.83240 65.98485 3.000000 8.3666
## [8,] 1 2.828427 20.97618 14.66288 65.66582 2.915476 8.3666
## [9,] 1 2.828427 21.33073 15.00000 66.52067 3.162278 8.3666
## [10,] 1 2.828427 19.74842 13.78405 62.04837 2.915476 8.3666
# First ten rows of the squared design matrix.
square_X[seq_len(10), ]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1 64 94249 16900 12278016 144.00 4900
## [2,] 1 64 122500 27225 13638249 132.25 4900
## [3,] 1 64 101124 22500 11806096 121.00 4900
## [4,] 1 64 92416 22500 11785489 144.00 4900
## [5,] 1 64 91204 19600 11895601 110.25 4900
## [6,] 1 64 184041 39204 18844281 100.00 4900
## [7,] 1 64 206116 48400 18957316 81.00 4900
## [8,] 1 64 193600 46225 18593344 72.25 4900
## [9,] 1 64 207025 50625 19580625 100.00 4900
## [10,] 1 64 152100 36100 14822500 72.25 4900
We notice that each transformation is applied element-wise: log(), sqrt(), and squaring act on every entry of the matrix individually and return a matrix with the same dimensions. The result for an entry therefore depends only on its own value, not on its row or column position. (Note that the intercept column of ones becomes 0 under the log transformation.)
library(ISLR)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'auto':
##
## mpg
# Load the Carseats data set and keep a lower-case working copy.
data("Carseats")
carseats <- Carseats
# Make the columns reachable by bare name for the models fitted below.
attach(carseats)
# Sales regressed on Price plus the two Yes/No factors Urban and US.
# `data = carseats` binds the formula to the data frame explicitly instead of
# depending on attach() and the current order of the search path.
mod <- lm(Sales ~ Price + Urban + US, data = carseats)
summary(mod)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
We want to first analyze the Urban and US predictors. Both are coded as dummy variables with “No” as the reference level; therefore the pair (UrbanYes, USYes) is [1,0] if Urban and not US, [0,1] if not Urban and US, [1,1] if Urban and US, and [0,0] otherwise. For Urban areas, the intercept is lowered by 0.022 (an additive shift, not a multiplicative factor), and this difference is not statistically significant. US areas, however, have a larger intercept, 14.24 (13.04 + 1.20). So, at any given price, we would expect very slightly lower sales in Urban areas, whereas US areas would have higher sales.
The predictor Price appears in every case and gives the common slope of the fitted lines, where x is Price and y is the predicted Sales. For Urban areas, non-US, the line is y = 13.02 - 0.0545x. For US areas, non-Urban, the line is y = 14.24 - 0.0545x. For non-Urban, non-US areas, the line is y = 13.04 - 0.0545x. For Urban, US areas, the line is y = 14.22 - 0.0545x.
We can reject the null hypothesis $H_0: \beta_j = 0$ for Price and US (both p-values are below 0.001), but not for Urban (p = 0.936).
# Reduced Carseats model without Urban.
# `data = carseats` binds the formula to the data frame explicitly instead of
# depending on attach() and the current order of the search path.
# NOTE(review): this reuses the name `mod2` from the Auto analysis and
# overwrites that earlier fit; the plot() and confint() calls below expect
# this Carseats model under that name.
mod2 <- lm(Sales ~ Price + US, data = carseats)
summary(mod2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# Draw the lm diagnostic plots for the full model first, then the reduced one.
invisible(lapply(list(mod, mod2), plot))
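Both models explain only about 24% of the variation in Sales (R-squared = 0.2393), so neither is a strong fit, and the first model also carries an unnecessary predictor (Urban). The second model fits marginally better once model size is taken into account (adjusted R-squared 0.2354 versus 0.2335; residual standard error 2.469 versus 2.472). Its diagnostic plots also show low Cook’s distances, meaning no single observation unduly influences the fit, so it is the preferable model.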
# 95% confidence intervals for the coefficients of the full Carseats model.
confint(mod, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.76359670 14.32334118
## Price -0.06476419 -0.04415351
## UrbanYes -0.55597316 0.51214085
## USYes 0.69130419 1.70984121
# 95% confidence intervals for the coefficients of the reduced Carseats model.
confint(mod2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
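The 95% confidence intervals for the coefficients of both models are given above. Only the interval for UrbanYes contains zero, which is consistent with its non-significant t-test.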