Mihreteab Teklehaimanot
# Load the Auto data and drop incomplete rows.
# Missing horsepower values are assumed to be coded as "?" (as in the standard
# ISLR Auto.csv -- TODO confirm for this copy of the file). Declaring the marker
# at read time turns it into NA explicitly instead of relying on a coercion
# warning, and stringsAsFactors = FALSE prevents as.integer() from returning
# factor level codes on R versions older than 4.0.
auto <- read.csv(
  "auto.csv",
  na.strings = c("NA", "?"),
  stringsAsFactors = FALSE
)
# Guarantees an integer column; a no-op when read.csv already parsed it as one.
auto$horsepower <- as.integer(auto$horsepower)
# Remove every row that has a missing value in any column.
auto <- na.omit(auto)
str(auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Scatterplot matrix and correlation matrix of the first seven columns
# (mpg through year). Both need numeric input, so `name` is left out; note
# that selecting columns 1-7 also leaves out `origin`.
pairs(auto[seq_len(7L)])
cor(auto[seq_len(7L)])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## acceleration year
## mpg 0.4233285 0.5805410
## cylinders -0.5046834 -0.3456474
## displacement -0.5438005 -0.3698552
## horsepower -0.6891955 -0.4163615
## weight -0.4168392 -0.3091199
## acceleration 1.0000000 0.2903161
## year 0.2903161 1.0000000
# Multiple linear regression of mpg on every other column except `name`.
auto_lm <- lm(formula = mpg ~ . - name, data = auto)
summary(object = auto_lm)
##
## Call:
## lm(formula = mpg ~ . - name, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
i. Is there a relationship between the predictors and the response?
ii. Which predictors appear to have a statistically significant relationship to the response?
iii. What does the coefficient for the “year” variable suggest?
# Arrange the plotting device as a 2 x 2 grid, then draw the diagnostic
# plots for the full model.
par(mfrow = c(2L, 2L))
plot(x = auto_lm)
Comment on any problems you see with the fit
Do the residual plots suggest any unusually large outliers?
Does the leverage plot identify any observations with unusually high leverage?
# Full model plus a cylinders-by-displacement interaction term.
fit.int <- lm(
  formula = mpg ~ . - name + cylinders:displacement,
  data = auto
)
summary(object = fit.int)
##
## Call:
## lm(formula = mpg ~ . - name + cylinders:displacement, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.6081 -1.7833 -0.0465 1.6821 12.2617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.7096590 4.6858582 -0.578 0.563426
## cylinders -2.6962123 0.4094916 -6.584 1.51e-10 ***
## displacement -0.0774797 0.0141535 -5.474 7.96e-08 ***
## horsepower -0.0476026 0.0133736 -3.559 0.000418 ***
## weight -0.0052339 0.0006253 -8.370 1.10e-15 ***
## acceleration 0.0597997 0.0918038 0.651 0.515188
## year 0.7594500 0.0473354 16.044 < 2e-16 ***
## origin 0.7087399 0.2736917 2.590 0.009976 **
## cylinders:displacement 0.0136081 0.0017209 7.907 2.84e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.089 on 383 degrees of freedom
## Multiple R-squared: 0.8465, Adjusted R-squared: 0.8433
## F-statistic: 264.1 on 8 and 383 DF, p-value: < 2.2e-16
# Full model plus a weight-by-displacement interaction. The `*` operator also
# requests both main effects, but those are already supplied by `.`, so the
# only additional term is displacement:weight.
fit.int1 <- lm(
  formula = mpg ~ . - name + weight * displacement,
  data = auto
)
summary(object = fit.int1)
##
## Call:
## lm(formula = mpg ~ . - name + weight * displacement, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9027 -1.8092 -0.0946 1.5549 12.1687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.389e+00 4.301e+00 -1.253 0.2109
## cylinders 1.175e-01 2.943e-01 0.399 0.6899
## displacement -6.837e-02 1.104e-02 -6.193 1.52e-09 ***
## horsepower -3.280e-02 1.238e-02 -2.649 0.0084 **
## weight -1.064e-02 7.136e-04 -14.915 < 2e-16 ***
## acceleration 6.724e-02 8.805e-02 0.764 0.4455
## year 7.852e-01 4.553e-02 17.246 < 2e-16 ***
## origin 5.610e-01 2.622e-01 2.139 0.0331 *
## displacement:weight 2.269e-05 2.257e-06 10.054 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.964 on 383 degrees of freedom
## Multiple R-squared: 0.8588, Adjusted R-squared: 0.8558
## F-statistic: 291.1 on 8 and 383 DF, p-value: < 2.2e-16
# Two models of mpg on cylinders: a straight-line fit and a degree-3
# polynomial fit, so the effect of the transformation can be compared.
fit_cyl <- lm(formula = mpg ~ cylinders, data = auto)
fit_ptrans <- lm(formula = mpg ~ poly(cylinders, 3), data = auto)
summary(object = fit_cyl)
##
## Call:
## lm(formula = mpg ~ cylinders, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2413 -3.1832 -0.6332 2.5491 17.9168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.9155 0.8349 51.40 <2e-16 ***
## cylinders -3.5581 0.1457 -24.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.914 on 390 degrees of freedom
## Multiple R-squared: 0.6047, Adjusted R-squared: 0.6037
## F-statistic: 596.6 on 1 and 390 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the cubic polynomial model.
summary(object = fit_ptrans)
##
## Call:
## lm(formula = mpg ~ poly(cylinders, 3), data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2869 -2.9058 -0.9627 2.3403 18.0218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.446 0.237 98.919 < 2e-16 ***
## poly(cylinders, 3)1 -120.013 4.693 -25.574 < 2e-16 ***
## poly(cylinders, 3)2 8.113 4.693 1.729 0.0846 .
## poly(cylinders, 3)3 28.379 4.693 6.047 3.46e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.693 on 388 degrees of freedom
## Multiple R-squared: 0.6413, Adjusted R-squared: 0.6385
## F-statistic: 231.2 on 3 and 388 DF, p-value: < 2.2e-16
# Compare the linear and cubic cylinders models.
anova(fit_cyl, fit_ptrans)
# Two models of mpg on horsepower: linear only, and linear plus a
# log(horsepower) term.
lm.mha <- lm(formula = mpg ~ horsepower, data = auto)
lm.trans <- lm(formula = mpg ~ horsepower + log(horsepower), data = auto)
summary(object = lm.mha)
##
## Call:
## lm(formula = mpg ~ horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the model with the log term.
summary(object = lm.trans)
##
## Call:
## lm(formula = mpg ~ horsepower + log(horsepower), data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5118 -2.5018 -0.2533 2.4446 15.3102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.04057 12.08267 12.914 < 2e-16 ***
## horsepower 0.11846 0.02929 4.044 6.34e-05 ***
## log(horsepower) -31.59815 3.28363 -9.623 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.415 on 389 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6801
## F-statistic: 416.6 on 2 and 389 DF, p-value: < 2.2e-16
# Compare the horsepower models with and without the log term.
anova(lm.mha, lm.trans)
# Regress Sales on Price, Urban and US.
# NOTE(review): `Carseats` is not created anywhere in the visible code;
# presumably it comes from an attached package (e.g. ISLR) -- confirm.
cseats_lm <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(object = cseats_lm)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
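Price is significant and negatively related to Sales: holding the other predictors fixed, each one-dollar increase in price is associated with a decrease in sales of about 54 units (0.054 thousand units, since Sales is recorded in thousands of units).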
The dummy variable USYes is significant: holding the other predictors fixed, stores located in the US sell about 1,200 more units on average than stores located outside the US.
Urban is not significant (p = 0.936), so there is no evidence that an urban location affects sales.
# Reduced Carseats model: the same regression with the Urban predictor removed.
cseats_lm2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(object = cseats_lm2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
Based on the F-statistics and their small p-values, both models are significant. Both models fit the data similarly, with the model from (e) fitting the data slightly better (adjusted R-squared 23.54% vs. 23.35%).
However, the R-squared and adjusted R-squared values for both models are relatively small: only about 24% of the variation in Sales is explained (R-squared 23.93% for both models; adjusted R-squared 23.54% and 23.35%).
# 95% confidence intervals for the coefficients of the reduced model.
confint(object = cseats_lm2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# Diagnostic plots for the reduced model on a 2 x 2 grid, followed by a
# table of the observations flagged as potentially influential.
par(mfrow = c(2L, 2L))
plot(x = cseats_lm2)
summary(object = influence.measures(model = cseats_lm2))
## Potentially influential observations of
## lm(formula = Sales ~ Price + US, data = Carseats) :
##
## dfb.1_ dfb.Pric dfb.USYs dffit cov.r cook.d hat
## 26 0.24 -0.18 -0.17 0.28_* 0.97_* 0.03 0.01
## 29 -0.10 0.10 -0.10 -0.18 0.97_* 0.01 0.01
## 43 -0.11 0.10 0.03 -0.11 1.05_* 0.00 0.04_*
## 50 -0.10 0.17 -0.17 0.26_* 0.98 0.02 0.01
## 51 -0.05 0.05 -0.11 -0.18 0.95_* 0.01 0.00
## 58 -0.05 -0.02 0.16 -0.20 0.97_* 0.01 0.01
## 69 -0.09 0.10 0.09 0.19 0.96_* 0.01 0.01
## 126 -0.07 0.06 0.03 -0.07 1.03_* 0.00 0.03_*
## 160 0.00 0.00 0.00 0.01 1.02_* 0.00 0.02
## 166 0.21 -0.23 -0.04 -0.24 1.02 0.02 0.03_*
## 172 0.06 -0.07 0.02 0.08 1.03_* 0.00 0.02
## 175 0.14 -0.19 0.09 -0.21 1.03_* 0.02 0.03_*
## 210 -0.14 0.15 -0.10 -0.22 0.97_* 0.02 0.01
## 270 -0.03 0.05 -0.03 0.06 1.03_* 0.00 0.02
## 298 -0.06 0.06 -0.09 -0.15 0.97_* 0.01 0.00
## 314 -0.05 0.04 0.02 -0.05 1.03_* 0.00 0.02_*
## 353 -0.02 0.03 0.09 0.15 0.97_* 0.01 0.00
## 357 0.02 -0.02 0.02 -0.03 1.03_* 0.00 0.02
## 368 0.26 -0.23 -0.11 0.27_* 1.01 0.02 0.02_*
## 377 0.14 -0.15 0.12 0.24 0.95_* 0.02 0.01
## 384 0.00 0.00 0.00 0.00 1.02_* 0.00 0.02
## 387 -0.03 0.04 -0.03 0.05 1.02_* 0.00 0.02
## 396 -0.05 0.05 0.08 0.14 0.98_* 0.01 0.00
# Simulate 100 observations where y is 2^x plus standard normal noise, then
# report the sum of squares of x.
# NOTE(review): no seed is set before these draws, so the values change from
# run to run.
x <- rnorm(n = 100)
y <- 2^x + rnorm(n = 100)
sum(x * x)
## [1] 104.7884
# Sum of squares of y, for comparison with the sum of squares of x.
sum(y * y)
## [1] 339.4378
# Regress x on y; the `- 1` in the formula removes the intercept.
lm(formula = x ~ y - 1)
##
## Call:
## lm(formula = x ~ y - 1)
##
## Coefficients:
## y
## 0.3314
# Regress y on x, again without an intercept.
lm(formula = y ~ x - 1)
##
## Call:
## lm(formula = y ~ x - 1)
##
## Coefficients:
## x
## 1.074
# Seeded simulation in which y is a shuffled, sign-flipped copy of x, so both
# vectors have exactly the same sum of squares.
set.seed(1)
x <- rnorm(n = 100)
y <- -sample(x)
sum(x * x)
## [1] 81.05509
# Sum of squares of y; it matches that of x because y only reorders and
# negates the values of x.
sum(y * y)
## [1] 81.05509
# No-intercept regression of x on y.
lm(formula = x ~ y - 1)
##
## Call:
## lm(formula = x ~ y - 1)
##
## Coefficients:
## y
## 0.07768
# No-intercept regression of y on x.
lm(formula = y ~ x - 1)
##
## Call:
## lm(formula = y ~ x - 1)
##
## Coefficients:
## x
## 0.07768