# Scatterplot matrix of every pair of columns in Auto.
pairs(Auto)
# Column names of the full data set.
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
# Column names of the first eight columns (everything except "name").
names(Auto[1:8])
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin"
# Pairwise correlation matrix of the first eight columns.
cor(Auto[1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# Keep every column except `name`, so that `mpg ~ .` uses only the remaining
# variables as predictors. Selecting by name instead of by position avoids
# silently picking the wrong columns if the column order ever changes.
data <- Auto[, setdiff(names(Auto), "name"), drop = FALSE]
# Regress mpg on all other columns of `data` and print the fit summary.
model <- lm(mpg ~ ., data = data)
summary(model)
##
## Call:
## lm(formula = mpg ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# Split the plotting device into a 2 x 2 grid so the diagnostic plots that
# plot() draws for the fitted model appear on one page.
par(mfrow = c(2, 2))
plot(x = model, pch = 16)
- From the residuals vs. fitted plot we can see that the residuals do not “bounce randomly” around zero but show a non-linear pattern; this suggests that the relationship is non-linear.
- There are a few residuals that stand out from the pattern, which suggests there are outliers.
- The variance of the residuals does not seem to be constant.
# Fit mpg with interaction terms between the predictors in `data`
# (the summary lists each predictor and every pairwise product term).
lm.auto2 <- lm(formula = mpg ~ .:., data = data)
summary(lm.auto2)
##
## Call:
## lm(formula = mpg ~ .:., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
Interaction terms that are statistically significant at the 0.05 level:
– displacement:year
– acceleration:year
– acceleration:origin
# Scatter plots of mpg against three transformations of displacement
# (square root, square, natural log) to compare how linear each one looks.
plot(x = sqrt(data$displacement), y = data$mpg, pch = 16, col = "red")
plot(x = (data$displacement)^2, y = data$mpg, pch = 16, col = "red")
plot(x = log(data$displacement), y = data$mpg, pch = 16, col = "red")
It looks like the log transformation gives the most linear-looking scatter plot.
# Load the Carseats data set into the workspace.
data("Carseats")
# attach() puts the columns of Carseats on the search path so they can be
# referenced by bare name (e.g. Sales instead of Carseats$Sales).
attach(Carseats)
# Column names of the data set.
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
# Compact overview of each column's type and first few values.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Regress Sales on Price, Urban and US. The data frame is passed explicitly
# so the fit does not depend on Carseats being attach()ed to the search path.
lm.carseats <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(lm.carseats)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
For Urban: 0 = store in a non-urban location, 1 = store in an urban location. The negative coefficient indicates that urban stores sell fewer car seats than non-urban stores, holding Price and US fixed (although this effect is not statistically significant, p = 0.936).
For US: 0 = non-US store, 1 = US store. The positive coefficient indicates that US stores sell more car seats than non-US stores, holding Price and Urban fixed.
Coefficient and statistical Significance:
Price (-0.054459) - Sales drop by about 54 units for each dollar increase in Price (Sales is recorded in thousands of units) - Statistically significant
UrbanYes (-0.021916) - Sales are about 22 units lower for urban locations - Not statistically significant
USYes (1.200573) - Sales are about 1,201 units higher for US locations - Statistically significant
# Dummy coding used for the Urban factor ("No" is the reference level, coded 0).
# The column is qualified with Carseats$ so the call works without attach().
contrasts(Carseats$Urban)
## Yes
## No 0
## Yes 1
# Dummy coding used for the US factor ("No" is the reference level, coded 0).
contrasts(Carseats$US)
## Yes
## No 0
## Yes 1
\[ Sales_i=\beta_0+\beta_1Price_i+ \begin{cases} \beta_2+\beta_3 & Urban=Yes, US=Yes \\ \beta_2 & Urban=Yes, US=No\\ \beta_3 & Urban=No, US=Yes \\ 0 & Urban=No, US=No \end{cases} \]
I have used this line of code from online repo to represent the function
We can reject the null hypothesis (coefficient equal to zero) for Price and US, as their p-values are less than 0.05; we cannot reject it for Urban (p = 0.936).
# Refit using only the predictors that were significant (Price and US).
# The data frame is passed explicitly so the fit does not depend on
# Carseats being attach()ed to the search path.
lm.carseats2 <- lm(Sales ~ Price + US, data = Carseats)
summary(lm.carseats2)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# F-test comparing the model that includes Urban against the nested model
# without it.
anova(lm.carseats,lm.carseats2)
## Analysis of Variance Table
##
## Model 1: Sales ~ Price + Urban + US
## Model 2: Sales ~ Price + US
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 396 2420.8
## 2 397 2420.9 -1 -0.03979 0.0065 0.9357
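The second (smaller) model fits the data essentially as well as the first: the ANOVA F-test (p = 0.9357) shows that dropping Urban does not significantly worsen the fit, and the second model has a slightly higher adjusted R-squared (0.2354 vs 0.2335) and a slightly lower residual standard error (2.469 vs 2.472).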
# 95% confidence intervals for the coefficients of the Price + US model.
confint(lm.carseats2)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# Split the plotting device into a 2 x 2 grid and draw the diagnostic
# plots for the Price + US model on one page.
par(mfrow = c(2, 2))
plot(x = lm.carseats2)
No evidence of outliers.
But there is a high-leverage point.
#library(car)
#leveragePlots(lm.carseats2)
# Simulate 100 observations where y is 2 * x plus standard-normal noise.
# The seed is fixed so the printed results are reproducible.
set.seed(100)
x <- rnorm(n = 100)
y <- 2 * x + rnorm(n = 100)
# Regress x on y (with an intercept) and print the fit summary.
lm.xy <- lm(formula = x ~ y)
summary(lm.xy)
##
## Call:
## lm(formula = x ~ y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17358 -0.22120 0.02454 0.21603 1.10483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.004765 0.038751 -0.123 0.902
## y 0.452505 0.018647 24.267 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3875 on 98 degrees of freedom
## Multiple R-squared: 0.8573, Adjusted R-squared: 0.8559
## F-statistic: 588.9 on 1 and 98 DF, p-value: < 2.2e-16
# Regress y on x (with an intercept), the reverse of the x-on-y fit,
# and print the fit summary.
lm.yx <- lm(formula = y ~ x)
summary(lm.yx)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05195 -0.43265 -0.07854 0.48583 1.93858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01145 0.07929 0.144 0.885
## x 1.89463 0.07807 24.267 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7929 on 98 degrees of freedom
## Multiple R-squared: 0.8573, Adjusted R-squared: 0.8559
## F-statistic: 588.9 on 1 and 98 DF, p-value: < 2.2e-16
# Presumably intended to build a pair (x2, y2) for which the no-intercept
# regressions of y2 on x2 and of x2 on y2 give the same slope. The slopes are
# sum(x2 * y2) / sum(x2^2) and sum(x2 * y2) / sum(y2^2), so they coincide
# when sum(x2^2) == sum(y2^2); making y2 a permutation of x2 guarantees that.
x2 <- rnorm(100)
# y2 must be a permutation of x2 itself, not of the earlier vector `x`.
y2 <- sample(x2, size = length(x2), replace = FALSE)
# NOTE(review): the summaries printed below predate this fix; re-run the
# document to refresh them.
summary(lm(y2 ~ x2 + 0))
##
## Call:
## lm(formula = y2 ~ x2 + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23285 -0.74348 0.03916 0.66026 2.41761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x2 0.15408 0.09748 1.581 0.117
##
## Residual standard error: 1.008 on 99 degrees of freedom
## Multiple R-squared: 0.02462, Adjusted R-squared: 0.01476
## F-statistic: 2.498 on 1 and 99 DF, p-value: 0.1171
# Reverse regression: x2 on y2, again without an intercept (+ 0).
summary(lm(x2~y2+0))
##
## Call:
## lm(formula = x2 ~ y2 + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.00424 -0.55479 0.09887 0.67766 2.79793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y2 0.1598 0.1011 1.581 0.117
##
## Residual standard error: 1.026 on 99 degrees of freedom
## Multiple R-squared: 0.02462, Adjusted R-squared: 0.01476
## F-statistic: 2.498 on 1 and 99 DF, p-value: 0.1171