library(ISLR2)
# Exercise text fragment: "... Auto data set."
# Scatterplot matrix of all variables in Auto.
pairs(Auto)
# Exercise text fragment: "... name variable, which is qualitative."
# List the column names of Auto (the output below shows `name` is the ninth).
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
# Correlation matrix of the first eight columns of Auto, i.e. every column
# except the ninth one (`name`).
cor(x = Auto[, 1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
# Regress mpg on every other column of Auto except `name`, then print the
# coefficient table and fit statistics.
lm.fit <- lm(formula = mpg ~ . - name, data = Auto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
Yes. The overall F-statistic is 252.4 with a p-value < 2.2e-16 (far below 0.05), so there is a relationship between the predictors and the response variable.
Displacement, weight, year and origin have a statistically significant relationship with mpg.
year variable
suggest?
The year coefficient is positive (about 0.75): holding the other predictors fixed, mpg increases by roughly 0.75 for each additional model year.
# Lay the diagnostic plots for lm.fit out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(lm.fit)
# Report which observation has the largest hat (leverage) value.
leverage <- hatvalues(lm.fit)
which.max(leverage)
## 14
## 14
The plot of residuals versus fitted values indicates non-linearity in the data. The plot of standardized residuals versus leverage indicates some outliers (standardized residuals above 2 or below -2) and one high-leverage point, observation 14.
# Model containing only the two interaction terms (no main effects), fitted
# on the first eight columns of Auto.
lm.fit1 <- lm(
  formula = mpg ~ +cylinders:displacement + displacement:weight,
  data = Auto[, 1:8]
)
summary(lm.fit1)
##
## Call:
## lm(formula = mpg ~ +cylinders:displacement + displacement:weight,
## data = Auto[, 1:8])
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5522 -3.3109 -0.6592 2.7231 17.4894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.126e+01 3.884e-01 80.47 < 2e-16 ***
## cylinders:displacement -7.481e-04 1.335e-03 -0.56 0.576
## displacement:weight -1.041e-05 2.546e-06 -4.09 5.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.767 on 389 degrees of freedom
## Multiple R-squared: 0.6288, Adjusted R-squared: 0.6269
## F-statistic: 329.5 on 2 and 389 DF, p-value: < 2.2e-16
# Same two interactions, but written with `*` so the main effects of
# cylinders, displacement and weight are included as well.
lm.fit2 <- lm(
  formula = mpg ~ cylinders * displacement + displacement * weight,
  data = Auto[, 1:8]
)
summary(lm.fit2)
##
## Call:
## lm(formula = mpg ~ cylinders * displacement + displacement *
## weight, data = Auto[, 1:8])
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.2934 -2.5184 -0.3476 1.8399 17.7723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.262e+01 2.237e+00 23.519 < 2e-16 ***
## cylinders 7.606e-01 7.669e-01 0.992 0.322
## displacement -7.351e-02 1.669e-02 -4.403 1.38e-05 ***
## weight -9.888e-03 1.329e-03 -7.438 6.69e-13 ***
## cylinders:displacement -2.986e-03 3.426e-03 -0.872 0.384
## displacement:weight 2.128e-05 5.002e-06 4.254 2.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.103 on 386 degrees of freedom
## Multiple R-squared: 0.7272, Adjusted R-squared: 0.7237
## F-statistic: 205.8 on 5 and 386 DF, p-value: < 2.2e-16
The interaction between displacement and weight is statistically significant (p-value < 0.05), while the interaction between cylinders and displacement is not (p-value > 0.05)
# Refit the full model with a log-transformed response.
lm.fit3 <- lm(formula = log(mpg) ~ . - name, data = Auto)
summary(lm.fit3)
##
## Call:
## lm(formula = log(mpg) ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40955 -0.06533 0.00079 0.06785 0.33925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.751e+00 1.662e-01 10.533 < 2e-16 ***
## cylinders -2.795e-02 1.157e-02 -2.415 0.01619 *
## displacement 6.362e-04 2.690e-04 2.365 0.01852 *
## horsepower -1.475e-03 4.935e-04 -2.989 0.00298 **
## weight -2.551e-04 2.334e-05 -10.931 < 2e-16 ***
## acceleration -1.348e-03 3.538e-03 -0.381 0.70339
## year 2.958e-02 1.824e-03 16.211 < 2e-16 ***
## origin 4.071e-02 9.955e-03 4.089 5.28e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1191 on 384 degrees of freedom
## Multiple R-squared: 0.8795, Adjusted R-squared: 0.8773
## F-statistic: 400.4 on 7 and 384 DF, p-value: < 2.2e-16
# Diagnostic plots for the log(mpg) model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(x = lm.fit3)
In the log(mpg) model, all predictors are statistically significant at the 5% level except acceleration.
# Exercise text fragment: "... Carseats data set."
# NOTE(review): attach() is kept so that any later code relying on the
# attached Carseats columns keeps working; prefer passing `data = Carseats`.
attach(Carseats)
# Exercise text fragment: "... Sales using Price, Urban, and US."
# Fit Sales on Price, Urban and US, naming the data frame explicitly so the
# model does not depend on what is on the search path.
mult_regression <- lm(Sales ~ Price + Urban + US, data = Carseats)
summary(mult_regression)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
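Urban has no statistically significant effect on Sales (p-value = 0.936), so Price and US are the two variables useful for predicting Sales. Sales is measured in thousands of units, so a $1 increase in Price is associated with a decrease of about 54 units sold (0.054459 thousand), holding the other predictors fixed. Stores inside the US sell about 1,201 more units on average than stores outside the US.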
$$\widehat{\text{Sales}} = 13.043469 - 0.054459\,X_{\text{Price}} - 0.021916\,X_{\text{UrbanYes}} + 1.200573\,X_{\text{USYes}}$$
Price and US, because their p-values are below 0.05.
# Smaller model: Sales on Price and US only. The data frame is passed
# explicitly so the fit does not depend on Carseats having been attach()ed.
mult_regression <- lm(Sales ~ Price + US, data = Carseats)
summary(mult_regression)
##
## Call:
## lm(formula = Sales ~ Price + US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
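Not well: both models have an R-squared of only about 0.24, so they explain roughly 24% of the variance in Sales.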
# 95% confidence intervals for the coefficients of mult_regression.
confint(mult_regression, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
# Diagnostic plots for mult_regression on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(x = mult_regression)
The coefficient estimate for the regression of X onto Y (without an intercept) equals the one for the regression of Y onto X when $\sum_i x_i^2 = \sum_i y_i^2$.
# Simulated example in which sum(x^2) and sum(y^2) differ.
set.seed(1)
x <- rnorm(n = 100)
noise <- rnorm(n = 100)
y <- 2 * x + noise
sum(x^2)
## [1] 81.05509
sum(y^2)
## [1] 413.2135
# Regress x onto y with the intercept suppressed (`+ 0`).
fit.X <- lm(formula = x ~ y + 0)
summary(fit.X)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699 -0.2368 0.1030 0.2858 0.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.39111 0.02089 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
# Reverse regression: y onto x, again with the intercept suppressed.
fit.Y <- lm(formula = y ~ x + 0)
summary(fit.Y)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9154 -0.6472 -0.1771 0.5056 2.3109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.9939 0.1065 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
# Simulated example in which sum(x^2) and sum(y^2) are equal: y is x itself.
set.seed(1)
x <- rnorm(n = 100)
y <- 1 * x
sum(x^2)
## [1] 81.05509
sum(y^2)
## [1] 81.05509
# Regress x onto y with the intercept suppressed (`+ 0`).
fit.X <- lm(formula = x ~ y + 0)
summary(fit.X)
## Warning in summary.lm(fit.X): essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.888e-16 -1.689e-17 1.339e-18 3.057e-17 2.552e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 1.000e+00 6.479e-18 1.543e+17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.833e-17 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.382e+34 on 1 and 99 DF, p-value: < 2.2e-16
# Reverse regression: y onto x, with the intercept suppressed.
fit.Y <- lm(formula = y ~ x + 0)
summary(fit.Y)
## Warning in summary.lm(fit.Y): essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.888e-16 -1.689e-17 1.339e-18 3.057e-17 2.552e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.000e+00 6.479e-18 1.543e+17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.833e-17 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.382e+34 on 1 and 99 DF, p-value: < 2.2e-16