# Import the mpg workbook and convert the imported table to a base data.frame
# in a single expression, then report the class of the result.
mpgds <- as.data.frame(
  read_xlsx("C:/Users/justt/Desktop/School/621/Assignment/Homework 7/mpg.xlsx")
)
class(mpgds)
## [1] "data.frame"
str(mpgds)
## 'data.frame': 398 obs. of 2 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
# Baseline fit: regress mpg on every other column of mpgds (per the str()
# output, that is only acceleration), then print the fit summary.
model1 <- lm(formula = mpg ~ ., data = mpgds)
summary(model1)
##
## Call:
## lm(formula = mpg ~ ., data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.007 -5.636 -1.242 4.758 23.192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9698 2.0432 2.432 0.0154 *
## acceleration 1.1912 0.1292 9.217 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.101 on 396 degrees of freedom
## Multiple R-squared: 0.1766, Adjusted R-squared: 0.1746
## F-statistic: 84.96 on 1 and 396 DF, p-value: < 2.2e-16
The model’s adjusted R-squared is 0.1746.
# Run boxcox() on the baseline fit to choose a response transformation, then
# refit with log(mpg) as the response and print the new summary.
boxcox(model1)
model3 <- lm(formula = I(log(mpg)) ~ ., data = mpgds)
summary(model3)
##
## Call:
## lm(formula = I(log(mpg)) ~ ., data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06515 -0.23641 -0.00943 0.23576 0.79343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.24656 0.08759 25.648 <2e-16 ***
## acceleration 0.05491 0.00554 9.911 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3044 on 396 degrees of freedom
## Multiple R-squared: 0.1987, Adjusted R-squared: 0.1967
## F-statistic: 98.23 on 1 and 396 DF, p-value: < 2.2e-16
I also applied unit normal scaling to my original model (the code is hidden from printing in the RMD), and it did not change the adjusted R-squared. I then ran the Box-Cox procedure on the original model; the estimated lambda was closest to 0, which indicates a log transformation of the response. Using this information, I created model3, which increased the adjusted R-squared from 0.1746 to 0.1967.
# Scatterplot matrix of all columns in mpgds, used to judge linearity by eye.
pairs(x = mpgds)
No, these do not appear to be linearly related, so applying a transformation looks worthwhile.
# Candidate 1: log(mpg) on acceleration plus a square-root term.
model4 <- lm(
  formula = I(log(mpg)) ~ acceleration + I(sqrt(acceleration)),
  data = mpgds
)
summary(model4)
##
## Call:
## lm(formula = I(log(mpg)) ~ acceleration + I(sqrt(acceleration)),
## data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06339 -0.22731 0.00077 0.21655 0.77214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.38791 1.23506 -1.933 0.053897 .
## acceleration -0.24561 0.08008 -3.067 0.002310 **
## I(sqrt(acceleration)) 2.36968 0.62997 3.762 0.000194 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2995 on 395 degrees of freedom
## Multiple R-squared: 0.2265, Adjusted R-squared: 0.2225
## F-statistic: 57.82 on 2 and 395 DF, p-value: < 2.2e-16
# Candidate 2: log(mpg) on acceleration plus a quadratic term.
model5 <- lm(
  formula = I(log(mpg)) ~ acceleration + I(acceleration^2),
  data = mpgds
)
summary(model5)
##
## Call:
## lm(formula = I(log(mpg)) ~ acceleration + I(acceleration^2),
## data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07126 -0.22527 -0.00066 0.21838 0.77803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.023320 0.331575 3.086 0.002170 **
## acceleration 0.213095 0.041764 5.102 5.22e-07 ***
## I(acceleration^2) -0.004959 0.001298 -3.820 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2993 on 395 degrees of freedom
## Multiple R-squared: 0.2273, Adjusted R-squared: 0.2234
## F-statistic: 58.1 on 2 and 395 DF, p-value: < 2.2e-16
# Candidate 3: log(mpg) on acceleration plus a reciprocal term.
model6 <- lm(
  formula = I(log(mpg)) ~ acceleration + I(1/acceleration),
  data = mpgds
)
summary(model6)
##
## Call:
## lm(formula = I(log(mpg)) ~ acceleration + I(1/acceleration),
## data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05749 -0.22920 0.00108 0.22127 0.76895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.26294 0.58605 7.274 1.89e-12 ***
## acceleration -0.01068 0.01963 -0.544 0.58682
## I(1/acceleration) -14.99800 4.31148 -3.479 0.00056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3002 on 395 degrees of freedom
## Multiple R-squared: 0.2226, Adjusted R-squared: 0.2186
## F-statistic: 56.54 on 2 and 395 DF, p-value: < 2.2e-16
# Candidate 4: log(mpg) on acceleration plus a log term.
model7 <- lm(
  formula = I(log(mpg)) ~ acceleration + I(log(acceleration)),
  data = mpgds
)
summary(model7)
##
## Call:
## lm(formula = I(log(mpg)) ~ acceleration + I(log(acceleration)),
## data = mpgds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06111 -0.22515 0.00151 0.21794 0.77069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.59298 1.04365 -1.526 0.127724
## acceleration -0.09011 0.03966 -2.272 0.023624 *
## I(log(acceleration)) 2.23400 0.60516 3.692 0.000254 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2997 on 395 degrees of freedom
## Multiple R-squared: 0.2255, Adjusted R-squared: 0.2215
## F-statistic: 57.49 on 2 and 395 DF, p-value: < 2.2e-16
model5, which adds the squared term (acceleration^2), was the best model, with the highest adjusted R-squared.
Per the above models, the adjusted R-squared values are: model4 = 0.2225, model5 = 0.2234, model6 = 0.2186, model7 = 0.2215.
# Build the modelling frame for the quadratic specification: log(mpg) as the
# response, plus acceleration and its square as predictors. The I() wrappers
# store the derived columns as-is, which is why str() reports class 'AsIs'.
mpgds2 <- with(
  mpgds,
  data.frame(
    mpg_box = I(log(mpg)),
    acceleration = acceleration,
    acc_trans = I(acceleration^2)
  )
)
str(mpgds2)
## 'data.frame': 398 obs. of 3 variables:
## $ mpg_box : 'AsIs' num 2.890371.... 2.708050.... 2.890371.... 2.772588.... 2.833213.... ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ acc_trans : 'AsIs' num 144 132.25 121 144 110.25 ...
# Unit normal scaling: centre every column of mpgds2 on its mean and divide by
# its standard deviation, so each column has mean 0 and sd 1.
#
# Columns are processed one at a time with lapply() instead of apply(), which
# would first coerce the whole data frame to a matrix. as.numeric() strips the
# 'AsIs' class so the arithmetic runs on plain numeric vectors and the result
# contains ordinary numeric columns.
#
# NOTE: despite the "model8" prefix, this object is a data frame, not a fit.
model8_unit_normal <- as.data.frame(
  lapply(mpgds2, function(x) {
    x <- as.numeric(x)
    (x - mean(x)) / sd(x)
  })
)
summary(model8_unit_normal)
## mpg_box acceleration acc_trans
## Min. :-2.6620 Min. :-2.74436 Min. :-2.0957
## 1st Qu.:-0.7042 1st Qu.:-0.63208 1st Qu.:-0.6629
## Median : 0.1004 Median :-0.02469 Median :-0.1093
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7829 3rd Qu.: 0.58270 3rd Qu.: 0.5075
## Max. : 2.1793 Max. : 3.34770 Max. : 4.1145
# Refit the quadratic specification on the standardised columns so that the
# coefficient magnitudes are on a common scale.
model9 <- lm(formula = mpg_box ~ ., data = model8_unit_normal)
summary(model9)
##
## Call:
## lm(formula = mpg_box ~ ., data = model8_unit_normal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.15395 -0.66322 -0.00194 0.64295 2.29063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.683e-16 4.417e-02 0.000 1.000000
## acceleration 1.730e+00 3.391e-01 5.102 5.22e-07 ***
## acc_trans -1.295e+00 3.391e-01 -3.820 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8813 on 395 degrees of freedom
## Multiple R-squared: 0.2273, Adjusted R-squared: 0.2234
## F-statistic: 58.1 on 2 and 395 DF, p-value: < 2.2e-16
# Same specification fitted on the unscaled mpgds2 columns, for comparison
# with the standardised fit.
model10 <- lm(formula = mpg_box ~ ., data = mpgds2)
summary(model10)
##
## Call:
## lm(formula = mpg_box ~ ., data = mpgds2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.07126 -0.22527 -0.00066 0.21838 0.77803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.023320 0.331575 3.086 0.002170 **
## acceleration 0.213095 0.041764 5.102 5.22e-07 ***
## acc_trans -0.004959 0.001298 -3.820 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2993 on 395 degrees of freedom
## Multiple R-squared: 0.2273, Adjusted R-squared: 0.2234
## F-statistic: 58.1 on 2 and 395 DF, p-value: < 2.2e-16
# Re-run boxcox() on the fit whose response (mpg_box) is already log(mpg).
# NOTE(review): presumably a check that no further response transformation is
# indicated - confirm against the resulting plot.
boxcox(model10)
According to unit normal scaling, acceleration has an absolute standardized estimate of 1.730 and the squared transformation of acceleration (acc_trans) has an absolute standardized estimate of 1.295. Acceleration is therefore the more influential term in predicting the Box-Cox (log) transformation of mpg.