# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Pairwise correlation matrix between every pair of columns in mtcars.
cor(mtcars)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
# Plot MPG against weight.
# The plot heading is supplied through `main`; `title` is not an argument that
# boxplot() understands, so the heading was silently never drawn.
# NOTE(review): wt is continuous, so this draws one box per distinct weight
# value; a scatter plot (plot(mpg ~ wt, data = mtcars)) may read more clearly.
boxplot(mpg ~ wt, data = mtcars,
        main = "MPG by Weight",
        xlab = "Weight",
        ylab = "MPG")
Based on the boxplot, you can see that as weight increases, MPG decreases.
# Correlation of mpg with each column of mtcars (one-row matrix).
cor(mtcars[["mpg"]], mtcars)
## mpg cyl disp hp drat wt qsec
## [1,] 1 -0.852162 -0.8475514 -0.7761684 0.6811719 -0.8676594 0.418684
## vs am gear carb
## [1,] 0.6640389 0.5998324 0.4802848 -0.5509251
?mtcars
The variables with the strongest correlation with MPG are Cylinders (negative), Displacement (negative), and Weight (negative). These three have the largest absolute correlations with MPG (each about -0.85 or stronger), which suggests they are the variables most strongly associated with fuel economy. For example, the heavier the car, the lower the MPG you will get.
# Total number of missing values across the whole data frame.
sum(is.na(mtcars))
## [1] 0
# Number of missing values in each column.
colSums(is.na(mtcars))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
There is no missing data: every column contains zero NA values.
# Show the structure of mtcars: dimensions, column types, and leading values.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Frequency counts for the columns that take only a few distinct values,
# used to check that no unexpected codes are present.
table(mtcars[["am"]])
##
## 0 1
## 19 13
table(mtcars[["vs"]])
##
## 0 1
## 18 14
table(mtcars[["gear"]])
##
## 3 4 5
## 15 12 5
table(mtcars[["carb"]])
##
## 1 2 3 4 6 8
## 7 10 3 10 1 1
There is no inconsistent or invalid data in this data set. Every column is numeric, and the coded variables (am, vs, gear, carb) take only a small set of plausible values, so nothing looks odd.
# Fit a linear model of mpg on all other columns; `.` in the formula stands
# for every column of `data` except the response.
lm_model_mpg <- lm(mpg ~ ., data = mtcars)
# Print coefficient estimates, standard errors, p-values and fit statistics.
summary(lm_model_mpg)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Each coefficient shows the expected change in MPG for a one-unit increase in the corresponding predictor, assuming all other predictors are held constant.
# Draw the diagnostic plots for the fitted model in a 2 x 2 grid.
# par() returns the previous settings, so keep them and restore them afterwards;
# otherwise every later plot in the same session would also be squeezed into
# one cell of the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(lm_model_mpg)
par(old_par)
We are assuming that the relationship is linear, that the residuals are independent of one another, that the residuals are normally distributed, and that the residuals have constant variance at every level of the predictor variables. Based on the Q-Q plot, the points fall along the line, so the residuals are approximately normally distributed. You could make the case that the “Residuals vs Fitted” plot is curved, which would mean the relationship is not perfectly linear, but overall the assumptions appear to be met.
# In-sample mean squared error of the full model: the average of its squared
# residuals (divides by the number of observations, not the residual df).
MSE_model <- mean(residuals(lm_model_mpg)^2)
print(MSE_model)
## [1] 4.609201
# Refit with a weight-by-horsepower interaction: `wt * hp` expands to
# wt + hp + wt:hp, and `.` adds every remaining column as a main effect.
model_interact <- lm(mpg ~ wt * hp + ., data = mtcars)
# Print the coefficient table, including the wt:hp interaction term.
summary(model_interact)
##
## Call:
## lm(formula = mpg ~ wt * hp + ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6129 -1.4482 0.2571 1.1179 4.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.903972 16.390539 1.702 0.104165
## wt -9.613350 2.439829 -3.940 0.000809 ***
## hp -0.140989 0.041789 -3.374 0.003018 **
## cyl 1.011371 0.941887 1.074 0.295710
## disp -0.002363 0.015716 -0.150 0.882013
## drat -0.803048 1.455063 -0.552 0.587132
## qsec 0.744333 0.611042 1.218 0.237347
## vs 0.133431 1.759111 0.076 0.940291
## am -0.725300 1.999043 -0.363 0.720543
## gear 2.907613 1.434933 2.026 0.056279 .
## carb -0.512939 0.699359 -0.733 0.471800
## wt:hp 0.036219 0.011403 3.176 0.004746 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.865
## F-statistic: 19.06 on 11 and 20 DF, p-value: 3.046e-08
The WT:HP interaction term is statistically significant (p ≈ 0.0047), which suggests that the effect of weight on MPG depends on HP.
# R-squared of the additive model.
summary(lm_model_mpg)[["r.squared"]]
## [1] 0.8690158
# R-squared of the model that adds the wt:hp interaction.
summary(model_interact)[["r.squared"]]
## [1] 0.9129353
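The model with the higher R^2 is the interaction model (0.913 vs. 0.869), which means that adding the weight-by-HP interaction lets the model explain more of the variance in MPG by capturing a relationship the additive model misses. Because R^2 can never decrease when a term is added, the adjusted R^2 is the fairer comparison; it also favors the interaction model (0.865 vs. 0.807), which suggests it could have more predictive power.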
# Draw one box per column of mtcars to look for outliers.
# NOTE(review): all columns share a single axis even though their scales
# differ widely, so outliers in small-scale columns may be hard to see.
boxplot(mtcars)
Yes, there are outliers: 1 in HP, 1 in QSEC, 1 in WT, and 1 in CARB.
# Limit the influence of extreme horsepower values and refit the full model.
library(DescTools)

# Work on a copy so the original mtcars data stays untouched.
mtcars_win <- mtcars
# Winsorize hp at its 5th and 95th percentiles. Use TRUE rather than the
# reassignable alias T for na.rm.
hp_limits <- quantile(mtcars$hp, probs = c(0.05, 0.95), na.rm = TRUE)
mtcars_win$hp <- Winsorize(mtcars_win$hp, val = hp_limits)
# Same specification as the original full model, fitted to the winsorized data.
HP_outlier_lm <- lm(mpg ~ ., data = mtcars_win)
summary(HP_outlier_lm)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars_win)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4440 -1.5389 -0.1908 1.1157 4.5556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.52194 18.89881 0.768 0.4508
## cyl -0.13067 1.03657 -0.126 0.9009
## disp 0.01304 0.01737 0.751 0.4609
## hp -0.02675 0.02533 -1.056 0.3031
## drat 1.07929 1.63034 0.662 0.5152
## wt -3.47885 1.84405 -1.887 0.0731 .
## qsec 0.69142 0.75049 0.921 0.3674
## vs 0.18045 2.05887 0.088 0.9310
## am 2.23794 2.05534 1.089 0.2886
## gear 0.56689 1.48310 0.382 0.7061
## carb -0.31407 0.76341 -0.411 0.6849
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.642 on 21 degrees of freedom
## Multiple R-squared: 0.8699, Adjusted R-squared: 0.8079
## F-statistic: 14.04 on 10 and 21 DF, p-value: 3.559e-07
# In-sample mean squared error of the model fitted on the winsorized data:
# the average of its squared residuals.
HP_outlier_MSE <- mean(residuals(HP_outlier_lm)^2)
print(HP_outlier_MSE)
## [1] 4.579838
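An improved R^2 cannot promise better prediction: R^2 measures how well the model fits the data it was trained on, so predictive performance should be checked on data the model has not seen.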