Data Exploration
# Load the built-in mtcars data set and keep a working copy named `df`.
data("mtcars")
df <- mtcars
# Print the type and leading values of every column.
str(object = df)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Per-column summary statistics (min, quartiles, median, mean, max) for df.
summary(df)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
Correlation with mpg
# Correlation of every column with mpg, ordered from largest to smallest.
correlations <- local({
  mpg_column <- cor(df)[, "mpg"]
  sort(mpg_column, decreasing = TRUE)
})
correlations
## mpg drat vs am gear qsec carb
## 1.0000000 0.6811719 0.6640389 0.5998324 0.4802848 0.4186840 -0.5509251
## hp disp cyl wt
## -0.7761684 -0.8475514 -0.8521620 -0.8676594
Data Processing
# Total number of missing cells in df (a count, despite the variable's name).
any_NA <- sum(is.na(df))
# Logical check: does df contain any missing value at all?
anyNA(df)
## [1] FALSE
# any_NA holds a single count, so print it directly; summary() of a
# length-one value only repeats that same number six times.
any_NA
## [1] 0
Build Model
# Baseline linear model: mpg regressed on every other column of df.
model <- lm(formula = mpg ~ ., data = df)
model
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Coefficients:
## (Intercept) cyl disp hp drat wt
## 12.30337 -0.11144 0.01334 -0.02148 0.78711 -3.71530
## qsec vs am gear carb
## 0.82104 0.31776 2.52023 0.65541 -0.19942
# Coefficient table, residual summary and fit statistics for the baseline model.
summary(model)
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
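The overall model is significant (F-statistic p-value = 3.793e-07), but no
individual coefficient is significant at the 5% level (wt is closest, p = 0.0633)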
Negative coefficients indicate that mpg decreases as the predictor
increases (for example, higher-powered cars are less fuel-efficient)
Positive coefficients indicate that mpg increases as the predictor
increases, i.e. a higher efficiency rating
Model Assumptions
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics parameters so later plots are not forced into that layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Model Interaction w/ Other Variables
# Extend the baseline with a weight-by-horsepower interaction: `wt * hp`
# contributes wt, hp and wt:hp; the other predictors enter additively.
model_interaction <- lm(
  formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + am + gear + carb,
  data = df
)
summary(model_interaction)
##
## Call:
## lm(formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs +
## am + gear + carb, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6129 -1.4482 0.2571 1.1179 4.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.903972 16.390539 1.702 0.104165
## wt -9.613350 2.439829 -3.940 0.000809 ***
## hp -0.140989 0.041789 -3.374 0.003018 **
## cyl 1.011371 0.941887 1.074 0.295710
## disp -0.002363 0.015716 -0.150 0.882013
## drat -0.803048 1.455063 -0.552 0.587132
## qsec 0.744333 0.611042 1.218 0.237347
## vs 0.133431 1.759111 0.076 0.940291
## am -0.725300 1.999043 -0.363 0.720543
## gear 2.907613 1.434933 2.026 0.056279 .
## carb -0.512939 0.699359 -0.733 0.471800
## wt:hp 0.036219 0.011403 3.176 0.004746 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.214 on 20 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.865
## F-statistic: 19.06 on 11 and 20 DF, p-value: 3.046e-08
Outlier
# Flag observations whose standardized residual exceeds 2 in absolute value.
outliers <- local({
  std_resid <- rstandard(model_interaction)
  which(abs(std_resid) > 2)
})
outliers
## Fiat 128
## 18
# Show the flagged rows from df, the data frame the model was fitted on, so
# the indices and the data always refer to the same object.
df[outliers, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1
Winsorization
# Winsorize mpg on a copy of df: values below the 1st percentile or above
# the 99th percentile are pulled in to those percentile values.
df_winsor <- df
lower <- quantile(df_winsor$mpg, probs = 0.01)
upper <- quantile(df_winsor$mpg, probs = 0.99)
df_winsor$mpg <- replace(df_winsor$mpg, df_winsor$mpg < lower, lower)
df_winsor$mpg <- replace(df_winsor$mpg, df_winsor$mpg > upper, upper)
# Refit the same interaction formula on the winsorized data.
model_int_wins <- lm(
  formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs + am + gear + carb,
  data = df_winsor
)
summary(model_int_wins)
##
## Call:
## lm(formula = mpg ~ wt * hp + cyl + disp + drat + qsec + vs +
## am + gear + carb, data = df_winsor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.598 -1.399 0.260 1.136 4.173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.444058 16.195810 1.756 0.09435 .
## wt -9.472667 2.410842 -3.929 0.00083 ***
## hp -0.139227 0.041293 -3.372 0.00303 **
## cyl 0.965237 0.930697 1.037 0.31206
## disp -0.002245 0.015529 -0.145 0.88648
## drat -0.801740 1.437776 -0.558 0.58329
## qsec 0.708513 0.603782 1.173 0.25440
## vs 0.143969 1.738212 0.083 0.93481
## am -0.760277 1.975293 -0.385 0.70438
## gear 2.889049 1.417885 2.038 0.05504 .
## carb -0.509644 0.691050 -0.737 0.46939
## wt:hp 0.035629 0.011267 3.162 0.00490 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.188 on 20 degrees of freedom
## Multiple R-squared: 0.914, Adjusted R-squared: 0.8667
## F-statistic: 19.33 on 11 and 20 DF, p-value: 2.698e-08
Does an Improved R^2 Improve the Model?
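Yes, it does improve the model: a higher R-squared means the model
explains more of the variation in mpg, and it allows models to be
compared against each other, which aids interpretation. Because
R-squared never decreases when terms are added, the adjusted R-squared
is the fairer comparison; it also rises here (0.8066 -> 0.865 -> 0.8667)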