# Load the built-in mtcars data set and preview its first rows.
# attach() is intentionally not used: the statements in this analysis reach
# columns through the data frame (`data = mtcars`, `mtcars$hp`), and attaching
# puts a second `mpg` on the search path that collides with the `mpg` data set
# shipped by ggplot2.
data(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Per-column descriptive statistics (min, quartiles, median, mean, max).
mtcars |> summary()
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.94 loaded
# Pairwise correlations between all mtcars columns (cor() default method),
# stored for the correlation plot and printed in full.
corr_matrix <- mtcars |> cor()
corr_matrix |> print()
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
# Draw the upper triangle of the correlation matrix as circles, with the
# variables ordered by hierarchical clustering and labels in black at 45 degrees.
corrplot(
  corr_matrix,
  method = "circle",
  type   = "upper",
  order  = "hclust",
  tl.col = "black",
  tl.srt = 45
)
The variables most strongly associated with mpg are wt, cyl, and disp.
Now let’s look at these variables graphically.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
##
## mpg
# Scatter plot of fuel economy against vehicle weight.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  ggtitle("Weight vs Miles per Gallon") +
  xlab("Weight (1000 lbs)") +
  ylab("Miles per Gallon")
# Box plots of fuel economy for each cylinder count; cyl is converted to a
# factor so it is treated as a discrete group and used for the fill colour.
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(aes(fill = factor(cyl))) +
  labs(
    title = "Miles per Gallon by Number of Cylinders",
    x = "Number of Cylinders",
    y = "Miles per Gallon"
  )
# Scatter plot of fuel economy against engine displacement, with the points
# coloured by displacement as well.
# NOTE(review): the colour legend is titled "Index" although it encodes disp;
# confirm that this label is intended.
ggplot(mtcars, aes(x = disp, y = mpg)) +
  geom_point(aes(color = disp)) +
  labs(
    title = "Displacement vs. Miles per Gallon",
    x = "Displacement (cubic inches)",
    y = "Miles per Gallon"
  ) +
  scale_color_continuous(name = "Index")
Increases in weight, number of cylinders, and displacement are each associated with a decrease in miles per gallon.
# Count the missing values in each column.
mtcars |> is.na() |> colSums()
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
#There is no missing data.
# One box per column on a shared axis; las = 2 turns the axis labels
# perpendicular to the axis and cex.axis shrinks them so all names fit.
boxplot(
  mtcars,
  las = 2,
  cex.axis = 0.6
)
# Baseline model: mpg regressed on weight, displacement, cylinders and
# horsepower using every row of mtcars, followed by the coefficient table.
mtcars_lm <- lm(
  formula = mpg ~ wt + disp + cyl + hp,
  data = mtcars
)
mtcars_lm |> summary()
##
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0562 -1.4636 -0.4281 1.2854 5.8269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.82854 2.75747 14.807 1.76e-14 ***
## wt -3.85390 1.01547 -3.795 0.000759 ***
## disp 0.01160 0.01173 0.989 0.331386
## cyl -1.29332 0.65588 -1.972 0.058947 .
## hp -0.02054 0.01215 -1.691 0.102379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.513 on 27 degrees of freedom
## Multiple R-squared: 0.8486, Adjusted R-squared: 0.8262
## F-statistic: 37.84 on 4 and 27 DF, p-value: 1.061e-10
# In this model only wt is significant at the 5% level (p < 0.001); cyl is marginal (p = 0.059), and disp and hp are not significant.
# In-sample mean squared error: the average squared residual of mtcars_lm.
mean(residuals(mtcars_lm)^2)
## [1] 5.326386
# The mean squared error (MSE) is 5.326386.
Looking through the data, the displacement variable has the largest range. Let’s winsorize this variable and also truncate hp to remove the large outlier.
# Build the outlier-treated copy of mtcars used by the second set of models.

# Step 1: truncate hp by dropping every car with more than 300 hp.
mt_cars_sorted <- mtcars[mtcars$hp <= 300, ]

# Step 2: winsorize disp at the 5th and 95th percentiles of the remaining rows.
lower_bound_disp <- quantile(mt_cars_sorted$disp, 0.05, na.rm = TRUE)
upper_bound_disp <- quantile(mt_cars_sorted$disp, 0.95, na.rm = TRUE)

# The clamping masks used to be computed from `mt_cars_sorted$tax`, a column
# mtcars does not have; the comparison produced an empty mask, so disp was
# never modified. Clamp disp against its own bounds instead.
# NOTE: console output recorded further down for models fitted on
# mt_cars_sorted was produced before this fix and needs to be regenerated.
mt_cars_sorted$disp <- pmin(
  pmax(mt_cars_sorted$disp, lower_bound_disp),
  upper_bound_disp
)
Now let’s run the same linear regression as before.
# Same specification as the baseline model, fitted on the outlier-treated
# data frame, followed by the coefficient table.
mtcars_lm2 <- lm(
  formula = mpg ~ wt + disp + cyl + hp,
  data = mt_cars_sorted
)
mtcars_lm2 |> summary()
##
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp, data = mt_cars_sorted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9483 -1.2877 -0.3379 0.7952 5.7201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.13342 2.78171 14.787 3.6e-14 ***
## wt -3.91774 1.01973 -3.842 0.000705 ***
## disp 0.01502 0.01229 1.221 0.232863
## cyl -1.20061 0.66446 -1.807 0.082366 .
## hp -0.03123 0.01661 -1.879 0.071432 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.518 on 26 degrees of freedom
## Multiple R-squared: 0.8501, Adjusted R-squared: 0.827
## F-statistic: 36.86 on 4 and 26 DF, p-value: 2.325e-10
# In-sample mean squared error: the average squared residual of mtcars_lm2.
mean(residuals(mtcars_lm2)^2)
## [1] 5.315611
# The mean squared error (MSE) is 5.315611.
Lastly, let’s rerun these two regressions with a cyl * hp interaction term to see whether the effect of horsepower on mpg depends on the number of cylinders.
# Refit the full-data model with a cylinders-by-horsepower interaction.
# `cyl * hp` already expands to cyl + hp + cyl:hp, so listing the main effects
# as well is redundant but harmless. This overwrites the earlier mtcars_lm.
mtcars_lm <- lm(
  formula = mpg ~ wt + disp + cyl + hp + cyl * hp,
  data = mtcars
)
mtcars_lm |> summary()
##
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp + cyl * hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.395 -1.426 -0.612 1.219 4.312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.164411 5.031802 10.367 9.96e-11 ***
## wt -3.302628 0.946204 -3.490 0.00174 **
## disp 0.003063 0.011144 0.275 0.78561
## cyl -2.780502 0.826175 -3.366 0.00238 **
## hp -0.159849 0.054760 -2.919 0.00716 **
## cyl:hp 0.018380 0.007077 2.597 0.01527 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.281 on 26 degrees of freedom
## Multiple R-squared: 0.8798, Adjusted R-squared: 0.8567
## F-statistic: 38.07 on 5 and 26 DF, p-value: 3.697e-11
# Interaction model fitted on the outlier-treated data frame; this overwrites
# the earlier mtcars_lm2. The coefficient table is printed afterwards.
mtcars_lm2 <- lm(
  formula = mpg ~ wt + disp + cyl + hp + cyl * hp,
  data = mt_cars_sorted
)
mtcars_lm2 |> summary()
##
## Call:
## lm(formula = mpg ~ wt + disp + cyl + hp + cyl * hp, data = mt_cars_sorted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3926 -1.4547 -0.7343 1.2200 4.3282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.959186 5.294022 9.815 4.67e-10 ***
## wt -3.325420 0.975411 -3.409 0.00222 **
## disp 0.003811 0.012326 0.309 0.75972
## cyl -2.731965 0.897450 -3.044 0.00543 **
## hp -0.158458 0.056522 -2.803 0.00963 **
## cyl:hp 0.017967 0.007682 2.339 0.02764 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.326 on 25 degrees of freedom
## Multiple R-squared: 0.877, Adjusted R-squared: 0.8524
## F-statistic: 35.65 on 5 and 25 DF, p-value: 1.343e-10
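The cyl:hp interaction term is statistically significant at the 5% level in both models (p = 0.015 and p = 0.028), which suggests that the effect of horsepower on mpg depends on the number of cylinders.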