# Set the default chunk option `echo = TRUE` for all subsequent chunks.
knitr::opts_chunk$set(echo = TRUE)
This study aims to address two main questions: 1) Is an automatic or manual transmission better for MPG? 2) How large is the MPG difference between automatic and manual transmissions?
First, we generate a correlation matrix to explore the linear relationships between mpg and the other variables. The table below shows that the correlation between “mpg” and “am” is fairly high at 0.60. Other variables (for example “wt”, “cyl” and “disp”, each with a correlation stronger than -0.84) may also be helpful to explain “mpg”.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(cowplot)
library(ggplot2)
# Pairwise Pearson correlation matrix of every column in mtcars.
stats::cor(x = mtcars, method = "pearson")
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
We can explore the relationships further with a few plots. The first plot indicates that there is a big difference between automatic and manual transmissions when it comes to miles per gallon: the mean mpg for manual cars is about 24.4, while the mean for automatic cars is about 17.1. If we consider other variables and compare the mpg values, we can see that this pattern persists across the levels of those variables, as shown in the other 3 plots.
# Mean mpg per transmission group; drawn as a red point on the first boxplot.
mean_am <- mtcars %>%
  group_by(am) %>%
  summarize(mpg = mean(mpg))

# Shared x-axis labels for the transmission factor levels. Defined once so the
# four plots cannot drift apart (the original spelled the label "Mannual").
am_labels <- c("0" = "Automatic", "1" = "Manual")

# mpg by transmission, with the group means overlaid in red.
am_boxplot <- ggplot(mtcars, aes(x = factor(am), y = mpg)) +
  geom_boxplot(fill = "blue") +
  geom_point(data = mean_am, color = "red", size = 3) +
  labs(x = "") +
  scale_x_discrete(labels = am_labels)

# mpg by transmission, split by number of cylinders.
cyl_boxplot <- ggplot(mtcars, aes(x = factor(am), y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(name = "Cylinders")

# mpg by transmission, split by number of carburetors.
carb_boxplot <- ggplot(mtcars, aes(x = factor(am), y = mpg, fill = factor(carb))) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(name = "Carburetors")

# mpg by transmission, split by engine shape (vs levels labelled in order 0, 1).
vs_boxplot <- ggplot(mtcars, aes(x = factor(am), y = mpg, fill = factor(vs))) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(name = "Engine", labels = c("V-shape", "Straight"))

# Arrange the four boxplots in a 2 x 2 grid.
plot_grid(am_boxplot, cyl_boxplot, carb_boxplot, vs_boxplot, nrow = 2, ncol = 2)
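## Regression models and coefficient interpretation

After comparing the nested models below, the ANOVA shows that “cyl” along
with “am” is necessary (p < 0.001). Note that “disp” and “wt” also give
smaller but statistically significant improvements (p ≈ 0.037 and 0.015);
we nevertheless use the simpler fit2 model to conduct the regression.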
# Load the mtcars data set and preview its first six rows.
data("mtcars")
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
library(dplyr)
# Nested linear models for mpg: start with transmission only, then add
# cylinders, displacement and weight one term at a time.
fit1 <- lm(formula = mpg ~ factor(am), data = mtcars)
fit2 <- lm(formula = mpg ~ factor(am) + factor(cyl), data = mtcars)
fit3 <- lm(formula = mpg ~ factor(am) + factor(cyl) + disp, data = mtcars)
fit4 <- lm(formula = mpg ~ factor(am) + factor(cyl) + disp + wt, data = mtcars)

# Sequential F-tests: does each added term significantly reduce the RSS?
anova(fit1, fit2, fit3, fit4)
## Analysis of Variance Table
##
## Model 1: mpg ~ factor(am)
## Model 2: mpg ~ factor(am) + factor(cyl)
## Model 3: mpg ~ factor(am) + factor(cyl) + disp
## Model 4: mpg ~ factor(am) + factor(cyl) + disp + wt
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 720.90
## 2 28 264.50 2 456.40 32.4451 8.589e-08 ***
## 3 27 230.46 1 34.04 4.8391 0.03691 *
## 4 26 182.87 1 47.59 6.7663 0.01513 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table (estimate, std. error, t value, p-value) of the fit2 model.
# The element is spelled out in full: `$coef` only resolves through partial
# matching of `$coefficients`.
summary(fit2)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.801852 1.322615 18.752135 2.182425e-17
## factor(am)1 2.559954 1.297579 1.972869 5.845717e-02
## factor(cyl)6 -6.156118 1.535723 -4.008612 4.106131e-04
## factor(cyl)8 -10.067560 1.452082 -6.933187 1.546574e-07
The residual plots are shown below. In the first plot the residuals appear to have roughly constant variance, while the Q-Q plot suggests that the normality assumption may be violated. We also do not observe any clear pattern in the residuals, so the model may have captured all the systematic variance.
# Draw the lm diagnostic plots for fit2 together in a 2 x 2 grid. The original
# 1 x 1 layout is the default, so it had no effect and each diagnostic ended up
# in its own figure. The previous graphics settings are restored afterwards.
old_par <- par(mfrow = c(2, 2))
plot(fit2)
par(old_par)
## Conclusion

Both the data exploration and the linear regression show a difference
between automatic and manual transmission cars in miles per gallon. In
general, manual cars have a better mpg. The am_boxplot shows a difference
of roughly 7 mpg between the group means, while the linear regression,
after adjusting for the number of cylinders, formalizes this difference
as 2.56 mpg in favor of manual cars (p ≈ 0.058, i.e. only marginally
significant).