# Global knitr chunk option: with echo = TRUE the source code of every chunk
# is shown in the rendered report alongside its output.
knitr::opts_chunk$set(echo = TRUE)

Executive Summary

This study addresses two main questions: 1) Is an automatic or manual transmission better for MPG? 2) How large is the MPG difference between automatic and manual transmissions?

First, we generate a correlation matrix to explore the linear relationships between mpg and the other variables. The table below shows that the correlation between “mpg” and “am” is fairly high, at 0.6. Other variables may also help to explain “mpg”: for example, “wt”, “cyl” and “disp” each have an absolute correlation with “mpg” of about 0.85.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(cowplot)
library(ggplot2)

# Pairwise correlation matrix of all mtcars columns (cor() defaults to
# Pearson); the mpg row/column is the one discussed in the text.
cor(mtcars)
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

Data exploration

We can explore the relationships in the data with a few plots. The first plot indicates that there is a big difference between automatic and manual transmissions when it comes to miles per gallon. The mean mpg for manual cars is almost 25, while the mean for automatic cars is 15. There is a large difference between the mpg means for manual and automatic cars. If we consider other variables and compare the mpg values, we can see that this pattern persists across the different levels of those variables, as shown in the other 3 plots.

# Mean mpg for each transmission type (am); drawn as red points on the first
# boxplot below.
mean_am <- mtcars %>%
  group_by(am) %>%
  summarize(mpg = mean(mpg))

# x-axis labels shared by all four plots, keyed by the am code so the mapping
# does not depend on level order.
am_labels <- c("0" = "Automatic", "1" = "Manual")

# mpg by transmission type, with the group means overlaid.
am_boxplot <- ggplot(mtcars, aes(x = factor(am), y = mpg)) +
  geom_boxplot(fill = "blue") +
  # This layer inherits the plot's aes(), so factor(am) and mpg are evaluated
  # on mean_am.
  geom_point(data = mean_am, color = "red", size = 3) +
  labs(x = "") +
  scale_x_discrete(labels = am_labels)

# mpg by transmission type, split by number of cylinders.
cyl_boxplot <- ggplot(
  mtcars,
  aes(x = factor(am), y = mpg, fill = factor(cyl))
) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(name = "Cylinders")

# mpg by transmission type, split by number of carburetors.
carb_boxplot <- ggplot(
  mtcars,
  aes(x = factor(am), y = mpg, fill = factor(carb))
) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(name = "Carburetors")

# mpg by transmission type, split by engine shape (vs). The legend labels are
# named by level so they cannot be attached to the wrong level.
vs_boxplot <- ggplot(
  mtcars,
  aes(x = factor(am), y = mpg, fill = factor(vs))
) +
  geom_boxplot() +
  labs(x = "") +
  scale_x_discrete(labels = am_labels) +
  scale_fill_discrete(
    name = "Engine",
    labels = c("0" = "V-shape", "1" = "Straight")
  )

# Arrange the four plots in a 2 x 2 grid.
plot_grid(am_boxplot, cyl_boxplot, carb_boxplot, vs_boxplot, nrow = 2, ncol = 2)

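## Regression models and coefficient interpretation

After comparing the different models below, ANOVA shows that including “cyl” along with “am” is necessary. Therefore, we use the fit2 model for the regression. Note that the further additions of “disp” and “wt” are also significant at the 5% level in the ANOVA table, so fit2 is a deliberately simple choice rather than the only model the data support.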

# Load the built-in mtcars dataset into the workspace and preview its first
# rows.
data(mtcars)
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
library(dplyr)

# Nested linear models for mpg: start from transmission type alone, then add
# one predictor at a time so that each model contains the previous one.
fit1 <- lm(mpg ~ factor(am), data = mtcars)
fit2 <- update(fit1, . ~ . + factor(cyl))
fit3 <- update(fit2, . ~ . + disp)
fit4 <- update(fit3, . ~ . + wt)

# Sequential F-tests: each row compares a model with the one before it.
anova(fit1, fit2, fit3, fit4)
## Analysis of Variance Table
## 
## Model 1: mpg ~ factor(am)
## Model 2: mpg ~ factor(am) + factor(cyl)
## Model 3: mpg ~ factor(am) + factor(cyl) + disp
## Model 4: mpg ~ factor(am) + factor(cyl) + disp + wt
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1     30 720.90                                   
## 2     28 264.50  2    456.40 32.4451 8.589e-08 ***
## 3     27 230.46  1     34.04  4.8391   0.03691 *  
## 4     26 182.87  1     47.59  6.7663   0.01513 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table for fit2 (estimate, standard error, t value, p-value).
# The component is spelled out in full: `$coef` only resolved to
# `coefficients` through partial matching of list names.
summary(fit2)$coefficients
##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)   24.801852   1.322615 18.752135 2.182425e-17
## factor(am)1    2.559954   1.297579  1.972869 5.845717e-02
## factor(cyl)6  -6.156118   1.535723 -4.008612 4.106131e-04
## factor(cyl)8 -10.067560   1.452082 -6.933187 1.546574e-07

Residual plot and diagnostics

The residual plots are shown below. In the first plot the residuals appear to have roughly constant variance, while the Q-Q plot suggests that the normality assumption may be violated. We also do not observe any clear pattern in the residuals, so the model may have captured the systematic variance.

# plot() on an lm fit draws four diagnostic plots by default; a 2 x 2 layout
# shows them together in one figure (the previous 1 x 1 setting was just the
# default). The earlier graphics settings are restored afterwards so later
# plots are unaffected.
old_par <- par(mfrow = c(2, 2))
plot(fit2)
par(old_par)

## Conclusion

Both the data exploration and the linear regression show a difference in miles per gallon between automatic and manual transmission cars. In general, manual cars have better mpg. The am_boxplot shows a difference of almost 10 mpg between the two groups, while the linear regression, which adjusts for the number of cylinders, estimates the difference at 2.56 mpg in favor of manual cars. This adjusted estimate is only marginally significant (p ≈ 0.058).