Fuel Economy Dataset

Install packages

# Run this once, before loading the packages, if tidyverse is not yet installed.
# install.packages("tidyverse")

Load packages

library(tidyverse)
library(knitr)
library(DT)

Load data

# Store the `mpg` data set under the working name `dat_mpg`. `mpg` is not
# defined in this file; presumably it is provided by one of the packages
# attached above.
dat_mpg <- mpg

To read the data from an external CSV file instead, use:

# Read the data from `mpg.csv` (path relative to the working directory).
# This reassigns `dat_mpg`, replacing the value taken from `mpg` earlier.
dat_mpg <- read_csv("mpg.csv")
## Rows: 234 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): manufacturer, model, trans, drv, fl, class
## dbl (5): displ, year, cyl, cty, hwy
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

View data

# knitr was already attached in the "Load packages" step, so this call is a
# harmless repeat; the table below comes from the DT package, not from knitr.
library(knitr)
# Display the data with DT's `datatable()` widget.
DT::datatable(dat_mpg)

Data description

# Type '?mpg' in RStudio console - what do you see?

Data summary

# Column-by-column summary of the whole data set.
summary(dat_mpg)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

Data quality assessment

Count of missing values in ‘hwy’

# Number of NA entries in the `hwy` column.
sum(is.na(dat_mpg[["hwy"]]))
## [1] 0

Count of missing values in all variables

# One-row table holding the number of NA values in every column.
# `across(everything(), ...)` replaces the superseded `summarise_all()`;
# the column names are unchanged because a single function is applied.
dat_mpg %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

Visualization

Manufacturer

# Bar chart of the number of rows per manufacturer.
# The legend is hidden because the fill colour only repeats the x-axis
# categories; the x-axis labels are rotated by 90 degrees.
ggplot() +
  geom_bar(
    data = dat_mpg,
    mapping = aes(x = manufacturer, fill = manufacturer)
  ) +
  theme(
    legend.position = "none",
    text = element_text(size = 12),
    # No trailing comma after the last argument: an empty argument in
    # `theme()` is only tolerated by some ggplot2 versions.
    axis.text.x = element_text(angle = 90)
  )

Highway miles per gallon

# Histogram of `hwy` using 10 bins, with black bar outlines and a larger
# base text size.
ggplot(data = dat_mpg, mapping = aes(x = hwy)) +
  geom_histogram(bins = 10, color = "black") +
  theme(text = element_text(size = 20))

# Summary statistics for the `hwy` column only.
summary(select(dat_mpg, hwy))
##       hwy       
##  Min.   :12.00  
##  1st Qu.:18.00  
##  Median :24.00  
##  Mean   :23.44  
##  3rd Qu.:27.00  
##  Max.   :44.00

Highway mpg vs. Transmission type

# Number of rows for each value of `trans`, rendered as a table with kable().
dat_mpg %>%
  dplyr::count(trans) %>%
  kable()
trans n
auto(av) 5
auto(l3) 2
auto(l4) 83
auto(l5) 39
auto(l6) 6
auto(s4) 3
auto(s5) 3
auto(s6) 16
manual(m5) 58
manual(m6) 19
# Bar chart of the number of rows per `trans` value; legend hidden and
# x-axis labels rotated by 90 degrees.
ggplot(data = dat_mpg, mapping = aes(x = trans, fill = trans)) +
  geom_bar() +
  theme(
    legend.position = "none",
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90)
  )

Data transformation

# Add `trans_binary`: "auto" when `trans` starts with "auto", otherwise
# "manual".
dat_mpg <- mutate(
  dat_mpg,
  trans_binary = ifelse(startsWith(trans, "auto"), "auto", "manual")
)

# Number of rows for each value of `trans_binary`.
dat_mpg %>%
  dplyr::count(trans_binary)
# Bar chart of the number of rows per `trans_binary` value; legend hidden
# and x-axis labels rotated by 90 degrees.
ggplot(data = dat_mpg, mapping = aes(x = trans_binary, fill = trans_binary)) +
  geom_bar() +
  theme(
    legend.position = "none",
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90)
  )

# Overlaid histograms of `hwy`, one per `trans_binary` group.
# - `position = "identity"` draws the groups on top of each other; the
#   default stacks them, which leaves `alpha` with nothing to blend and makes
#   the two distributions hard to compare.
# - `bins = 10` matches the earlier `hwy` histogram and avoids the
#   "`stat_bin()` using `bins = 30`" message that the default produces.
ggplot() +
  geom_histogram(
    data = dat_mpg,
    mapping = aes(x = hwy, fill = trans_binary),
    position = "identity",
    bins = 10,
    color = "black",
    alpha = 0.5
  ) +
  theme(
    text = element_text(size = 12),
    legend.position = "top"
  )

#### Boxplot

# Boxplots of `hwy` for each `trans_binary` group, with the legend on top.
ggplot(
  data = dat_mpg,
  mapping = aes(x = trans_binary, y = hwy, fill = trans_binary)
) +
  geom_boxplot() +
  theme(
    legend.position = "top",
    text = element_text(size = 12)
  )

# Mean, standard deviation and row count of `hwy` per `trans_binary` group.
dat_mpg %>%
  group_by(trans_binary) %>%
  summarise(
    mean = mean(hwy),
    sd = sd(hwy),
    n = n()
  )

Exercise

Submit by Sep 05, 2023