Install Fuel Economy Package

# install.packages("fueleconomy")
library(fueleconomy) 
?vehicles
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Take an initial look at the data set

# Show the structure of vehicles: its dimensions plus each column's type and first few values
str(vehicles)
## tibble [33,442 × 12] (S3: tbl_df/tbl/data.frame)
##  $ id   : num [1:33442] 13309 13310 13311 14038 14039 ...
##  $ make : chr [1:33442] "Acura" "Acura" "Acura" "Acura" ...
##  $ model: chr [1:33442] "2.2CL/3.0CL" "2.2CL/3.0CL" "2.2CL/3.0CL" "2.3CL/3.0CL" ...
##  $ year : num [1:33442] 1997 1997 1997 1998 1998 ...
##  $ class: chr [1:33442] "Subcompact Cars" "Subcompact Cars" "Subcompact Cars" "Subcompact Cars" ...
##  $ trans: chr [1:33442] "Automatic 4-spd" "Manual 5-spd" "Automatic 4-spd" "Automatic 4-spd" ...
##  $ drive: chr [1:33442] "Front-Wheel Drive" "Front-Wheel Drive" "Front-Wheel Drive" "Front-Wheel Drive" ...
##  $ cyl  : num [1:33442] 4 4 6 4 4 6 4 4 6 5 ...
##  $ displ: num [1:33442] 2.2 2.2 3 2.3 2.3 3 2.3 2.3 3 2.5 ...
##  $ fuel : chr [1:33442] "Regular" "Regular" "Regular" "Regular" ...
##  $ hwy  : num [1:33442] 26 28 26 27 29 26 27 29 26 23 ...
##  $ cty  : num [1:33442] 20 22 18 19 21 17 20 21 17 18 ...
# Preview the first six rows of vehicles
head(vehicles)
## # A tibble: 6 × 12
##      id make  model        year class  trans drive   cyl displ fuel    hwy   cty
##   <dbl> <chr> <chr>       <dbl> <chr>  <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 13309 Acura 2.2CL/3.0CL  1997 Subco… Auto… Fron…     4   2.2 Regu…    26    20
## 2 13310 Acura 2.2CL/3.0CL  1997 Subco… Manu… Fron…     4   2.2 Regu…    28    22
## 3 13311 Acura 2.2CL/3.0CL  1997 Subco… Auto… Fron…     6   3   Regu…    26    18
## 4 14038 Acura 2.3CL/3.0CL  1998 Subco… Auto… Fron…     4   2.3 Regu…    27    19
## 5 14039 Acura 2.3CL/3.0CL  1998 Subco… Manu… Fron…     4   2.3 Regu…    29    21
## 6 14040 Acura 2.3CL/3.0CL  1998 Subco… Auto… Fron…     6   3   Regu…    26    17

The relationships of interest are how categorical variables affect fuel economy, both city and highway. The categorical variables of interest are class, trans, drive, and fuel. I am also curious to see whether there is a trend in fuel economy over the years.

Generating first few plots looking at hwy mpg vs. vehicle class

# Attempt 1: bar chart of class, with hwy mapped to the bar fill
ggplot(data = vehicles, mapping = aes(x = factor(class), fill = hwy)) +
  geom_bar()

# Attempt 2: too busy, so try the same bars with one free-y panel per class
ggplot(data = vehicles, mapping = aes(x = factor(class), fill = hwy)) +
  geom_bar() +
  facet_wrap(class ~ ., scales = "free_y")

# Attempt 3: not what I was hoping for, so put hwy on the y axis as columns
ggplot(data = vehicles, mapping = aes(x = class, y = hwy)) +
  geom_col()

# The remaining attempts all need the crowded x-axis labels rotated 90 degrees,
# so keep that theme tweak in one place
rotate_x_labels <- theme(
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
)

# Attempt 4: this is more like it; rotate the labels so the x axis is readable
ggplot(data = vehicles, mapping = aes(x = class, y = hwy)) +
  geom_col() +
  rotate_x_labels

# Attempt 5: add some color by filling each column by its class
ggplot(data = vehicles, mapping = aes(x = class, y = hwy)) +
  geom_col(mapping = aes(fill = class)) +
  rotate_x_labels

# Attempt 6: the legend ruined it, so hide it
ggplot(data = vehicles, mapping = aes(x = class, y = hwy)) +
  geom_col(mapping = aes(fill = class), show.legend = FALSE) +
  rotate_x_labels

# The columns above sum every hwy value within a class, so compute the mean
# highway MPG of each class instead
vehicles_by_class <- group_by(vehicles, class)
class_hwy_mpg <- summarize(vehicles_by_class, avgHwy = mean(hwy))
head(class_hwy_mpg)
## # A tibble: 6 × 2
##   class                        avgHwy
##   <chr>                         <dbl>
## 1 Compact Cars                   27.8
## 2 Large Cars                     23.8
## 3 Midsize Cars                   25.9
## 4 Midsize Station Wagons         25.2
## 5 Midsize-Large Station Wagons   24.1
## 6 Minicompact Cars               25.5
# Polished version: one column per class at its mean highway MPG, with a
# minimal theme, x-axis labels rotated 90 degrees, and descriptive labels
ggplot(data = class_hwy_mpg, mapping = aes(x = class, y = avgHwy)) +
  geom_col(mapping = aes(fill = class), show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Average Highway MPG for each Class of Vehicle",
    x = "Vehicle Class",
    y = "Average Highway MPG"
  )

Another column chart looking at hwy mpg vs vehicle transmission

# Mean highway MPG for each transmission type
vehicles_by_trans <- group_by(vehicles, trans)
trans_hwy_mpg <- summarize(vehicles_by_trans, avgHwy = mean(hwy))

# Column chart of mean highway MPG by transmission, styled like the class chart
ggplot(data = trans_hwy_mpg, mapping = aes(x = trans, y = avgHwy)) +
  geom_col(mapping = aes(fill = trans), show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Average Highway MPG for Different Transmissions",
    x = "Vehicle Transmission",
    y = "Average Highway MPG"
  )

Exploring the same type of chart with different variables

# Mean highway MPG for each fuel type
vehicles_by_fuel <- group_by(vehicles, fuel)
fuel_hwy_mpg <- summarize(vehicles_by_fuel, avgHwy = mean(hwy))

# A column chart again, since it is the most logical fit for comparing one
# average per category
ggplot(data = fuel_hwy_mpg, mapping = aes(x = fuel, y = avgHwy)) +
  geom_col(mapping = aes(fill = fuel), show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Average Highway MPG for Different Vehicle Fuel Types",
    x = "Fuel Type",
    y = "Average Highway MPG"
  )

# To me, electricity "MPG" makes no sense, but hybrid MPG does make sense,
# so the pure-electric fuel type is removed below.
# Print the summary first to see every fuel type and its average highway MPG
fuel_hwy_mpg
## # A tibble: 13 × 2
##    fuel                        avgHwy
##    <chr>                        <dbl>
##  1 CNG                           21.8
##  2 Diesel                        26.7
##  3 Electricity                   78.7
##  4 Gasoline or E85               21.1
##  5 Gasoline or natural gas       18  
##  6 Gasoline or propane           16.5
##  7 Midgrade                      22.2
##  8 Premium                       23.4
##  9 Premium and Electricity       29  
## 10 Premium Gas or Electricity    34.3
## 11 Premium or E85                24.1
## 12 Regular                       23.5
## 13 Regular Gas and Electricity   44.6
# Drop the pure-electric row by matching on the fuel name rather than on its
# row position, so the right row is removed even if the fuel categories or
# their sort order ever change
fuel_hwy_mpg1 <- fuel_hwy_mpg %>%
  filter(fuel != "Electricity")
fuel_hwy_mpg1
## # A tibble: 12 × 2
##    fuel                        avgHwy
##    <chr>                        <dbl>
##  1 CNG                           21.8
##  2 Diesel                        26.7
##  3 Gasoline or E85               21.1
##  4 Gasoline or natural gas       18  
##  5 Gasoline or propane           16.5
##  6 Midgrade                      22.2
##  7 Premium                       23.4
##  8 Premium and Electricity       29  
##  9 Premium Gas or Electricity    34.3
## 10 Premium or E85                24.1
## 11 Regular                       23.5
## 12 Regular Gas and Electricity   44.6
# Same fuel-type chart, built from the summary that excludes electricity
mpg_fuel_plot <- ggplot(
  data = fuel_hwy_mpg1,
  mapping = aes(x = fuel, y = avgHwy)
) +
  geom_col(mapping = aes(fill = fuel), show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Average Highway MPG for Different Vehicle Fuel Types",
    x = "Fuel Type",
    y = "Average Highway MPG"
  )
mpg_fuel_plot

# Write the chart to PDF, naming the plot explicitly
# NOTE(review): the path is an absolute directory on one machine - confirm it
# exists before running elsewhere
ggsave(
  "mpg_fuel.pdf",
  plot = mpg_fuel_plot,
  path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502"
)
## Saving 7 x 5 in image
# This plot is cool to compare with the one above because the electricity column was squishing the data at the bottom in the previous plot

Trying out different charts

None of these were actually what I wanted but I wanted to show my thought process and experimentation

# Shared base for both attempts: hwy against year, points colored by fuel type
year_hwy_points <- ggplot(data = vehicles, mapping = aes(x = year, y = hwy)) +
  geom_point(mapping = aes(color = fuel))

# Attempt 1: the plain scatter plot
year_hwy_points +
  theme_minimal()

# Attempt 2: add a smoother and one free-y panel per fuel type; it looked
# promising but turned out very messy
year_hwy_points +
  geom_smooth() +
  facet_wrap(fuel ~ ., scales = "free_y") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

# Years are essentially bins, so switch to columns; both attempts share this
# base of hwy against year with the columns filled by fuel type
year_hwy_cols <- ggplot(data = vehicles, mapping = aes(x = year, y = hwy)) +
  geom_col(mapping = aes(fill = fuel))

# Attempt 1: all years on one panel
year_hwy_cols +
  theme_minimal()

# Attempt 2: too busy, so try one free-y panel per year to see if it is cleaner
year_hwy_cols +
  facet_wrap(year ~ ., scales = "free_y") +
  theme_minimal()

# To be honest, I didn't really like any of the plots in this chunk, and I realized that I need to find a way to wrangle the data so that I get averages rather than the stacked sums that geom_col() draws. I do not know of a way to get the average mpg per year and also be able to color by fuel. I am going to ask you about this in class. Maybe I am thinking about this incorrectly.

Trying out scatter plots with two numeric variables displacement and hwy mpg

# Highway MPG plotted against engine displacement, one point per vehicle
ggplot(data = vehicles, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 57 rows containing missing values (geom_point).

# Color the points by fuel type and add a trendline; the color mapping lives
# only on the point layer so the smoother stays a single global trendline
ggplot(data = vehicles, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = fuel)) +
  geom_smooth() +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).

# Remove the standard error ribbon and polish the labels.
# The legend title is set through the `color` label because labs() names its
# labels after aesthetics; the earlier `legend = ` entry matched no aesthetic
# and had no effect, so it is dropped.
ggplot(vehicles, aes(x = displ, y = hwy)) +
  geom_point(aes(color = fuel)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  labs(
    x = "Engine Displacement (Liters)",
    y = "Average Highway MPG",
    title = "Average Highway MPG Decreases as Engine Displacement Increases",
    color = "Fuel Type"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).

# Save the plot to PDF; no plot is passed, so ggsave() falls back to the last plot displayed
# NOTE(review): the path is an absolute directory on one machine - confirm it exists before running elsewhere
ggsave("mpg_displ_fuel.pdf", path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502")
## Saving 7 x 5 in image
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).

Trying out a heatmap of hwy mpg over the years vs. vehicle make, a categorical variable with many categories

# First heatmap attempt: year across, make down, tiles shaded by hwy on a
# pink-to-dark-blue gradient
ggplot(data = vehicles, mapping = aes(x = year, y = make)) +
  geom_tile(mapping = aes(fill = hwy)) +
  scale_fill_gradient(low = "pink", high = "darkblue") +
  theme_minimal()

# Everything is on top of each other, so the y-axis labels need to change; from
# some snooping, the easiest way to make room is to enlarge the graphic in
# ggsave().

# Average hwy within each make and year first: the raw data has many vehicles
# per make/year, and geom_tile() would otherwise draw their tiles on top of one
# another so that only the last one drawn is visible.
make_year_hwy_mpg <- vehicles %>%
  group_by(year, make) %>%
  summarize(avgHwy = mean(hwy), .groups = "drop")

# theme_minimal() is a complete theme and replaces any theme() settings added
# before it, so it has to come first for the text-size tweaks to take effect.
mpg_time_heatmap <- ggplot(make_year_hwy_mpg, aes(x = year, y = make)) +
  geom_tile(aes(fill = avgHwy)) +
  scale_fill_gradient(low = "pink", high = "darkblue") +
  theme_minimal() +
  theme(
    axis.text.y.left = element_text(size = 4),
    title = element_text(size = 16, face = "bold")
  ) +
  labs(
    x = "Year of Manufacture",
    y = "Car Manufacturer",
    title = "Average Highway MPG Increases over Time",
    fill = "Highway MPG"
  )
mpg_time_heatmap

# Save at a large size so the many manufacturer labels have room; the plot is
# passed explicitly so the file does not depend on which plot was shown last.
# NOTE(review): the path is an absolute directory on one machine - confirm it
# exists before running elsewhere
ggsave(
  "mpg_time_heatmap.pdf",
  plot = mpg_time_heatmap,
  path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502",
  width = 20,
  height = 16,
  units = "in"
)