# install.packages("fueleconomy")
library(fueleconomy)
?vehicles
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Inspect the column names and types of the vehicles data
str(vehicles)
## tibble [33,442 × 12] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:33442] 13309 13310 13311 14038 14039 ...
## $ make : chr [1:33442] "Acura" "Acura" "Acura" "Acura" ...
## $ model: chr [1:33442] "2.2CL/3.0CL" "2.2CL/3.0CL" "2.2CL/3.0CL" "2.3CL/3.0CL" ...
## $ year : num [1:33442] 1997 1997 1997 1998 1998 ...
## $ class: chr [1:33442] "Subcompact Cars" "Subcompact Cars" "Subcompact Cars" "Subcompact Cars" ...
## $ trans: chr [1:33442] "Automatic 4-spd" "Manual 5-spd" "Automatic 4-spd" "Automatic 4-spd" ...
## $ drive: chr [1:33442] "Front-Wheel Drive" "Front-Wheel Drive" "Front-Wheel Drive" "Front-Wheel Drive" ...
## $ cyl : num [1:33442] 4 4 6 4 4 6 4 4 6 5 ...
## $ displ: num [1:33442] 2.2 2.2 3 2.3 2.3 3 2.3 2.3 3 2.5 ...
## $ fuel : chr [1:33442] "Regular" "Regular" "Regular" "Regular" ...
## $ hwy : num [1:33442] 26 28 26 27 29 26 27 29 26 23 ...
## $ cty : num [1:33442] 20 22 18 19 21 17 20 21 17 18 ...
# Preview the first few rows of the vehicles data
head(vehicles)
## # A tibble: 6 × 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 13309 Acura 2.2CL/3.0CL 1997 Subco… Auto… Fron… 4 2.2 Regu… 26 20
## 2 13310 Acura 2.2CL/3.0CL 1997 Subco… Manu… Fron… 4 2.2 Regu… 28 22
## 3 13311 Acura 2.2CL/3.0CL 1997 Subco… Auto… Fron… 6 3 Regu… 26 18
## 4 14038 Acura 2.3CL/3.0CL 1998 Subco… Auto… Fron… 4 2.3 Regu… 27 19
## 5 14039 Acura 2.3CL/3.0CL 1998 Subco… Manu… Fron… 4 2.3 Regu… 29 21
## 6 14040 Acura 2.3CL/3.0CL 1998 Subco… Auto… Fron… 6 3 Regu… 26 17
The relationships of interest are how categorical variables affect fuel economy, both city and highway. The categorical variables of interest are class, trans, drive, and fuel. I am also curious to see whether there is a trend in fuel economy over the years.
# First attempt at class vs. hwy: one bar per class, with hwy mapped to fill
ggplot(data = vehicles, mapping = aes(x = factor(class), fill = hwy)) +
  geom_bar()
# Too cluttered to be a good plot; try one panel per class, each with its own
# y scale
ggplot(data = vehicles, mapping = aes(x = factor(class), fill = hwy)) +
  geom_bar() +
  facet_wrap(class ~ ., scales = "free_y")
# The bar charts were not what I was after; switch to columns of hwy by class.
# The base plot and the rotated x-axis text are shared by every variant below.
class_vs_hwy <- ggplot(data = vehicles, mapping = aes(x = class, y = hwy))
vertical_x_text <- theme(
  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
)
class_vs_hwy +
  geom_col()
# Closer, but the class names collide on the x axis, so turn them sideways
class_vs_hwy +
  geom_col() +
  vertical_x_text
# Add some color: fill each column by its class
class_vs_hwy +
  geom_col(mapping = aes(fill = class)) +
  vertical_x_text
# The legend crowds the plot and only repeats the x axis, so hide it
class_vs_hwy +
  geom_col(mapping = aes(fill = class), show.legend = FALSE) +
  vertical_x_text
# The columns above add up every vehicle's highway MPG within a class, so
# compute the mean highway MPG of each class instead
class_hwy_mpg <- summarize(group_by(vehicles, class), avgHwy = mean(hwy))
head(class_hwy_mpg)
## # A tibble: 6 × 2
## class avgHwy
## <chr> <dbl>
## 1 Compact Cars 27.8
## 2 Large Cars 23.8
## 3 Midsize Cars 25.9
## 4 Midsize Station Wagons 25.2
## 5 Midsize-Large Station Wagons 24.1
## 6 Minicompact Cars 25.5
# Plot the per-class averages with a cleaner theme and descriptive labels
ggplot(data = class_hwy_mpg, mapping = aes(x = class, y = avgHwy)) +
  geom_col(mapping = aes(fill = class), show.legend = FALSE) +
  labs(
    title = "Average Highway MPG for each Class of Vehicle",
    x = "Vehicle Class",
    y = "Average Highway MPG"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Mean highway MPG for each transmission type
trans_hwy_mpg <- summarize(group_by(vehicles, trans), avgHwy = mean(hwy))
# Column chart of average highway MPG against transmission
ggplot(data = trans_hwy_mpg, mapping = aes(x = trans, y = avgHwy)) +
  geom_col(mapping = aes(fill = trans), show.legend = FALSE) +
  labs(
    title = "Average Highway MPG for Different Transmissions",
    x = "Vehicle Transmission",
    y = "Average Highway MPG"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Mean highway MPG for each fuel type
fuel_hwy_mpg <- summarize(group_by(vehicles, fuel), avgHwy = mean(hwy))
# A column chart again, since it suits a comparison of averages across groups
ggplot(data = fuel_hwy_mpg, mapping = aes(x = fuel, y = avgHwy)) +
  geom_col(mapping = aes(fill = fuel), show.legend = FALSE) +
  labs(
    title = "Average Highway MPG for Different Vehicle Fuel Types",
    x = "Fuel Type",
    y = "Average Highway MPG"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# To me, an "MPG" figure for pure electricity makes no sense, whereas hybrid
# MPG does, so the Electricity row is removed further down.
# Print the full table first to see every fuel type and its average.
fuel_hwy_mpg
## # A tibble: 13 × 2
## fuel avgHwy
## <chr> <dbl>
## 1 CNG 21.8
## 2 Diesel 26.7
## 3 Electricity 78.7
## 4 Gasoline or E85 21.1
## 5 Gasoline or natural gas 18
## 6 Gasoline or propane 16.5
## 7 Midgrade 22.2
## 8 Premium 23.4
## 9 Premium and Electricity 29
## 10 Premium Gas or Electricity 34.3
## 11 Premium or E85 24.1
## 12 Regular 23.5
## 13 Regular Gas and Electricity 44.6
# Drop the pure-electric row by name rather than by position, so the result
# stays correct if the set or ordering of fuel types ever changes
fuel_hwy_mpg1 <- fuel_hwy_mpg %>%
  filter(fuel != "Electricity")
fuel_hwy_mpg1
## # A tibble: 12 × 2
## fuel avgHwy
## <chr> <dbl>
## 1 CNG 21.8
## 2 Diesel 26.7
## 3 Gasoline or E85 21.1
## 4 Gasoline or natural gas 18
## 5 Gasoline or propane 16.5
## 6 Midgrade 22.2
## 7 Premium 23.4
## 8 Premium and Electricity 29
## 9 Premium Gas or Electricity 34.3
## 10 Premium or E85 24.1
## 11 Regular 23.5
## 12 Regular Gas and Electricity 44.6
# New plot without electricity as a fuel type
fuel_hwy_plot <- ggplot(fuel_hwy_mpg1, aes(x = fuel, y = avgHwy)) +
  geom_col(aes(fill = fuel), show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    x = "Fuel Type",
    y = "Average Highway MPG",
    title = "Average Highway MPG for Different Vehicle Fuel Types"
  )
fuel_hwy_plot
# Pass the plot explicitly so the saved file does not depend on whichever plot
# happened to be built last.
# NOTE(review): the output directory is an absolute, machine-specific path;
# it must exist on the machine running this script.
ggsave(
  "mpg_fuel.pdf",
  plot = fuel_hwy_plot,
  path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502"
)
## Saving 7 x 5 in image
# The chart without electricity is worth comparing with the earlier fuel
# chart: there, the electricity column squashed the other columns toward the
# bottom
# Next: hwy against year, with the points colored by fuel type
ggplot(data = vehicles, mapping = aes(x = year, y = hwy)) +
  geom_point(mapping = aes(color = fuel)) +
  theme_minimal()
# This looked promising but is very messy; try one panel per fuel type, each
# with its own y scale, plus a smoothed trend
ggplot(data = vehicles, mapping = aes(x = year, y = hwy)) +
  geom_point(mapping = aes(color = fuel)) +
  geom_smooth() +
  facet_wrap(fuel ~ ., scales = "free_y") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
# Each year is essentially a bin, so columns look like a better fit than points
ggplot(data = vehicles, mapping = aes(x = year, y = hwy)) +
  geom_col(mapping = aes(fill = fuel)) +
  theme_minimal()
# Too busy, experimenting with faceting to see if it looks cleaner
ggplot(vehicles, aes(x = year, y = hwy)) +
  geom_col(aes(fill = fuel)) +
  facet_wrap(year ~ ., scales = "free_y") +
  theme_minimal()
# None of the plots in this chunk work well: geom_col() stacks every vehicle's
# hwy value, so the column heights are totals rather than averages. Averaging
# hwy within each year/fuel combination first gives one value per year and
# fuel type, which can then be plotted over time and colored by fuel.
year_fuel_hwy_mpg <- vehicles %>%
  group_by(year, fuel) %>%
  summarize(avgHwy = mean(hwy), .groups = "drop")
ggplot(year_fuel_hwy_mpg, aes(x = year, y = avgHwy, color = fuel)) +
  geom_line() +
  # points keep fuel types that appear in only a single year visible
  geom_point(size = 0.8) +
  theme_minimal() +
  labs(
    x = "Year",
    y = "Average Highway MPG",
    title = "Average Highway MPG per Year by Fuel Type",
    color = "Fuel Type"
  )
# Scatter plot: highway MPG against engine displacement
ggplot(data = vehicles, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 57 rows containing missing values (geom_point).
# Color the points by fuel type and add a trendline. The color mapping lives
# on the point layer only, so the smoother is fitted once across all fuels.
ggplot(data = vehicles, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = fuel)) +
  geom_smooth() +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).
# Remove the standard error band and polish the labels.
# `legend` is not a label that labs() uses, so only `color` is set to title
# the legend. Each point is a single vehicle, so the y axis is labelled as
# plain highway MPG; the smoothed line carries the average trend in the title.
displ_hwy_plot <- ggplot(vehicles, aes(x = displ, y = hwy)) +
  geom_point(aes(color = fuel)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  labs(
    x = "Engine Displacement (Liters)",
    y = "Highway MPG",
    title = "Average Highway MPG Decreases as Engine Displacement Increases",
    color = "Fuel Type"
  )
displ_hwy_plot
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).
# Pass the plot explicitly so the saved file does not depend on whichever plot
# happened to be built last.
# NOTE(review): the output directory is an absolute, machine-specific path;
# it must exist on the machine running this script.
ggsave(
  "mpg_displ_fuel.pdf",
  plot = displ_hwy_plot,
  path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502"
)
## Saving 7 x 5 in image
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 57 rows containing non-finite values (stat_smooth).
## Removed 57 rows containing missing values (geom_point).
# Heatmap: make on the y axis, year on the x axis, tiles filled by highway MPG
ggplot(data = vehicles, mapping = aes(x = year, y = make)) +
  geom_tile(mapping = aes(fill = hwy)) +
  scale_fill_gradient(high = "darkblue", low = "pink") +
  theme_minimal()
# The make labels sit on top of each other on the y axis. Shrinking the label
# text helps, and the easiest remedy is to save the graphic at a larger size
# in ggsave().
# Several vehicles share the same make and year, so average hwy per make/year
# first; otherwise the tiles are drawn over one another and the fill does not
# show the average that the title describes.
make_year_hwy_mpg <- vehicles %>%
  group_by(make, year) %>%
  summarize(avgHwy = mean(hwy), .groups = "drop")
mpg_heatmap <- ggplot(make_year_hwy_mpg, aes(x = year, y = make)) +
  geom_tile(aes(fill = avgHwy)) +
  scale_fill_gradient(low = "pink", high = "darkblue") +
  # theme_minimal() is a complete theme and replaces any theme() settings
  # added before it, so it must come first for the tweaks below to take effect
  theme_minimal() +
  theme(
    axis.text.y.left = element_text(size = 4),
    title = element_text(size = 16, face = "bold")
  ) +
  labs(
    x = "Year of Manufacture",
    y = "Car Manufacturer",
    title = "Average Highway MPG Increases over Time",
    fill = "Average Highway MPG"
  )
mpg_heatmap
# Pass the plot explicitly so the saved file does not depend on whichever plot
# happened to be built last.
# NOTE(review): the output directory is an absolute, machine-specific path;
# it must exist on the machine running this script.
ggsave(
  "mpg_time_heatmap.pdf",
  plot = mpg_heatmap,
  path = "/Users/slimeollie/Desktop/WU/MSDS/DATA502",
  width = 20,
  height = 16,
  units = "in"
)