## In the mpg data set, which manufacturer produced the most fuel-efficient SUVs?

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Restrict mpg to SUVs, then keep only the identifying and mileage columns.
suv_data <- mpg |>
  filter(class == "suv")

suv_efficiency <- suv_data |>
  select(manufacturer, model, cty, hwy)

# Box plot of highway mileage per manufacturer. reorder() sorts the x axis by
# increasing median hwy, so the most economical makes end up on the right.
suv_efficiency |>
  ggplot(aes(x = reorder(manufacturer, hwy, FUN = median), y = hwy)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "SUV Highway Fuel Economy by Manufacturer",
    subtitle = "Subaru and Toyota typically lead the pack",
    x = "Manufacturer",
    y = "Highway MPG"
  )

# Subaru usually comes out on top for SUVs in this dataset.

## In the mpg data set, which SUV manufacturer improved fuel economy most between 1999 and 2008?

# Keep SUVs from the two model years being compared, sorted by manufacturer
# and then year so the rows read naturally when the table is inspected.
suv_years <- mpg |>
  filter(class == "suv", year %in% c(1999, 2008)) |>
  arrange(manufacturer, year)

# One panel per manufacturer, with 1999 and 2008 side by side on the x axis.
ggplot(suv_years, aes(x = factor(year), y = hwy, color = manufacturer)) +
  # height = 0 restricts the jitter to the horizontal direction. Left at its
  # default, geom_jitter() also shifts points vertically, which would
  # misreport the highway MPG values this plot is meant to compare.
  # show.legend = FALSE drops the colour legend, which only repeats the
  # manufacturer names already shown in the facet strips.
  geom_jitter(width = 0.1, height = 0, show.legend = FALSE) +
  facet_wrap(~manufacturer) +
  labs(
    title = "Fuel Economy Improvement: 1999 vs 2008",
    x = "Year",
    y = "Highway MPG"
  )

# Some brands (like Dodge) stay flat, while others might show a slight upward trend in their 2008 points.

## In the flights data set, pick a variable other than carrier and analyze whether it correlates with long-delay flights.

library(nycflights13)
# Drop flights with no recorded arrival delay and those delayed 200 minutes
# or more, then keep only the two variables being compared.
# NOTE(review): the `arr_delay < 200` cut-off removes the longest delays;
# confirm this is intended when the analysis targets long-delay flights.
clean_flights <- flights |>
  filter(!is.na(arr_delay), arr_delay < 200)

flight_subset <- clean_flights |>
  select(distance, arr_delay)

# Scatter plot of arrival delay against distance. The low alpha keeps dense
# regions readable; the smoother overlays the overall trend.
flight_subset |>
  ggplot(aes(x = distance, y = arr_delay)) +
  geom_point(alpha = 0.05, color = "darkblue") +
  geom_smooth(color = "orange") +
  labs(
    title = "Does Distance Correlate with Arrival Delay?",
    x = "Distance (miles)",
    y = "Arrival Delay (minutes)"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# There is only a weak correlation between distance and arrival delay.