library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Highway fuel economy of SUVs, compared across manufacturers.
# Keep only the SUV rows of `mpg`, then narrow to the columns of interest.
suv_data <- mpg |>
  filter(class == "suv")
suv_efficiency <- suv_data |>
  select(manufacturer, model, cty, hwy)

# Box plot of highway MPG per manufacturer; manufacturers are positioned
# along the x axis according to their median highway MPG.
suv_efficiency |>
  ggplot(aes(x = reorder(manufacturer, hwy, FUN = median), y = hwy)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "SUV Highway Fuel Economy by Manufacturer",
    subtitle = "Subaru and Toyota typically lead the pack",
    x = "Manufacturer",
    y = "Highway MPG"
  )
# Subaru usually comes out on top for SUVs in this dataset.

# In the mpg data set, which SUV manufacturer improved fuel economy most
# between 1999 and 2008?
# Highway MPG of SUVs restricted to the years 1999 and 2008, sorted by
# manufacturer and then year.
suv_years <- mpg |>
  filter(class == "suv", year %in% c(1999, 2008)) |>
  arrange(manufacturer, year)

# Jittered points of highway MPG per year, one facet per manufacturer, so the
# 1999 and 2008 values of each brand can be compared side by side.
suv_years |>
  ggplot(aes(x = factor(year), y = hwy, color = manufacturer)) +
  geom_jitter(width = 0.1) +
  facet_wrap(~manufacturer) +
  labs(
    title = "Fuel Economy Improvement: 1999 vs 2008",
    x = "Year",
    y = "Highway MPG"
  )
# Some brands (like Dodge) stay flat, while others might show a slight
# upward trend in their 2008 points.

# In the flights data set, pick a variable other than carrier and analyze
# whether that variable correlates with long-delay flights or not.
library(nycflights13)
# Relationship between flight distance and arrival delay.
# Drop flights with a missing arrival delay and those whose arrival delay is
# 200 or more, then keep only the two plotted columns.
clean_flights <- flights |>
  filter(!is.na(arr_delay), arr_delay < 200)
flight_subset <- clean_flights |>
  select(distance, arr_delay)

# Scatter plot with highly transparent points and a smoothed trend line
# drawn on top.
flight_subset |>
  ggplot(aes(x = distance, y = arr_delay)) +
  geom_point(alpha = 0.05, color = "darkblue") +
  geom_smooth(color = "orange") +
  labs(
    title = "Does Distance Correlate with Arrival Delay?",
    x = "Distance (miles)",
    y = "Arrival Delay (minutes)"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# There is a slight correlation between distance and arrival delay.