library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
Summary of findings: The distribution of arrival delays is right-skewed. The majority of flights arrive close to their scheduled time or slightly early, represented by negative numbers. There is a long tail stretching to the right, showing that a small number of flights experience extreme delays of several hours.
# Flights from June or July that have a recorded arrival delay.
flights_jun_jul <- flights %>%
  filter(!is.na(arr_delay), month %in% 6:7)

# Distribution of arrival delays, grouped into 15-minute bins.
flights_jun_jul %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(binwidth = 15, fill = "steelblue", color = "white") +
  labs(
    title = "Histogram of Arrival Delays in June and July",
    x = "Arrival Delay (minutes)",
    y = "Frequency"
  ) +
  theme_minimal()
Summary of findings: There is a strong, positive linear relationship between departure delays and arrival delays. If a flight leaves late, it almost always arrives late by roughly the same amount of time.
# Flights out of EWR on day 1 of any month, restricted to rows where both the
# departure and the arrival delay were recorded so every flight can be plotted.
ewr_first_day <- flights %>%
  filter(origin == "EWR", day == 1, !is.na(arr_delay), !is.na(dep_delay))

# Plot the individual flights first: the strength and shape of the relationship
# can only be judged from the raw points. The LOESS curve is layered on top as
# a trend summary; drawn alone it hides how closely flights follow the trend.
ggplot(ewr_first_day, aes(x = dep_delay, y = arr_delay)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", color = "darkred") +
  theme_minimal() +
  labs(
    title = "Arrival vs. Departure Delays (EWR, 1st of each month)",
    x = "Departure Delay (minutes)",
    y = "Arrival Delay (minutes)"
  )
## `geom_smooth()` using formula = 'y ~ x'
Answer: Among flights that actually departed (those with a recorded departure time), the shortest is 80 miles, flying from EWR to PHL.
# Every flight with a recorded departure time, reduced to its route and
# distance and ordered from shortest to longest. Despite the name, this holds
# all such flights; the shortest one is the first row of the printed table.
shortest_flight <- flights %>%
  filter(!is.na(dep_time)) %>%
  select(origin, dest, distance) %>%
  arrange(distance)

shortest_flight
## # A tibble: 328,521 × 3
## origin dest distance
## <chr> <chr> <dbl>
## 1 EWR PHL 80
## 2 EWR PHL 80
## 3 EWR PHL 80
## 4 EWR PHL 80
## 5 EWR PHL 80
## 6 EWR PHL 80
## 7 EWR PHL 80
## 8 EWR PHL 80
## 9 EWR PHL 80
## 10 EWR PHL 80
## # ℹ 328,511 more rows
Summary of findings: There are more “long-distance” flights (500 miles or more) than “short-distance” flights (under 500 miles) departing from NYC airports.
# Label each flight by distance: under 500 is "short-distance", anything else
# is "long-distance". This rebinds `flights` in the workspace, so code that
# runs afterwards sees the extra `dist_category` column.
flights <- flights %>%
  mutate(
    dist_category = if_else(distance >= 500, "long-distance", "short-distance")
  )

# One bar per category. The fill repeats the x axis, so the legend is hidden.
flights %>%
  ggplot(aes(x = dist_category, fill = dist_category)) +
  geom_bar() +
  labs(
    title = "Number of Short vs. Long Distance Flights",
    x = "Distance Category",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Mean departure delay per destination (missing delays ignored), keeping the
# ten destinations with the largest means.
top_delays <- flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 10)

# Horizontal bars, ordered by mean delay so the worst destination is on top.
top_delays %>%
  ggplot(aes(x = reorder(dest, avg_dep_delay), y = avg_dep_delay)) +
  geom_col(fill = "coral") +
  coord_flip() +
  labs(
    title = "Top 10 Destinations by Highest Avg Departure Delay",
    x = "Destination Airport",
    y = "Average Departure Delay (minutes)"
  ) +
  theme_minimal()
Answer: CAE has the highest average departure delay, at about 35.6 minutes.
# Full ranking of destinations by mean departure delay, largest first.
# Missing delays are dropped before averaging.
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(-avg_dep_delay)
## # A tibble: 105 × 2
## dest avg_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
## 10 MSN 23.6
## # ℹ 95 more rows
# Speed of each flight as distance / (air_time / 60), then the mean of those
# per-flight speeds for every carrier, ranked from fastest to slowest.
# Flights without a usable speed are ignored by the mean.
carrier_speeds <- flights %>%
  mutate(speed_mph = distance / (air_time / 60)) %>%
  group_by(carrier) %>%
  summarise(avg_speed = mean(speed_mph, na.rm = TRUE)) %>%
  arrange(desc(avg_speed))

# First row of the descending ranking: the fastest carrier on average.
highest <- slice_head(carrier_speeds, n = 1)
highest
## # A tibble: 1 × 2
## carrier avg_speed
## <chr> <dbl>
## 1 HA 480.
# Last row of the descending ranking: the slowest carrier on average.
lowest <- slice_tail(carrier_speeds, n = 1)
lowest
## # A tibble: 1 × 2
## carrier avg_speed
## <chr> <dbl>
## 1 YV 332.
Answer: There are 4 species in the dataset. They are Cat, Dog, Goat, and Pig.
# Number of distinct species, returned as a one-row tibble with column `n`.
num_species <- summarise(seattlepets, n = n_distinct(species))

# The distinct species values themselves, in order of first appearance.
species_list <- unique(seattlepets[["species"]])

num_species
## # A tibble: 1 × 1
## n
## <int>
## 1 4
# Show the distinct species values.
print(species_list)
## [1] "Dog" "Cat" "Goat" "Pig"
Answer: For cats, the most popular breed is Domestic Shorthair. For dogs, it is the Labrador Retriever.
# Most common primary breed among cats and among dogs: count each
# species/breed pair, then keep the largest count within each species.
# The result stays grouped by species.
seattlepets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  count(species, primary_breed) %>%
  group_by(species) %>%
  slice_max(n, n = 1)
## # A tibble: 2 × 3
## # Groups: species [2]
## species primary_breed n
## <chr> <chr> <int>
## 1 Cat Domestic Shorthair 10086
## 2 Dog Retriever, Labrador 4867
Answer: The top three overall names are Lucy, Charlie, and Luna.
# Three most frequent pet names across all species, skipping pets whose name
# is missing or empty.
seattlepets %>%
  filter(!is.na(animal_name) & animal_name != "") %>%
  count(animal_name) %>%
  arrange(desc(n)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
# Ten most frequent names among cats, skipping missing or empty names.
cat_names <- seattlepets %>%
  filter(species == "Cat") %>%
  filter(!is.na(animal_name), animal_name != "") %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

# Heading line followed by the table itself.
print("Top 10 Cat Names:")
## [1] "Top 10 Cat Names:"
print(cat_names)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
# Ten most frequent names among dogs, skipping missing or empty names.
dog_names <- seattlepets %>%
  filter(species == "Dog") %>%
  filter(!is.na(animal_name), animal_name != "") %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

# Heading line followed by the table itself.
print("Top 10 Dog Names:")
## [1] "Top 10 Dog Names:"
print(dog_names)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
# Names carried by more than 100 pets (all species together). Missing names,
# the literal string "NA", and empty names are excluded before counting.
popular_names <- seattlepets %>%
  filter(!is.na(animal_name), !animal_name %in% c("NA", "")) %>%
  count(animal_name) %>%
  filter(n > 100)

# How many names clear the threshold.
nrow(popular_names)
## [1] 56
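Answer: Among names given to more than 100 pets, Shadow has the highest cat-to-dog ratio (about 0.67) and Riley has the lowest (about 0.08).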
# The names that passed the "more than 100 pets" filter.
names_over_100 <- popular_names[["animal_name"]]

# For each of those names, count cats and dogs separately, spread the counts
# into `Cat` and `Dog` columns (0 where a name has none of that species), and
# compute cats per dog. Rows where the ratio is not finite (no dogs) are
# dropped, and the rest are ranked from highest ratio to lowest.
ratio_data <- seattlepets %>%
  filter(species %in% c("Cat", "Dog"), animal_name %in% names_over_100) %>%
  group_by(animal_name, species) %>%
  count() %>%
  pivot_wider(
    names_from = species,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  mutate(cat_to_dog = Cat / Dog) %>%
  filter(is.finite(cat_to_dog)) %>%
  arrange(desc(cat_to_dog))

# Top of the ranking: the name with the highest cat-to-dog ratio.
head(ratio_data, n = 1)
## # A tibble: 1 × 4
## # Groups: animal_name [1]
## animal_name Cat Dog cat_to_dog
## <chr> <int> <int> <dbl>
## 1 Shadow 53 79 0.671
# Bottom of the ranking: the name with the lowest cat-to-dog ratio.
tail(ratio_data, n = 1)
## # A tibble: 1 × 4
## # Groups: animal_name [1]
## animal_name Cat Dog cat_to_dog
## <chr> <int> <int> <dbl>
## 1 Riley 9 117 0.0769