library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Histogram (30 bins) of arrival delays for flights in June and July,
# skipping rows where the arrival delay is missing.
flights %>%
  filter(!is.na(arr_delay), month %in% 6:7) %>%
  ggplot(mapping = aes(arr_delay)) +
  geom_histogram(bins = 30)
=> The distribution of arrival delays is right-skewed: most flights arrive close to on time, short delays happen, but long delays are rare.
# Smoothed relationship between departure delay (x) and arrival delay (y)
# for flights with origin EWR and day == 1; rows missing either delay
# are dropped before fitting the GAM smoother.
flights %>%
  filter(origin == "EWR" & day == 1) %>%
  drop_na(dep_delay, arr_delay) %>%
  ggplot(mapping = aes(dep_delay, arr_delay)) +
  geom_smooth(formula = y ~ s(x, bs = "cs"), method = "gam")
=> Departure and arrival delays are positively related and move together: flights that depart later tend to arrive later, while flights that depart early tend to land ahead of schedule.
# Flights that actually departed (non-missing dep_time), reduced to the
# route columns and listed from the shortest distance to the longest.
flights %>%
  drop_na(dep_time) %>%
  select(origin, dest, distance) %>%
  arrange(distance)
## # A tibble: 328,521 × 3
## origin dest distance
## <chr> <chr> <dbl>
## 1 EWR PHL 80
## 2 EWR PHL 80
## 3 EWR PHL 80
## 4 EWR PHL 80
## 5 EWR PHL 80
## 6 EWR PHL 80
## 7 EWR PHL 80
## 8 EWR PHL 80
## 9 EWR PHL 80
## 10 EWR PHL 80
## # ℹ 328,511 more rows
=> The shortest flight that actually departed is from EWR (Newark) to PHL (Philadelphia), covering a distance of 80 miles.
# Bar chart of flight counts by distance category: a distance below 500
# is labelled "short-distance", everything else "long-distance".
flights %>%
  mutate(
    dist_cat = if_else(distance >= 500, "long-distance", "short-distance")
  ) %>%
  ggplot(mapping = aes(dist_cat)) +
  geom_bar()
=> Long-distance flights (500 miles or more) are much more common than short-distance flights, and the gap between the two bars is large. Note that this bar chart only counts flights per distance category; it contains no information about departure delays, so it cannot by itself show how distance and delays move together.
# Bar chart with one bar per destination whose height is the mean
# departure delay; na.rm = TRUE drops missing delays from the summary.
flights %>%
  ggplot(mapping = aes(dest, dep_delay)) +
  stat_summary(geom = "bar", fun = "mean", na.rm = TRUE)
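=> With more than 100 destinations crowded onto the x-axis, I can’t tell from the graph which destination has the longest average departure delay, so a summary table is needed instead.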
# Mean departure delay per destination (missing delays ignored),
# sorted from the largest mean to the smallest.
flights %>%
  group_by(dest) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 105 × 2
## dest avg_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
## 10 MSN 23.6
## # ℹ 95 more rows
=> CAE has the longest average departure delay (about 35.6 minutes).
# Average speed per carrier, computed as distance * 60 / air_time on rows
# with a known air_time; only the carriers with the highest and the lowest
# average are kept, highest first.
flights %>%
  drop_na(air_time) %>%
  group_by(carrier) %>%
  summarise(
    avg_speed = mean(distance * 60 / air_time, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(avg_speed == min(avg_speed) | avg_speed == max(avg_speed)) %>%
  arrange(desc(avg_speed))
## # A tibble: 2 × 2
## carrier avg_speed
## <chr> <dbl>
## 1 HA 480.
## 2 YV 332.
=> HA has the highest average speed (about 480 mph) and YV has the lowest (about 332 mph).
# Mean departure delay for each day of the week (weekday name derived from
# time_hour), ordered from the most delayed weekday to the least.
flights %>%
  group_by(weekday = weekdays(time_hour)) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 7 × 2
## weekday avg_dep_delay
## <chr> <dbl>
## 1 Thursday 16.1
## 2 Monday 14.8
## 3 Friday 14.7
## 4 Wednesday 11.8
## 5 Sunday 11.6
## 6 Tuesday 10.6
## 7 Saturday 7.65
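=> Thursdays had the longest average departure delay (about 16.1 minutes), followed by Mondays (14.8) and Fridays (14.7); Saturdays had the shortest (about 7.7 minutes).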
# Count how many registered pets belong to each species.
table(seattlepets[["species"]])
##
## Cat Dog Goat Pig
## 17294 35181 38 6
=> There are 4 species in this data: Cat, Dog, Goat, and Pig.
# Cat breeds ranked by number of registrations (rows with a missing
# primary_breed excluded); the first entry is the most common breed.
cat_breeds <- seattlepets %>%
  filter(species == "Cat") %>%
  drop_na(primary_breed) %>%
  count(primary_breed, sort = TRUE)
cat_breeds[["primary_breed"]][1]
## [1] "Domestic Shorthair"
# Dog breeds ranked by number of registrations (rows with a missing
# primary_breed excluded); the first entry is the most common breed.
dog_breeds <- seattlepets %>%
  filter(species == "Dog") %>%
  drop_na(primary_breed) %>%
  count(primary_breed, sort = TRUE)
dog_breeds[["primary_breed"]][1]
## [1] "Retriever, Labrador"
# The three most frequent pet names across all species,
# ignoring pets whose name is missing.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 3)
## # A tibble: 3 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
=> Lucy, Charlie, and Luna are the three most popular pet names in Seattle.
# Cats: the ten most frequent names, ignoring missing names.
seattlepets %>%
  filter(species == "Cat") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
# Dogs: the ten most frequent names, ignoring missing names.
seattlepets %>%
  filter(species == "Dog") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
# Number of distinct, non-missing pet names that are registered
# more than 100 times.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name) %>%
  filter(n > 100) %>%
  summarise(n = n())
## # A tibble: 1 × 1
## n
## <int>
## 1 56
=> Excluding missing (NA) names, there are 56 names that appear more than 100 times.
# For every non-missing name, count total registrations, cats and dogs.
# Among names used more than 100 times by at least one dog (so the ratio
# is defined), compute cats / dogs and keep the rows with the highest and
# the lowest ratio.
seattlepets %>%
  drop_na(animal_name) %>%
  group_by(animal_name) %>%
  summarise(
    total = n(),
    cats = sum(species %in% "Cat"),
    dogs = sum(species %in% "Dog"),
    .groups = "drop"
  ) %>%
  filter(total > 100 & dogs > 0) %>%
  mutate(cat_to_dog = cats / dogs) %>%
  arrange(desc(cat_to_dog)) %>%
  slice(c(1, n()))
## # A tibble: 2 × 5
## animal_name total cats dogs cat_to_dog
## <chr> <int> <int> <int> <dbl>
## 1 Shadow 132 53 79 0.671
## 2 Riley 126 9 117 0.0769
=> Among names that appear more than 100 times, Shadow has the highest cat-to-dog ratio (about 0.67) and Riley has the lowest (about 0.08).
# Bar chart comparing the number of registered cats and dogs (other
# species are filtered out), with a centred title and explicit axis labels.
seattlepets %>%
  filter(species == "Cat" | species == "Dog") %>%
  ggplot(mapping = aes(species, fill = species)) +
  geom_bar() +
  ggtitle("Registered Cats vs. Dogs in Seattle") +
  xlab("Species") +
  ylab("Number of Registered Pets") +
  theme(plot.title = element_text(hjust = 0.5))
=> The bar for Dogs is more than twice as tall as the bar for Cats, which shows that registered dogs outnumber registered cats in Seattle.