library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

1. Data Transformation and Visualization with the flights data set.

# Histogram (30 bins) of arrival delays for flights in June and July;
# flights with no recorded arrival delay are dropped before plotting.
flights %>%
  filter(month %in% 6:7, !is.na(arr_delay)) %>%
  ggplot(mapping = aes(arr_delay)) +
  geom_histogram(bins = 30)

=> Most flights are right on time, short delays happen, but long ones are rare.

# Smoothed trend of arrival delay against departure delay for flights whose
# origin is EWR and whose day value is 1; rows missing either delay are
# excluded so the smoother only sees complete pairs.
flights %>%
  filter(
    origin == "EWR" & day == 1 &
      !is.na(dep_delay) & !is.na(arr_delay)
  ) %>%
  ggplot(aes(dep_delay, arr_delay)) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

=> Departure and arrival delays follow a predictable pattern where they move together, though it’s clear that early starts help flights land ahead of schedule.

# Origin, destination and distance of every flight that has a recorded
# departure time, ordered from the shortest distance to the longest.
flights %>%
  drop_na(dep_time) %>%
  select(origin, dest, distance) %>%
  arrange(distance)
## # A tibble: 328,521 × 3
##    origin dest  distance
##    <chr>  <chr>    <dbl>
##  1 EWR    PHL         80
##  2 EWR    PHL         80
##  3 EWR    PHL         80
##  4 EWR    PHL         80
##  5 EWR    PHL         80
##  6 EWR    PHL         80
##  7 EWR    PHL         80
##  8 EWR    PHL         80
##  9 EWR    PHL         80
## 10 EWR    PHL         80
## # ℹ 328,511 more rows

=> The shortest flight that actually departed is from EWR (Newark) to PHL (Philadelphia), covering a distance of 80 miles.

# Bar chart of flight counts per distance category: a distance below 500 is
# labelled "short-distance", anything else "long-distance".
flights %>%
  mutate(
    dist_cat = if_else(distance < 500, "short-distance", "long-distance")
  ) %>%
  ggplot(aes(dist_cat)) +
  geom_bar()

=> Long-distance flights (500 miles or more) are far more common than short-distance flights; the gap between the two bars is very large. Note that this bar chart only shows flight counts per distance category, so it does not tell us anything about departure delays.

# One bar per destination whose height is the mean departure delay;
# na.rm = TRUE drops missing delays silently before the mean is computed.
flights %>%
  ggplot(aes(dest, dep_delay)) +
  stat_summary(geom = "bar", fun = "mean", na.rm = TRUE)

=> With this many destinations crowded along the x-axis, I can’t tell from the graph which one has the longest average departure delay, so a summary table is used instead.

# Average departure delay (ignoring missing delays) for each destination,
# listed from the longest average delay to the shortest.
flights %>%
  group_by(dest) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(-avg_dep_delay)
## # A tibble: 105 × 2
##    dest  avg_dep_delay
##    <chr>         <dbl>
##  1 CAE            35.6
##  2 TUL            34.9
##  3 OKC            30.6
##  4 BHM            29.7
##  5 TYS            28.5
##  6 JAC            26.5
##  7 DSM            26.2
##  8 RIC            23.6
##  9 ALB            23.6
## 10 MSN            23.6
## # ℹ 95 more rows

=> CAE has the longest average departure delay (about 35.6 minutes).

# Average speed per carrier, where speed is distance * 60 / air_time for
# each flight with a recorded air time. Only the carriers with the lowest
# and highest average are kept, and the faster one is listed first.
flights %>%
  drop_na(air_time) %>%
  group_by(carrier) %>%
  summarise(avg_speed = mean(distance * 60 / air_time, na.rm = TRUE)) %>%
  filter(avg_speed == min(avg_speed) | avg_speed == max(avg_speed)) %>%
  arrange(desc(avg_speed))
## # A tibble: 2 × 2
##   carrier avg_speed
##   <chr>       <dbl>
## 1 HA           480.
## 2 YV           332.

=> HA has the highest average speed (about 480 mph) and YV has the lowest (about 332 mph).

# Average departure delay for each day of the week (the weekday name is
# derived from time_hour), sorted from the longest average to the shortest.
flights %>%
  group_by(weekday = weekdays(time_hour)) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 7 × 2
##   weekday   avg_dep_delay
##   <chr>             <dbl>
## 1 Thursday          16.1 
## 2 Monday            14.8 
## 3 Friday            14.7 
## 4 Wednesday         11.8 
## 5 Sunday            11.6 
## 6 Tuesday           10.6 
## 7 Saturday           7.65

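=> Thursdays had the longest average departure delay (about 16.1 minutes), followed by Mondays (14.8) and Fridays (14.7); Saturdays had the shortest (about 7.65 minutes).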

2. Analyzing the seattlepets data set.

# Tabulate how many registered pets there are of each species.
table(seattlepets[["species"]])
## 
##   Cat   Dog  Goat   Pig 
## 17294 35181    38     6

=> There are 4 species in this data set: Cat, Dog, Goat, and Pig.

# Rank cat breeds by number of registrations (cats with no primary breed on
# record are ignored), then print the most common breed.
cat_breeds <- seattlepets %>%
  filter(species == "Cat" & !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE)
cat_breeds[["primary_breed"]][1]
## [1] "Domestic Shorthair"
# Rank dog breeds by number of registrations (dogs with no primary breed on
# record are ignored), then print the most common breed.
dog_breeds <- seattlepets %>%
  filter(species == "Dog" & !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE)
dog_breeds[["primary_breed"]][1]
## [1] "Retriever, Labrador"
# The three most frequently registered pet names across all species;
# pets with no name on record are left out.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  head(n = 3)
## # A tibble: 3 × 2
##   animal_name     n
##   <chr>       <int>
## 1 Lucy          439
## 2 Charlie       387
## 3 Luna          355

=> Lucy, Charlie, and Luna are the 3 most popular pet names in Seattle.

# For Cats: the ten most common names among cats that have a name on record
seattlepets %>%
  filter(species == "Cat" & !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Luna          111
##  2 Lucy          102
##  3 Lily           86
##  4 Max            83
##  5 Bella          82
##  6 Charlie        81
##  7 Oliver         73
##  8 Jack           65
##  9 Sophie         59
## 10 Leo            54
# For Dogs: the ten most common names among dogs that have a name on record
seattlepets %>%
  filter(species == "Dog" & !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          337
##  2 Charlie       306
##  3 Bella         249
##  4 Luna          244
##  5 Daisy         221
##  6 Cooper        189
##  7 Lola          187
##  8 Max           186
##  9 Molly         186
## 10 Stella        185
# How many distinct pet names (missing names excluded) are registered more
# than 100 times. The per-name tally is stored in `times_used`; the final
# count() then reports the number of remaining names in column `n`.
seattlepets %>%
  filter(!is.na(animal_name)) %>%
  count(animal_name, name = "times_used") %>%
  filter(times_used > 100) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    56

=> Excluding missing (NA) names, which the code filters out, there are 56 names that appear more than 100 times.

# For every recorded pet name, count total registrations and how many of
# them are cats and dogs. Keep names used more than 100 times that belong to
# at least one dog (so the ratio is defined), compute cats / dogs, and show
# the names with the highest and the lowest ratio.
seattlepets %>%
  drop_na(animal_name) %>%
  group_by(animal_name) %>%
  summarise(
    total = n(),
    cats = sum(species %in% "Cat"),
    dogs = sum(species %in% "Dog"),
    .groups = "drop"
  ) %>%
  filter(total > 100 & dogs > 0) %>%
  mutate(cat_to_dog = cats / dogs) %>%
  arrange(desc(cat_to_dog)) %>%
  # After sorting, row 1 holds the highest ratio and row n() the lowest.
  slice(c(1, n()))
## # A tibble: 2 × 5
##   animal_name total  cats  dogs cat_to_dog
##   <chr>       <int> <int> <int>      <dbl>
## 1 Shadow        132    53    79     0.671 
## 2 Riley         126     9   117     0.0769

=> Among names that appear more than 100 times, Shadow has the highest cat-to-dog ratio (about 0.67) and Riley has the lowest (about 0.08).

# Bar chart comparing the number of registered cats and dogs; every other
# species is filtered out and the plot title is centred.
seattlepets %>%
  filter(species == "Cat" | species == "Dog") %>%
  ggplot(aes(species, fill = species)) +
  geom_bar() +
  ggtitle("Registered Cats vs. Dogs in Seattle") +
  labs(y = "Number of Registered Pets", x = "Species") +
  theme(plot.title = element_text(hjust = 0.5))

=> The bar for Dogs is more than twice as tall as the bar for Cats, which shows that registered dogs outnumber registered cats in Seattle.