DV Homework 2

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)
library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

1. Data Transformation and Visualization with the flights data set
1a: Create a histogram of arrival delays (excluding NAs) for all flights in June and July. Summarize your findings.

# Histogram of arrival delays for June and July flights; rows with a
# missing arrival delay are dropped before plotting.
flights %>%
  drop_na(arr_delay) %>%
  filter(month == 6 | month == 7) %>%
  ggplot(mapping = aes(x = arr_delay)) +
  geom_histogram(bins = 50, colour = "black", fill = "steelblue") +
  labs(
    x = "Arrival Delay (min)",
    title = "Arrival Delays in June and July"
  )

Summary: The histogram is heavily right-skewed. Most flights arrived on time or even earlier than scheduled, while a small number arrived very late and form the long right tail.

1b: Create a smooth line graph of arrival delays vs departure delays for all flights departing from EWR on the first day of each month. Summarize your findings.

# Linear trend of arrival delay against departure delay for flights that
# left EWR on the 1st of any month; rows missing either delay are excluded.
flights %>%
  filter(origin == "EWR", day == 1) %>%
  drop_na(arr_delay, dep_delay) %>%
  ggplot(aes(dep_delay, arr_delay)) +
  geom_smooth(colour = "red", method = "lm") +
  labs(
    title = "Arrival vs Departure Delays from EWR",
    x = "Departure Delay (min)",
    y = "Arrival Delay (min)"
  )

## `geom_smooth()` using formula = 'y ~ x'

Summary: There is a positive linear relationship between departure delay and arrival delay: flights that depart late tend to arrive late by a similar amount.

1c: Find the flights that actually departed with the shortest travel distance. What is its origin and destination airport?

# Shortest route among flights that actually departed. A flight that never
# left still carries a scheduled distance but has no recorded departure
# time, so rows with a missing dep_time are removed before ranking by
# distance. Without that filter the never-flown EWR -> LGA row wins.
flights %>%
  filter(!is.na(dep_time), !is.na(distance)) %>%
  arrange(distance) %>%
  select(origin, dest, distance) %>%
  head(1)

## # A tibble: 1 × 3
##   origin dest  distance
##   <chr>  <chr>    <dbl>
## 1 EWR    PHL         80

1d: Create a bar plot to compare the number of flights in each category. Summarize your findings.

# Label every flight as short- or long-distance using a cutoff of 500 on
# `distance`, then draw one bar per label showing how many flights it holds.
flights %>%
  drop_na(distance) %>%
  mutate(
    distance_type = ifelse(distance >= 500, "long-distance", "short-distance")
  ) %>%
  ggplot(aes(distance_type, fill = distance_type)) +
  geom_bar() +
  labs(
    title = "Short vs Long Distance Flights",
    x = "Distance Category",
    y = "Number of Flights"
  )

Summary: There are fewer than 100,000 short-distance flights, while there are more than 250,000 long-distance flights.

1e: Find the destination airport that has the longest average departure delay by creating a graph.

# Rank destinations by mean departure delay and plot the ten highest, so
# the chart itself shows which airport is worst and by how much. Keeping
# only the top row before plotting would leave a single bar with nothing
# to compare it against.
flights %>%
  # Drop missing delays up front so a destination with no recorded
  # departure delay cannot produce a NaN mean.
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarize(avg_dep_delay = mean(dep_delay), .groups = "drop") %>%
  arrange(desc(avg_dep_delay)) %>%
  head(10) %>%
  # reorder() sorts the bars by delay; after coord_flip() the longest
  # bar is drawn at the top.
  ggplot(aes(x = reorder(dest, avg_dep_delay), y = avg_dep_delay)) +
  geom_col(fill = "coral") +
  coord_flip() +
  labs(title = "Top 10 destinations by average departure delay",
       x = "Destination Airport",
       y = "Average Departure Delay (min)")

1f: Answer the question in (e) without creating a graph.

# Mean departure delay per destination (missing delays ignored), keeping
# only the destination with the largest mean.
flights %>%
  group_by(dest) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(-avg_dep_delay) %>%
  slice_head(n = 1)

## # A tibble: 1 × 2
##   dest  avg_dep_delay
##   <chr>         <dbl>
## 1 CAE            35.6

1g: Find the carriers with the highest and the lowest average flight speed for all their flights in the data set.

# Per-carrier mean of each flight's speed, ordered from the fastest carrier
# to the slowest. Speed is distance over air time; air_time / 60 converts
# to hours, assuming air_time is recorded in minutes.
speed_data <- flights %>%
  drop_na(distance, air_time) %>%
  mutate(
    hours_in_air = air_time / 60,
    mph = distance / hours_in_air
  ) %>%
  group_by(carrier) %>%
  summarize(avg_speed = mean(mph)) %>%
  arrange(desc(avg_speed))

# Fastest carrier (first row of the descending ranking)
speed_data %>%
  slice_head(n = 1)

## # A tibble: 1 × 2
##   carrier avg_speed
##   <chr>       <dbl>
## 1 HA           480.

# Slowest carrier (last row of the descending ranking)
speed_data %>%
  slice_tail(n = 1)

## # A tibble: 1 × 2
##   carrier avg_speed
##   <chr>       <dbl>
## 1 YV           332.

1h: Bonus

# Mean departure delay by day of the week, worst day first. The weekday
# label is derived from a date assembled out of year, month and day.
flights %>%
  mutate(
    weekday = wday(make_date(year, month, day), label = TRUE)
  ) %>%
  group_by(weekday) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))

## # A tibble: 7 × 2
##   weekday avg_dep_delay
##   <ord>           <dbl>
## 1 Thu             16.1 
## 2 Mon             14.8 
## 3 Fri             14.7 
## 4 Wed             11.8 
## 5 Sun             11.6 
## 6 Tue             10.6 
## 7 Sat              7.65

2. Analyzing the seattlepets data set
2a: How many species are there in the data set? What are they?

# One row per distinct value of `species`.
seattlepets %>%
  select(species) %>%
  distinct()

## # A tibble: 4 × 1
##   species
##   <chr>  
## 1 Dog    
## 2 Cat    
## 3 Goat   
## 4 Pig

2b: What are the most popular primary breeds for cats and dogs, respectively?

# For Cats
# Sorting the counts and keeping the first row yields the single most
# common breed; an unsorted count() only lists breeds alphabetically.
# Pets with no recorded primary breed are left out of the tally.
# NOTE(review): the printed output below predates this change; re-knit
# to refresh it.
seattlepets %>%
  filter(species == "Cat", !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE) %>%
  head(1)

## # A tibble: 58 × 2
##    primary_breed          n
##    <chr>              <int>
##  1 Abyssinian            48
##  2 American Bobtail       5
##  3 American Curl          7
##  4 American Shorthair   860
##  5 American Wirehair      5
##  6 Angora                 6
##  7 Asian Shorthair        2
##  8 Balinese              27
##  9 Bengal                66
## 10 Birman                26
## # ℹ 48 more rows

# For Dogs
# Sorting the counts and keeping the first row yields the single most
# common breed; an unsorted count() only lists breeds alphabetically.
# Pets with no recorded primary breed are left out of the tally.
# NOTE(review): the printed output below predates this change; re-knit
# to refresh it.
seattlepets %>%
  filter(species == "Dog", !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE) %>%
  head(1)

## # A tibble: 275 × 2
##    primary_breed                  n
##    <chr>                      <int>
##  1 Abruzzese Mastiff              1
##  2 Affenpinscher                  5
##  3 Afghan Hound                   6
##  4 Akbash                         4
##  5 Akita                         66
##  6 Alapaha Blue Blood Bulldog     2
##  7 Alaskan Husky                 80
##  8 Alaskan Klee Kai              24
##  9 Alaskan Malamute              95
## 10 American Bandogge Mastiff      1
## # ℹ 265 more rows

2c: What are the three most common pet names in Seattle?

# Three most frequent names, ignoring pets with no recorded name.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 3)

## # A tibble: 3 × 2
##   animal_name     n
##   <chr>       <int>
## 1 Lucy          439
## 2 Charlie       387
## 3 Luna          355

2d: What are the ten most common pet names for cats? What are the ten most common pet names for dogs?

# Top 10 Cat Names
# Pets without a recorded name are excluded before counting.
seattlepets %>%
  filter(species == "Cat") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Luna          111
##  2 Lucy          102
##  3 Lily           86
##  4 Max            83
##  5 Bella          82
##  6 Charlie        81
##  7 Oliver         73
##  8 Jack           65
##  9 Sophie         59
## 10 Leo            54

# Top 10 Dog Names
# Pets without a recorded name are excluded before counting.
seattlepets %>%
  filter(species == "Dog") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          337
##  2 Charlie       306
##  3 Bella         249
##  4 Luna          244
##  5 Daisy         221
##  6 Cooper        189
##  7 Lola          187
##  8 Max           186
##  9 Molly         186
## 10 Stella        185

2e: How many names appear more than 100 times in the data set excluding “NA”?

# Number of distinct names given to more than 100 pets (missing names
# are left out of the tally).
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name, name = "times_used") %>%
  filter(times_used > 100) %>%
  nrow()

## [1] 56

DV Homework 2

Duc Vinh Hoang

2026-05-05