Homework 2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)
library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

1. Data Transformation and Visualization with the flights data set

a. Create a histogram of arrival delays (excluding NAs) for all flights in June and July. Summarize your findings.

Summary of findings: The distribution of arrival delays is right-skewed. The majority of flights arrive close to their scheduled time or slightly early, represented by negative numbers. There is a long tail stretching to the right, showing that a small number of flights experience extreme delays of several hours.

# Part 1(a): keep June and July flights that have a recorded arrival delay.
flights_jun_jul <- filter(
  flights,
  month == 6 | month == 7,
  !is.na(arr_delay)
)

# Histogram of arrival delay using 15-unit-wide bins.
ggplot(data = flights_jun_jul, mapping = aes(x = arr_delay)) +
  geom_histogram(binwidth = 15, color = "white", fill = "steelblue") +
  labs(
    title = "Histogram of Arrival Delays in June and July",
    x = "Arrival Delay (minutes)",
    y = "Frequency"
  ) +
  theme_minimal()

b. Create a smooth line graph of arrival delays vs departure delays for all flights departing from EWR on the first day of each month. Summarize your findings.

Summary of findings: There is a strong, positive linear relationship between departure delays and arrival delays. If a flight leaves late, it almost always arrives late by roughly the same amount of time.

# Part 1(b): flights with origin EWR on day 1 of any month, restricted to rows
# where both delay columns are present.
ewr_first_day <- flights %>%
  filter(origin == "EWR" & day == 1) %>%
  filter(!(is.na(arr_delay) | is.na(dep_delay)))

# Loess-smoothed curve of arrival delay against departure delay.
ggplot(data = ewr_first_day, mapping = aes(x = dep_delay, y = arr_delay)) +
  geom_smooth(color = "darkred", method = "loess") +
  labs(
    title = "Arrival vs. Departure Delays (EWR, 1st of each month)",
    x = "Departure Delay (minutes)",
    y = "Arrival Delay (minutes)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

c. Find the flights that actually departed with the shortest travel distance. What are their origin and destination airports?

Answer: The shortest flights that actually departed cover 80 miles, flying from EWR to PHL.

# Part 1(c): rows with a recorded dep_time, reduced to the route columns and
# sorted by ascending distance, so the shortest trips come first.
shortest_flight <- flights %>%
  drop_na(dep_time) %>%
  select(origin, dest, distance) %>%
  arrange(distance)

print(shortest_flight)

## # A tibble: 328,521 × 3
##    origin dest  distance
##    <chr>  <chr>    <dbl>
##  1 EWR    PHL         80
##  2 EWR    PHL         80
##  3 EWR    PHL         80
##  4 EWR    PHL         80
##  5 EWR    PHL         80
##  6 EWR    PHL         80
##  7 EWR    PHL         80
##  8 EWR    PHL         80
##  9 EWR    PHL         80
## 10 EWR    PHL         80
## # ℹ 328,511 more rows

d. Create a new categorical variable with two labels. Flights with a travel distance shorter than 500 miles are marked as “short-distance”, and otherwise “long-distance”. Create a bar plot to compare the number of flights in each category. Summarize your findings.

Summary of findings: There are more “long-distance” flights (500 miles or more) than “short-distance” flights (under 500 miles) departing from NYC airports.

# Part 1(d): label each flight by distance band (< 500 vs. everything else).
# This reassigns `flights`, so the new column stays attached for later code.
flights <- mutate(
  flights,
  dist_category = if_else(distance < 500, "short-distance", "long-distance")
)

# Bar chart of the number of flights per distance band; the fill legend is
# hidden because it repeats the x-axis labels.
ggplot(
  data = flights,
  mapping = aes(x = dist_category, fill = dist_category)
) +
  geom_bar() +
  labs(
    title = "Number of Short vs. Long Distance Flights",
    x = "Distance Category",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

e. Find the destination airport that has the longest average departure delay by creating a graph.

# Part 1(e): mean departure delay per destination (missing delays ignored),
# sorted from largest to smallest, keeping the first ten rows.
top_delays <- flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 10)

# Horizontal column chart with destinations ordered by their average delay.
ggplot(
  data = top_delays,
  mapping = aes(x = reorder(dest, avg_dep_delay), y = avg_dep_delay)
) +
  geom_col(fill = "coral") +
  coord_flip() +
  labs(
    title = "Top 10 Destinations by Highest Avg Departure Delay",
    x = "Destination Airport",
    y = "Average Departure Delay (minutes)"
  ) +
  theme_minimal()

f. Answer the question in (e) without creating a graph.

Answer: CAE

# Part 1(f): table of mean departure delay per destination, largest first;
# the top row answers the question without a plot.
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  print()

## # A tibble: 105 × 2
##    dest  avg_dep_delay
##    <chr>         <dbl>
##  1 CAE            35.6
##  2 TUL            34.9
##  3 OKC            30.6
##  4 BHM            29.7
##  5 TYS            28.5
##  6 JAC            26.5
##  7 DSM            26.2
##  8 RIC            23.6
##  9 ALB            23.6
## 10 MSN            23.6
## # ℹ 95 more rows

g. Find the carriers with the highest and the lowest average flight speed for all their flights in the data set.

# Part 1(g): per-flight speed is distance / (air_time / 60); flights where that
# value is missing are ignored when averaging within each carrier. The table
# is sorted from the highest to the lowest average.
carrier_speeds <- flights %>%
  group_by(carrier) %>%
  summarise(
    avg_speed = mean(distance / (air_time / 60), na.rm = TRUE)
  ) %>%
  arrange(desc(avg_speed))

# First row of the sorted table.
highest <- slice_head(carrier_speeds, n = 1)
print(highest)

## # A tibble: 1 × 2
##   carrier avg_speed
##   <chr>       <dbl>
## 1 HA           480.

# Last row of carrier_speeds.
lowest <- slice_tail(carrier_speeds, n = 1)
print(lowest)

## # A tibble: 1 × 2
##   carrier avg_speed
##   <chr>       <dbl>
## 1 YV           332.

2. Analyzing the seattlepets data set

a. How many species are there in the data set? What are they?

Answer: There are 4 species in the dataset. They are Cat, Dog, Goat, and Pig.

# Part 2(a): number of distinct species values, plus the values themselves in
# order of first appearance.
num_species <- summarise(seattlepets, n = n_distinct(species))
species_list <- seattlepets %>%
  distinct(species) %>%
  pull(species)

print(num_species)

## # A tibble: 1 × 1
##       n
##   <int>
## 1     4

# Show the distinct species values.
print(species_list)

## [1] "Dog"  "Cat"  "Goat" "Pig"

b. What are the most popular primary breeds for cats and dogs, respectively?

Answer: For cats, the most popular primary breed is the Domestic Shorthair. For dogs, it is the Labrador Retriever.

# Part 2(b): tally primary breeds within cats and dogs, then keep the
# highest-count breed for each species (result stays grouped by species).
seattlepets %>%
  filter(species == "Cat" | species == "Dog") %>%
  count(species, primary_breed) %>%
  group_by(species) %>%
  slice_max(order_by = n, n = 1)

## # A tibble: 2 × 3
## # Groups:   species [2]
##   species primary_breed           n
##   <chr>   <chr>               <int>
## 1 Cat     Domestic Shorthair  10086
## 2 Dog     Retriever, Labrador  4867

c. What are the three most common pet names in Seattle?

Answer: The three most common pet names are Lucy (439), Charlie (387), and Luna (355).

# Part 2(c): drop missing and empty names, count each name, and show the three
# most frequent.
seattlepets %>%
  filter(!(is.na(animal_name) | animal_name == "")) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 3)

## # A tibble: 3 × 2
##   animal_name     n
##   <chr>       <int>
## 1 Lucy          439
## 2 Charlie       387
## 3 Luna          355

d. What are the ten most common pet names for cats? What are the ten most common pet names for dogs? Write code to print the results and their frequencies.

# Part 2(d), cats: ten most frequent names among cats, skipping missing and
# empty names.
cat_names <- seattlepets %>%
  filter(species == "Cat") %>%
  filter(!(is.na(animal_name) | animal_name == "")) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

print("Top 10 Cat Names:")

## [1] "Top 10 Cat Names:"

# Name/frequency table for cats.
print(x = cat_names)

## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Luna          111
##  2 Lucy          102
##  3 Lily           86
##  4 Max            83
##  5 Bella          82
##  6 Charlie        81
##  7 Oliver         73
##  8 Jack           65
##  9 Sophie         59
## 10 Leo            54

# Part 2(d), dogs: ten most frequent names among dogs, skipping missing and
# empty names.
dog_names <- seattlepets %>%
  filter(species == "Dog") %>%
  filter(!(is.na(animal_name) | animal_name == "")) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)

print("Top 10 Dog Names:")

## [1] "Top 10 Dog Names:"

# Name/frequency table for dogs.
print(x = dog_names)

## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          337
##  2 Charlie       306
##  3 Bella         249
##  4 Luna          244
##  5 Daisy         221
##  6 Cooper        189
##  7 Lola          187
##  8 Max           186
##  9 Molly         186
## 10 Stella        185

e. How many names appear more than 100 times in the data set excluding “NA”?

# Part 2(e): names used more than 100 times, after removing missing values,
# the literal string "NA", and empty strings.
popular_names <- seattlepets %>%
  drop_na(animal_name) %>%
  filter(!animal_name %in% c("NA", "")) %>%
  count(animal_name) %>%
  filter(n > 100)

# Number of such names.
print(nrow(popular_names))

## [1] 56

f. For all names that appear more than 100 times in the data set, which has the highest “cat_to_dog” ratio? Which has the lowest? The “cat_to_dog” ratio can be computed this way - if a name appears 200 times, in which 150 are for cats and 50 are for dogs, the ratio is 150/50 = 3.

Answer: Shadow has the highest cat_to_dog ratio (53/79 ≈ 0.671), and Riley has the lowest (9/117 ≈ 0.0769).

# Part 2(f): names taken from popular_names.
names_over_100 <- pull(popular_names, animal_name)

# Count cats and dogs per popular name, spread the two species into columns
# (a missing combination becomes 0), and compute Cat / Dog. Non-finite ratios
# (e.g. division by zero dogs) are removed before sorting from high to low.
# The table is grouped by animal_name, so head()/tail() are used on it rather
# than the per-group slice_*() helpers.
ratio_data <- seattlepets %>%
  filter(
    animal_name %in% names_over_100,
    species == "Cat" | species == "Dog"
  ) %>%
  count(animal_name, species) %>%
  pivot_wider(names_from = species, values_from = n, values_fill = 0) %>%
  group_by(animal_name) %>%
  mutate(cat_to_dog = Cat / Dog) %>%
  filter(is.finite(cat_to_dog)) %>%
  arrange(desc(cat_to_dog))

# Highest ratio: first row of the sorted table.
head(ratio_data, n = 1)

## # A tibble: 1 × 4
## # Groups:   animal_name [1]
##   animal_name   Cat   Dog cat_to_dog
##   <chr>       <int> <int>      <dbl>
## 1 Shadow         53    79      0.671

# Lowest ratio: last row of ratio_data.
print(tail(ratio_data, n = 1))

## # A tibble: 1 × 4
## # Groups:   animal_name [1]
##   animal_name   Cat   Dog cat_to_dog
##   <chr>       <int> <int>      <dbl>
## 1 Riley           9   117     0.0769