library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
1. Data Transformation and Visualization with the flights data
set
1a: Create a histogram of arrival delays (excluding NAs) for all
flights in June and July. Summarize your findings.
# 1a: histogram of arrival delays for flights in June and July.
# Rows without a recorded arrival delay are dropped before plotting.
flights %>%
  filter(month == 6 | month == 7) %>%
  drop_na(arr_delay) %>%
  ggplot(mapping = aes(arr_delay)) +
  geom_histogram(bins = 50, colour = "black", fill = "steelblue") +
  ggtitle("Arrival Delays in June and July") +
  xlab("Arrival Delay (min)")
Summary: The histogram is heavily right-skewed. Most of the flights
arrived on time or even earlier than expected.
1b: Create a smooth line graph of arrival delays vs departure
delays for all flights departing from EWR on the first day of each
month. Summarize your findings.
# 1b: fitted line of arrival delay against departure delay for flights
# leaving EWR on the first day of any month. Rows missing either delay are
# removed; method = "lm" draws a straight-line fit.
flights %>%
  filter(origin == "EWR", day == 1) %>%
  drop_na(arr_delay, dep_delay) %>%
  ggplot(mapping = aes(dep_delay, arr_delay)) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Arrival vs Departure Delays from EWR") +
  xlab("Departure Delay (min)") +
  ylab("Arrival Delay (min)")
## `geom_smooth()` using formula = 'y ~ x'
Summary: There is a positive linear relationship between departure
delay and arrival delay: flights that depart later tend to arrive later
by a similar amount.
1c: Find the flights that actually departed with the shortest
travel distance. What is its origin and destination airport?
# 1c: shortest-distance flight among the flights that actually departed.
# The question is restricted to departed flights, so rows with no recorded
# departure time (dep_time is NA) are excluded; filtering on distance alone
# lets a flight that never departed be returned as the answer.
# NOTE: the printed result shown below predates this filter -- re-knit to
# refresh it.
flights %>%
  filter(!is.na(dep_time), !is.na(distance)) %>%
  arrange(distance) %>%
  select(origin, dest, distance) %>%
  head(1)
## # A tibble: 1 × 3
## origin dest distance
## <chr> <chr> <dbl>
## 1 EWR LGA 17
1d: Create a bar plot to compare the number of flights in each
category. Summarize your findings.
# 1d: count of flights per distance category.
# A flight is labelled "short-distance" when distance < 500 and
# "long-distance" otherwise; rows with a missing distance are removed first.
flights %>%
  drop_na(distance) %>%
  mutate(
    distance_type = ifelse(distance >= 500, "long-distance", "short-distance")
  ) %>%
  ggplot(mapping = aes(distance_type, fill = distance_type)) +
  geom_bar() +
  ggtitle("Short vs Long Distance Flights") +
  xlab("Distance Category") +
  ylab("Number of Flights")
Summary: There are fewer than 100,000 short-distance flights, whereas
there are more than 250,000 long-distance flights.
1e: Find the destination airport that has the longest average
departure delay by creating a graph.
# 1e: find the destination with the longest average departure delay from a
# graph. The ten destinations with the highest averages are plotted so the
# longest one can be read off by comparison; keeping only the top row before
# plotting would leave a single bar with nothing to compare against.
# reorder() + coord_flip() put the destination with the largest average at
# the top of the chart.
flights %>%
  group_by(dest) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(dest, avg_dep_delay), y = avg_dep_delay)) +
  geom_col(fill = "coral") +
  coord_flip() +
  labs(title = "Top 10 destinations by average departure delay",
       x = "Destination Airport",
       y = "Average Departure Delay (min)")
1f: Answer the question in (e) without creating a graph.
# 1f: destination with the longest average departure delay, as a table.
# Missing departure delays are ignored when averaging; exactly one row (the
# largest average) is returned.
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  slice_max(avg_dep_delay, n = 1, with_ties = FALSE)
## # A tibble: 1 × 2
## dest avg_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
1g: Find the carriers with the highest and the lowest average
flight speed for all their flights in the data set.
# 1g: average per-flight speed for each carrier, fastest carrier first.
# Speed is distance divided by air_time / 60 (the code treats air_time as
# minutes and names the result mph); flights missing either value are dropped.
speed_data <- flights %>%
  drop_na(distance, air_time) %>%
  transmute(carrier, speed_mph = distance / (air_time / 60)) %>%
  group_by(carrier) %>%
  summarise(avg_speed = mean(speed_mph)) %>%
  arrange(desc(avg_speed))
# Highest speed: speed_data is sorted by descending avg_speed, so the
# fastest carrier is the first row.
slice_head(speed_data, n = 1)
## # A tibble: 1 × 2
## carrier avg_speed
## <chr> <dbl>
## 1 HA 480.
# Lowest speed: the slowest carrier is the last row of the same table.
slice_tail(speed_data, n = 1)
## # A tibble: 1 × 2
## carrier avg_speed
## <chr> <dbl>
## 1 YV 332.
1h: Bonus
# 1h: average departure delay per day of the week, longest delay first.
# The weekday label is derived from a date built out of year/month/day;
# missing departure delays are ignored when averaging.
flights %>%
  mutate(weekday = wday(make_date(year, month, day), label = TRUE)) %>%
  group_by(weekday) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 7 × 2
## weekday avg_dep_delay
## <ord> <dbl>
## 1 Thu 16.1
## 2 Mon 14.8
## 3 Fri 14.7
## 4 Wed 11.8
## 5 Sun 11.6
## 6 Tue 10.6
## 7 Sat 7.65
2. Analyzing the seattlepets data set
2a: How many species
are there in the data set? What are they?
# 2a: one row per distinct species; the row count of the result is the
# number of species in the data set.
distinct(seattlepets, species)
## # A tibble: 4 × 1
## species
## <chr>
## 1 Dog
## 2 Cat
## 3 Goat
## 4 Pig
2b: What are the most popular primary breeds for cats and dogs,
respectively?
# For Cats: the single most popular primary breed.
# sort = TRUE orders breeds by descending count, so the first row is the most
# common breed; an unsorted count only lists breeds alphabetically and does
# not answer the question. Rows with a missing breed are excluded.
# NOTE: the printed table shown below predates this change -- re-knit to
# refresh it.
seattlepets %>%
  filter(species == "Cat", !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE) %>%
  head(1)
## # A tibble: 58 × 2
## primary_breed n
## <chr> <int>
## 1 Abyssinian 48
## 2 American Bobtail 5
## 3 American Curl 7
## 4 American Shorthair 860
## 5 American Wirehair 5
## 6 Angora 6
## 7 Asian Shorthair 2
## 8 Balinese 27
## 9 Bengal 66
## 10 Birman 26
## # ℹ 48 more rows
# For Dogs: the single most popular primary breed.
# sort = TRUE orders breeds by descending count, so the first row is the most
# common breed; an unsorted count only lists breeds alphabetically and does
# not answer the question. Rows with a missing breed are excluded.
# NOTE: the printed table shown below predates this change -- re-knit to
# refresh it.
seattlepets %>%
  filter(species == "Dog", !is.na(primary_breed)) %>%
  count(primary_breed, sort = TRUE) %>%
  head(1)
## # A tibble: 275 × 2
## primary_breed n
## <chr> <int>
## 1 Abruzzese Mastiff 1
## 2 Affenpinscher 5
## 3 Afghan Hound 6
## 4 Akbash 4
## 5 Akita 66
## 6 Alapaha Blue Blood Bulldog 2
## 7 Alaskan Husky 80
## 8 Alaskan Klee Kai 24
## 9 Alaskan Malamute 95
## 10 American Bandogge Mastiff 1
## # ℹ 265 more rows
2c: What are the three most common pet names in Seattle?
# 2c: the three most frequent pet names across all species.
# Pets without a recorded name are left out of the tally.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name) %>%
  arrange(desc(n)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
2d: What are the ten most common pet names for cats? What are the
ten most common pet names for dogs?
# Top 10 Cat Names
# Tally named cats by name and keep the ten most frequent names.
seattlepets %>%
  filter(species == "Cat") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
# Top 10 Dog Names
# Tally named dogs by name and keep the ten most frequent names.
seattlepets %>%
  filter(species == "Dog") %>%
  drop_na(animal_name) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
2e: How many names appear more than 100 times in the data set
excluding “NA”?
# 2e: number of distinct pet names that occur more than 100 times.
# Missing names are removed first; each remaining name is tallied and the
# names whose tally exceeds 100 are counted.
seattlepets %>%
  drop_na(animal_name) %>%
  count(animal_name) %>%
  summarise(total = sum(n > 100)) %>%
  pull(total)
## [1] 56