Working with the data set flights in the package nycflights13, answer the following questions by performing necessary data transformation/visualization.
# Distribution of arrival delays for flights in June and July.
# Flights with a missing arrival delay are removed before plotting.
flights %>%
  filter(month %in% c(6, 7), !is.na(arr_delay)) %>%
  ggplot(mapping = aes(x = arr_delay)) +
  geom_histogram(bins = 50, fill = "blue", color = "black") +
  # Restrict the x axis to -30..200 minutes; delays outside that window are
  # not shown in the histogram.
  xlim(-30, 200) +
  labs(
    title = "arrival delays (excluding NAs) for all flights in June and July",
    x = "arrival delay (minutes)",
    y = "number of flights"
  )
The histogram is right-skewed: most flights arrive close to on time (or early), while long arrival delays are comparatively uncommon.
# Scatter plot of arrival delay (x) against departure delay (y) for flights
# leaving EWR on the first day of each month, with a fitted linear trend line.
# Rows missing either delay are removed so every plotted point is complete.
flights %>%
  filter(
    origin == "EWR",
    day == 1,
    !is.na(arr_delay),
    !is.na(dep_delay)
  ) %>%
  ggplot(aes(x = arr_delay, y = dep_delay)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "arrival delays vs departure delays for all flights departing from EWR on the first day of each month",
    x = "arrival delays (minutes)",
    y = "departure delays (minutes)"
  )
The plot shows a strong positive, roughly linear relationship between arrival and departure delays: flights that depart late tend to arrive late by a similar amount.
# List flights that have a recorded departure time, ordered from the shortest
# to the longest distance, keeping only the route columns.
flights %>%
  filter(!is.na(dep_time) & !is.na(distance)) %>%
  select(origin, dest, distance) %>%
  arrange(distance)
## # A tibble: 328,521 × 3
## origin dest distance
## <chr> <chr> <dbl>
## 1 EWR PHL 80
## 2 EWR PHL 80
## 3 EWR PHL 80
## 4 EWR PHL 80
## 5 EWR PHL 80
## 6 EWR PHL 80
## 7 EWR PHL 80
## 8 EWR PHL 80
## 9 EWR PHL 80
## 10 EWR PHL 80
## # ℹ 328,511 more rows
The shortest flight that actually departed is from EWR to PHL, a distance of 80 miles.
# Label every flight by distance: under 500 is "short-distance", everything
# else is "long-distance" (a missing distance stays NA).
flight_dist <- flights %>%
  mutate(
    flight_dist = if_else(distance >= 500, "long-distance", "short-distance")
  )
# Bar chart counting flights in each distance category; bars are coloured by
# the same category that defines the x axis.
ggplot(data = flight_dist, mapping = aes(x = flight_dist, fill = flight_dist)) +
  geom_bar() +
  labs(
    title = "Number of Short- vs Long-Distance Flights",
    x = "Flight distance category",
    y = "Number of flights"
  )
Long-distance flights (500 miles or more) are more common than short-distance flights.
# Horizontal bar chart of the mean departure delay for each destination,
# with destinations ordered by that mean. Missing delays are ignored when
# averaging.
flights %>%
  group_by(dest) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(
    aes(
      x = avg_dep_delay,
      y = reorder(dest, avg_dep_delay),
      fill = avg_dep_delay
    )
  ) +
  geom_col() +
  labs(
    title = "destination airport that has the longest average departure delay",
    x = "average departure delay (minutes)",
    y = "destination airport"
  )
# Mean departure delay per destination, largest first; ties are broken
# alphabetically by destination code. Missing delays are ignored when
# averaging.
flights %>%
  group_by(dest) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay), dest)
## # A tibble: 105 × 2
## dest avg_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
## 10 MSN 23.6
## # ℹ 95 more rows
# Per-carrier average speed, computed as distance / air_time for each flight
# (presumably miles per minute -- confirm units against the dataset help page).
# Only flights with both distance and air time recorded are used; n_flights
# counts those flights.
carrier_speed <- flights %>%
  filter(!(is.na(distance) | is.na(air_time))) %>%
  group_by(carrier) %>%
  summarise(
    avg_speed = mean(distance / air_time, na.rm = TRUE),
    n_flights = n()
  )
# Carrier(s) with the highest average speed (ties, if any, are all returned)
slice_max(carrier_speed, order_by = avg_speed, n = 1)
## # A tibble: 1 × 3
## carrier avg_speed n_flights
## <chr> <dbl> <int>
## 1 HA 8.01 342
# Carrier(s) with the lowest average speed (ties, if any, are all returned)
slice_min(carrier_speed, order_by = avg_speed, n = 1)
## # A tibble: 1 × 3
## carrier avg_speed n_flights
## <chr> <dbl> <int>
## 1 YV 5.53 544
For the following questions, analyze the data set seattlepets in the package openintro. Read the help document and make sure that you understand the basic information about the data set before analysis.
# Count how many licensed pets belong to each species.
table(seattlepets[["species"]])
##
## Cat Dog Goat Pig
## 17294 35181 38 6
There are four species in the data set: cat, dog, goat, and pig.
# Most common primary breed within each species (cats and dogs only).
# summarise() removes only the last grouping level, so the counts remain
# grouped by species and slice_max() then picks the top breed per species.
seattlepets %>%
  filter(species == "Cat" | species == "Dog") %>%
  group_by(species, primary_breed) %>%
  summarise(n = n()) %>%
  arrange(species) %>%
  slice_max(order_by = n, n = 1)
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 3
## # Groups: species [2]
## species primary_breed n
## <chr> <chr> <int>
## 1 Cat Domestic Shorthair 10086
## 2 Dog Retriever, Labrador 4867
# Three most frequent pet names across all species; pets without a recorded
# name are excluded.
seattlepets %>%
  filter(!is.na(animal_name)) %>%
  count(animal_name) %>%
  arrange(desc(n)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
The three most common pet names are Lucy, Charlie, and Luna.
# Ten most common pet names for cats.
# slice_max() keeps ties by default, so more than ten rows can be returned
# when several names share the tenth-highest count.
seattlepets %>%
  filter(!is.na(animal_name), species == "Cat") %>%
  count(species, animal_name) %>%
  slice_max(order_by = n, n = 10) %>%
  arrange(species, desc(n), animal_name)
## # A tibble: 11 × 3
## species animal_name n
## <chr> <chr> <int>
## 1 Cat Luna 111
## 2 Cat Lucy 102
## 3 Cat Lily 86
## 4 Cat Max 83
## 5 Cat Bella 82
## 6 Cat Charlie 81
## 7 Cat Oliver 73
## 8 Cat Jack 65
## 9 Cat Sophie 59
## 10 Cat Leo 54
## 11 Cat Molly 54
# Ten most common pet names for dogs.
# slice_max() keeps ties by default, so more than ten rows can be returned
# when several names share the tenth-highest count.
seattlepets %>%
  filter(!is.na(animal_name), species == "Dog") %>%
  count(species, animal_name) %>%
  slice_max(order_by = n, n = 10) %>%
  arrange(species, desc(n), animal_name)
## # A tibble: 10 × 3
## species animal_name n
## <chr> <chr> <int>
## 1 Dog Lucy 337
## 2 Dog Charlie 306
## 3 Dog Bella 249
## 4 Dog Luna 244
## 5 Dog Daisy 221
## 6 Dog Cooper 189
## 7 Dog Lola 187
## 8 Dog Max 186
## 9 Dog Molly 186
## 10 Dog Stella 185
# Number of distinct pet names that occur more than 100 times; pets without
# a recorded name are excluded.
seattlepets %>%
  filter(!is.na(animal_name)) %>%
  count(animal_name) %>%
  filter(n > 100) %>%
  summarise(num_names = n())
## # A tibble: 1 × 1
## num_names
## <int>
## 1 56
# Cat-to-dog ratio for every pet name used more than 100 times in total
# across cats and dogs.
#
# Result columns: animal_name, Cat, Dog (counts per species), total
# (Cat + Dog), and cat_to_dog (Cat / Dog).
cat_dog_ratio <- seattlepets %>%
  filter(!is.na(animal_name), species %in% c("Cat", "Dog")) %>%
  group_by(species, animal_name) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = species,
    values_from = n,
    # A name used by only one species would otherwise get NA for the other,
    # making `total` NA and silently dropping the name from the analysis.
    values_fill = 0L
  ) %>%
  mutate(total = Cat + Dog) %>%
  # Dog > 0 guards the division below against names that no dog carries.
  filter(total > 100, Dog > 0) %>%
  mutate(cat_to_dog = Cat / Dog)
# Name with the highest cat-to-dog ratio (first row after sorting descending)
cat_dog_ratio %>%
  arrange(desc(cat_to_dog)) %>%
  slice_head(n = 1)
## # A tibble: 1 × 5
## animal_name Cat Dog total cat_to_dog
## <chr> <int> <int> <int> <dbl>
## 1 Shadow 53 79 132 0.671
# Name with the lowest cat-to-dog ratio (first row after sorting ascending)
cat_dog_ratio %>%
  arrange(cat_to_dog) %>%
  slice_head(n = 1)
## # A tibble: 1 × 5
## animal_name Cat Dog total cat_to_dog
## <chr> <int> <int> <int> <dbl>
## 1 Riley 9 117 126 0.0769