data_1a <- flights %>%
filter(month == c(6, 7), !is.na(arr_delay)) %>%
select(month, day, arr_delay) %>%
mutate(date = paste(month, day, sep = "_"))
ggplot(data_1a) +
geom_histogram(aes(x = arr_delay, fill = as.factor(month)), position = "dodge") +
labs(
title = "Arrival Delays in June and July",
x = "Count",
y = "Arrival Delay"
) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Answer: Most of the flights from June and July did not delay their arrival. A very small amount of flights arrival pretty late.
data_1b <- flights %>%
filter(origin == "EWR", day == 1) %>%
select(arr_delay, dep_delay, month) %>%
pivot_longer(
cols = c(arr_delay, dep_delay),
names_to = "delay_type",
values_to = "delay"
)
ggplot(data_1b) +
geom_smooth(aes(month, delay, colour = delay_type)) +
labs(
title = "Arrival Delays VS Departure Delay For Flights From EWR",
x = "Month",
y = "Delay Time"
) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Answer: The graph shows that there is strong correlation between EWR flight arrival delays time and flight departure delays.
data_1c <- flights %>%
arrange(distance)
print(head(data_1c))
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 27 NA 106 NA NA 245
## 2 2013 1 3 2127 2129 -2 2222 2224
## 3 2013 1 4 1240 1200 40 1333 1306
## 4 2013 1 4 1829 1615 134 1937 1721
## 5 2013 1 4 2128 2129 -1 2218 2224
## 6 2013 1 5 1155 1200 -5 1241 1306
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
c(data_1c[1,]$origin, data_1c[1,]$dest, data_1c[1,]$distance)
## [1] "EWR" "LGA" "17"
c(data_1c[2,]$origin, data_1c[2,]$dest, data_1c[2,]$distance)
## [1] "EWR" "PHL" "80"
Answer: The shortest travel distant is 17 from EWR to LGA. However, it did not provide the time data. In case the data is an error, the second shortest travel distant is 80 from EWR to PHL.
data_1d <- flights %>%
mutate(distant_group = cut(distance, breaks = c(0, 500, Inf), labels = c("short-distance", "long-distance")))
ggplot(data_1d) +
geom_bar(aes(distant_group)) +
labs(
title = "Count of Distant Level of Flights",
x = "Count",
y = "Distant Level"
) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Answer: There are more long distant flights than the short distant flights.
data_1e <- flights %>%
filter(!is.na(dep_delay)) %>%
group_by(dest) %>%
mutate(avg_dep_delay = mean(dep_delay)) %>%
select(dest, avg_dep_delay)
ggplot(data_1e) +
geom_point(aes(dest, avg_dep_delay)) +
labs(
title = "Average Departure Delay Of Each Destination Airport",
x = "Destionation Airport",
y = "Averge Departure Delay time"
) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)),
axis.text.x = element_text(angle = 45))
Answer: The destination airport that has the longest average departure delay is CAE from the graph.
data_1f <- flights %>%
filter(!is.na(dep_delay)) %>%
group_by(dest) %>%
mutate(avg_dep_delay = mean(dep_delay)) %>%
select(dest, avg_dep_delay) %>%
arrange(desc(avg_dep_delay))
data_1f[1,]$dest
## [1] "CAE"
data_1g <- flights %>%
filter(!is.na(air_time), !is.na(distance)) %>%
mutate(avg_fs = distance/air_time) %>%
select(carrier, air_time, distance, avg_fs)
g_1 <- data_1g %>%
arrange(avg_fs)
g_1[1,]$carrier
## [1] "US"
g_2 <- data_1g %>%
arrange(desc(avg_fs))
g_2[1,]$carrier
## [1] "DL"
Answer: The lowest average flight speed of the carriers is US; The highest average flight speed of carrier is DL.
unique(seattlepets$species)
## [1] "Dog" "Cat" "Goat" "Pig"
Answers: There are totally four species in the data set. They are dog, cat, goat, pig.
data_2b_dog <- seattlepets %>%
filter(species == c("Dog")) %>%
group_by(primary_breed) %>%
summarise(count = length(species)) %>%
arrange(desc(count))
data_2b_cat <- seattlepets %>%
filter(species == c("Cat")) %>%
group_by(primary_breed) %>%
summarise(count = length(species)) %>%
arrange(desc(count))
paste("The most popular primary breeds for cat:",
data_2b_cat[1,]$primary_breed, ";",
"The most popular primary breeds for dog:",
data_2b_dog[1,]$primary_breed)
## [1] "The most popular primary breeds for cat: Domestic Shorthair ; The most popular primary breeds for dog: Retriever, Labrador"
data_2c <- seattlepets %>%
filter(!is.na(animal_name)) %>%
group_by(animal_name) %>%
summarise(count = length(animal_name)) %>%
arrange(desc(count))
paste(data_2c[1,]$animal_name,
data_2c[2,]$animal_name,
data_2c[3,]$animal_name, sep = ",")
## [1] "Lucy,Charlie,Luna"
Answer: The three most popular pet names are Lucy, Charlie and Luna.
data_2d_dog <- seattlepets %>%
filter(species == c("Dog"), !is.na(animal_name)) %>%
group_by(animal_name) %>%
summarise(count = length(animal_name)) %>%
arrange(desc(count))
data_2d_cat <- seattlepets %>%
filter(species == c("Cat"), !is.na(animal_name)) %>%
group_by(animal_name) %>%
summarise(count = length(animal_name)) %>%
arrange(desc(count))
print("10 most common pet names for Cats and Dogs:")
## [1] "10 most common pet names for Cats and Dogs:"
print("Dog:")
## [1] "Dog:"
paste(data_2d_dog[1:10,]$animal_name, data_2d_dog[1:10,]$count ,seq = ",")
## [1] "Lucy 337 ," "Charlie 306 ," "Bella 249 ," "Luna 244 ,"
## [5] "Daisy 221 ," "Cooper 189 ," "Lola 187 ," "Max 186 ,"
## [9] "Molly 186 ," "Stella 185 ,"
print("Cat:")
## [1] "Cat:"
paste(data_2d_cat[1:10,]$animal_name, data_2d_cat[1:10,]$count, seq = ",")
## [1] "Luna 111 ," "Lucy 102 ," "Lily 86 ," "Max 83 ," "Bella 82 ,"
## [6] "Charlie 81 ," "Oliver 73 ," "Jack 65 ," "Sophie 59 ," "Leo 54 ,"
data_2e <- seattlepets %>%
filter(!is.na(animal_name)) %>%
group_by(animal_name) %>%
summarise(count = length(animal_name)) %>%
filter(count > 100)
print("Names appeared more than 100 times:")
## [1] "Names appeared more than 100 times:"
paste(data_2e$animal_name)
## [1] "Abby" "Bailey" "Bear" "Bella" "Buddy" "Charlie" "Chloe"
## [8] "Coco" "Cooper" "Daisy" "Ellie" "Finn" "Ginger" "Gracie"
## [15] "Gus" "Henry" "Jack" "Jake" "Jasper" "Kona" "Leo"
## [22] "Lily" "Loki" "Lola" "Louie" "Lucy" "Lulu" "Luna"
## [29] "Maggie" "Max" "Mia" "Milo" "Molly" "Murphy" "Olive"
## [36] "Oliver" "Ollie" "Oscar" "Penny" "Pepper" "Riley" "Rocky"
## [43] "Rosie" "Ruby" "Sadie" "Sam" "Scout" "Shadow" "Sophie"
## [50] "Stella" "Teddy" "Toby" "Tucker" "Winston" "Zoe" "Zoey"
data_2f_cat <- seattlepets %>%
filter(animal_name == data_2e$animal_name, species == "Cat") %>%
group_by(animal_name) %>%
summarise(cat_count = length(animal_name))
data_2f_dog <- seattlepets %>%
filter(animal_name == data_2e$animal_name, species == "Dog") %>%
group_by(animal_name) %>%
summarise(dog_count = length(animal_name))
data_2f <- left_join(data_2f_cat, data_2f_dog)
summarise(data_2f, petnames = animal_name, cat_to_dog_ratio = cat_count/dog_count)
## # A tibble: 27 × 2
## petnames cat_to_dog_ratio
## <chr> <dbl>
## 1 Abby NA
## 2 Bear NA
## 3 Bella 0.333
## 4 Ellie 0.4
## 5 Ginger 0.5
## 6 Henry 1
## 7 Jack 0.25
## 8 Lily 1.25
## 9 Loki NA
## 10 Lola 0.333
## # ℹ 17 more rows
Comments: For NA ratio, the name is being taken for either only dog or cat.