# flights ----
# Flights from June and July that have a recorded arrival delay.
# The result is stored in `my_data`, the name the plotting code reads; the
# previous target `flightsmy_data` had the "flights" section heading fused
# into the variable name, leaving `my_data` undefined (or stale) for the plot.
my_data <- flights %>%
  filter(between(month, 6, 7), !is.na(arr_delay)) %>%
  print()
## # A tibble: 55,368 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 6 1 2 2359 3 341 350 -9 B6
## 2 2013 6 1 451 500 -9 624 640 -16 US
## 3 2013 6 1 506 515 -9 715 800 -45 UA
## 4 2013 6 1 534 545 -11 800 829 -29 UA
## 5 2013 6 1 538 545 -7 925 922 3 B6
## 6 2013 6 1 539 540 -1 832 840 -8 AA
## 7 2013 6 1 546 600 -14 850 910 -20 UA
## 8 2013 6 1 551 600 -9 828 850 -22 AA
## 9 2013 6 1 552 600 -8 647 655 -8 US
## 10 2013 6 1 553 600 -7 700 711 -11 EV
## # … with 55,358 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Histogram of the arrival delays held in `my_data`.
# Bins are 25 minutes wide and aligned so that one bin edge sits at 100.5;
# the x axis is limited to the 0-500 minute range.
ggplot(data = my_data) +
  geom_histogram(
    mapping = aes(x = arr_delay),
    binwidth = 25,
    boundary = 100.5
  ) +
  labs(
    title = "Arrival delay distribution of June and July",
    x = "arrival delay in minutes"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  ) +
  xlim(0, 500)
# June and July flights that arrived late (strictly positive arrival delay).
# Rows whose arr_delay is missing fail the comparison and are dropped as well.
my_data <- flights %>%
  filter(month >= 6, month <= 7, arr_delay > 0) %>%
  print()
## # A tibble: 25,794 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 6 1 538 545 -7 925 922 3 B6
## 2 2013 6 1 559 600 -1 658 656 2 US
## 3 2013 6 1 624 600 24 727 720 7 MQ
## 4 2013 6 1 644 642 2 824 819 5 EV
## 5 2013 6 1 655 655 0 915 914 1 UA
## 6 2013 6 1 706 630 36 844 803 41 EV
## 7 2013 6 1 729 635 54 943 904 39 DL
## 8 2013 6 1 729 730 -1 1048 1045 3 VX
## 9 2013 6 1 732 735 -3 901 900 1 WN
## 10 2013 6 1 735 710 25 944 925 19 WN
## # … with 25,784 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Same histogram layout as before, drawn from the current `my_data`:
# 25-minute bins with an edge at 100.5 and an x axis limited to 0-500.
ggplot(data = my_data) +
  geom_histogram(
    mapping = aes(x = arr_delay),
    binwidth = 25,
    boundary = 100.5
  ) +
  labs(
    title = "Arrival delay distribution of June and July",
    x = "arrival delay in minutes"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  ) +
  xlim(0, 500)
# Every flight of the year that has a recorded arrival delay
# (rows with a missing arr_delay are removed).
my_data <- flights %>%
  filter(!is.na(arr_delay)) %>%
  print()
## # A tibble: 327,346 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Bar chart of the mean arrival delay per month: stat_summary() reduces the
# rows of each month to their mean before the bar is drawn.
# Labels describe what is actually plotted: the x axis is the month, the bar
# height is a mean, and the legend is titled "Month" rather than showing the
# raw `as.factor(month)` expression.
ggplot(data = my_data) +
  stat_summary(
    mapping = aes(
      x = as.factor(month),
      y = arr_delay,
      fill = as.factor(month)
    ),
    geom = "bar",
    fun = "mean"
  ) +
  labs(
    title = "Arrival delay distribution of year 2013",
    x = "Month",
    y = "Mean arrival delay in minutes",
    fill = "Month"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )
# Only flights that arrived late: a strictly positive arr_delay.
# A missing arr_delay does not satisfy the test, so those rows are dropped.
my_data <- flights %>%
  filter(arr_delay > 0) %>%
  print()
## # A tibble: 133,004 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 554 558 -4 740 728 12 UA
## 5 2013 1 1 555 600 -5 913 854 19 B6
## 6 2013 1 1 558 600 -2 753 745 8 AA
## 7 2013 1 1 558 600 -2 924 917 7 UA
## 8 2013 1 1 559 600 -1 941 910 31 AA
## 9 2013 1 1 600 600 0 837 825 12 MQ
## 10 2013 1 1 602 605 -3 821 805 16 MQ
## # … with 132,994 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Bar chart of the mean arrival delay per month for the rows currently held
# in `my_data`; stat_summary() computes the per-month mean.
# Labels describe what is actually plotted: the x axis is the month, the bar
# height is a mean, and the legend is titled "Month" rather than showing the
# raw `as.factor(month)` expression.
ggplot(data = my_data) +
  stat_summary(
    mapping = aes(
      x = as.factor(month),
      y = arr_delay,
      fill = as.factor(month)
    ),
    geom = "bar",
    fun = "mean"
  ) +
  labs(
    title = "Arrival delay distribution of year 2013",
    x = "Month",
    y = "Mean arrival delay in minutes",
    fill = "Month"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )
# June and July flights with a recorded arrival delay, each tagged by size:
# "long_delay" for 60 minutes or more, "short_delay" for anything below that
# (which also covers on-time and early arrivals).
my_data <- flights %>%
  filter(month >= 6, month <= 7, !is.na(arr_delay)) %>%
  mutate(
    delay_flag = if_else(arr_delay >= 60, "long_delay", "short_delay")
  )
# Number of flights in each delay_flag category, one coloured bar per category.
ggplot(data = my_data) +
  geom_bar(mapping = aes(x = delay_flag, fill = delay_flag)) +
  labs(title = "Short delay vs Long delay") +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )
# Same categories as a share of all flights: each bar's height is its count
# divided by the total count across the bars.
ggplot(data = my_data) +
  geom_bar(
    mapping = aes(
      x = delay_flag,
      y = after_stat(count / sum(count)),
      fill = delay_flag
    )
  ) +
  labs(title = "Short delay vs Long delay by ratio", y = "Ratio") +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )
Reasoning: The first graph shows that the distribution of arrival delays in June and July is right-skewed, so most delays fall within a one-hour range; the last two graphs confirm this. The graph of the 2013 delay distribution gives the general impression that there are usually more delays during June and July. Since weather is a key cause of delays, I checked the weather records for both months in 2013, but nothing severe showed up. Air and ground congestion is another major cause of delays: June and July are the peak vacation season, so air traffic control may be handling a busier schedule, and if a scheduled flight pushes back from the gate late, it can affect the arrival of other flights.
# Flights leaving EWR on the first day of any month, restricted to rows
# where both the arrival delay and the departure delay are recorded.
my_data <- flights %>%
  filter(
    origin == "EWR",
    day == 1,
    !is.na(arr_delay),
    !is.na(dep_delay)
  ) %>%
  print()
## # A tibble: 3,847 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 554 558 -4 740 728 12 UA
## 3 2013 1 1 555 600 -5 913 854 19 B6
## 4 2013 1 1 558 600 -2 923 937 -14 UA
## 5 2013 1 1 559 600 -1 854 902 -8 UA
## 6 2013 1 1 601 600 1 844 850 -6 B6
## 7 2013 1 1 606 610 -4 858 910 -12 AA
## 8 2013 1 1 607 607 0 858 915 -17 UA
## 9 2013 1 1 608 600 8 807 735 32 MQ
## 10 2013 1 1 615 615 0 833 842 -9 DL
## # … with 3,837 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
# Smoothed trend of departure delay (y axis) against arrival delay (x axis)
# for the rows in `my_data`.
ggplot(data = my_data) +
  geom_smooth(mapping = aes(x = arr_delay, y = dep_delay)) +
  labs(
    title = "Arrival vs Departure delay on the first days in 2013",
    x = "arrival delay in minutes",
    y = "departure delay in minutes"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.5)),
    axis.title = element_text(size = rel(1.1))
  )
Reasoning: The graph shows a positive relationship between arrival delay and departure delay for the first day of every month of 2013 at EWR airport. This makes sense: if an arriving flight is delayed, it still takes about the same time to get it maintained, equipped, etc., and air traffic control also needs to set a new schedule, so the delay carries over like a chain reaction.
# Flight(s) covering the shortest distance in the data set. Every row tied at
# the minimum is kept, and origin, dest and distance are moved to the front.
my_data <- flights %>%
  filter(distance == min(distance)) %>%
  relocate(origin, dest, distance) %>%
  print()
## # A tibble: 1 × 19
## origin dest dista…¹ year month day dep_t…² sched…³ dep_d…⁴ arr_t…⁵ sched…⁶
## <chr> <chr> <dbl> <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 EWR LGA 17 2013 7 27 NA 106 NA NA 245
## # … with 8 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, air_time <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # and abbreviated variable names ¹distance, ²dep_time, ³sched_dep_time,
## # ⁴dep_delay, ⁵arr_time, ⁶sched_arr_time
Answer: The origin is EWR, the destination is LGA, and the distance is 17 miles.
# Label every flight by distance: under 500 is "short-distanced", 500 or more
# is "long-distanced". The new column is moved to the front of the table.
my_data <- flights %>%
  mutate(
    distance_flag = if_else(
      distance >= 500,
      "long-distanced",
      "short-distanced"
    )
  ) %>%
  relocate(distance_flag) %>%
  print()
## # A tibble: 336,776 × 20
## distance_…¹ year month day dep_t…² sched…³ dep_d…⁴ arr_t…⁵ sched…⁶ arr_d…⁷
## <chr> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl>
## 1 long-dista… 2013 1 1 517 515 2 830 819 11
## 2 long-dista… 2013 1 1 533 529 4 850 830 20
## 3 long-dista… 2013 1 1 542 540 2 923 850 33
## 4 long-dista… 2013 1 1 544 545 -1 1004 1022 -18
## 5 long-dista… 2013 1 1 554 600 -6 812 837 -25
## 6 long-dista… 2013 1 1 554 558 -4 740 728 12
## 7 long-dista… 2013 1 1 555 600 -5 913 854 19
## 8 short-dist… 2013 1 1 557 600 -3 709 723 -14
## 9 long-dista… 2013 1 1 557 600 -3 838 846 -8
## 10 long-dista… 2013 1 1 558 600 -2 753 745 8
## # … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹distance_flag, ²dep_time, ³sched_dep_time, ⁴dep_delay, ⁵arr_time,
## # ⁶sched_arr_time, ⁷arr_delay
# Mean departure delay per destination, computed over flights with a recorded
# dep_delay, ordered from the largest mean downwards; only the nine
# destinations with the largest means are kept.
my_data <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  arrange(desc(mean_delay)) %>%
  head(n = 9) %>%
  print()
## # A tibble: 9 × 2
## dest mean_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
# One bar per destination, with height equal to its mean_delay value.
# `my_data` is expected to hold one already-averaged row per destination, so
# the values are drawn as-is with geom_col() instead of being passed through
# a second, redundant mean in stat_summary().
ggplot(data = my_data) +
  geom_col(mapping = aes(x = dest, y = mean_delay, fill = dest))
Answer: The destination airport CAE has the longest average departure delay.
# Mean departure delay for every destination (flights without a recorded
# dep_delay are excluded first), sorted so the largest mean comes first.
my_data <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  arrange(desc(mean_delay)) %>%
  print()
## # A tibble: 104 × 2
## dest mean_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
## 10 MSN 23.6
## # … with 94 more rows
Answer: The destination airport CAE has the longest average departure delay.
# Slowest and fastest average flight speed per carrier, over flights with a
# recorded air_time.
# speed = distance / air_time * 60, i.e. distance per hour if air_time is in
# minutes (presumably miles per hour - confirm against the data set docs).
# True division is used here: the earlier `%/%` floored distance-per-minute
# to a whole number before scaling, so every speed came out as a multiple of
# 60. The printed table below predates this fix and must be regenerated.
my_data <- flights %>%
  filter(!is.na(air_time)) %>%
  mutate(speed = distance / air_time * 60) %>%
  group_by(carrier) %>%
  summarise(min_speed = min(speed), max_speed = max(speed)) %>%
  print()
## # A tibble: 16 × 3
## carrier min_speed max_speed
## <chr> <dbl> <dbl>
## 1 9E 60 480
## 2 AA 60 540
## 3 AS 360 480
## 4 B6 60 540
## 5 DL 120 660
## 6 EV 120 600
## 7 F9 300 480
## 8 FL 240 480
## 9 HA 420 480
## 10 MQ 120 480
## 11 OO 240 360
## 12 UA 60 540
## 13 US 60 480
## 14 VX 360 480
## 15 WN 120 480
## 16 YV 120 420
# seattlepets ----
# Number of registered pets per species.
# The result is stored in `my_data`, which is what gets printed and plotted
# next; the previous target `seattlepetsmy_data` had the "seattlepets"
# section heading fused into the variable name.
my_data <- count(seattlepets, species)
my_data
## # A tibble: 4 × 2
## species n
## <chr> <int>
## 1 Cat 17294
## 2 Dog 35181
## 3 Goat 38
## 4 Pig 6
# One bar per species, with height equal to its count `n`.
# The counts are already computed, so geom_col() draws them directly. The
# earlier stat_summary() call had no `fun`, which made ggplot2 fall back to
# mean_se() and emit a "No summary function supplied" message.
ggplot(data = my_data) +
  geom_col(mapping = aes(x = species, y = n, fill = species))
## No summary function supplied, defaulting to `mean_se()`
Answer: There are 4 species in total: cat, dog, goat and pig.
# How often each pet name occurs (missing names excluded), with the most
# frequent names listed first.
my_data <- seattlepets %>%
  filter(!is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  print()
## # A tibble: 13,929 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
## 4 Bella 331
## 5 Max 270
## 6 Daisy 261
## 7 Molly 240
## 8 Jack 232
## 9 Lily 232
## 10 Stella 227
## # … with 13,919 more rows
Answer: The three most common pet names are Lucy, Charlie and Luna.
# Ten most frequent names among cats, ignoring cats without a name.
# count() already returns just animal_name and n, sorted here by descending n.
my_data <- seattlepets %>%
  filter(species == "Cat", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(n = 10) %>%
  print()
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
# Ten most frequent names among dogs, ignoring dogs without a name.
# count() already returns just animal_name and n, sorted here by descending n.
my_data <- seattlepets %>%
  filter(species == "Dog", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(n = 10) %>%
  print()
## # A tibble: 10 × 2
## animal_name n
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
# Pet names that occur more than 100 times. Missing names are removed, each
# remaining name is tallied into `n`, and only tallies above 100 are kept.
my_data <- seattlepets %>%
  filter(!is.na(animal_name)) %>%
  group_by(animal_name) %>%
  summarise(n = n()) %>%
  filter(n > 100) %>%
  print()
## # A tibble: 56 × 2
## animal_name n
## <chr> <int>
## 1 Abby 115
## 2 Bailey 157
## 3 Bear 109
## 4 Bella 331
## 5 Buddy 218
## 6 Charlie 387
## 7 Chloe 173
## 8 Coco 147
## 9 Cooper 205
## 10 Daisy 261
## # … with 46 more rows
Answer: In total, 56 names appear more than 100 times.
# Tally how many times each non-missing pet name occurs in `seattlepets`,
# giving one row per name with the tally stored in `num`.
my_data <- (seattlepets) %>%
filter(!is.na(animal_name)) %>%
group_by(animal_name) %>%
summarise(num=n()) %>%
# NOTE(review): filter() is called with no condition, so as written it keeps
# every row - the intended predicate appears to be missing; confirm.
filter()