1. Data set flights


Question a: Create a histogram of arrival delays (excluding NAs) for all flights in June and July. Make sense of your result.

# Question 1a data: June and July flights that have a recorded arrival delay.
my_data <- flights %>%
  filter(month %in% c(6, 7), !is.na(arr_delay)) %>%
  print()
## # A tibble: 55,368 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     6     1        2       2359       3     341     350      -9 B6     
##  2  2013     6     1      451        500      -9     624     640     -16 US     
##  3  2013     6     1      506        515      -9     715     800     -45 UA     
##  4  2013     6     1      534        545     -11     800     829     -29 UA     
##  5  2013     6     1      538        545      -7     925     922       3 B6     
##  6  2013     6     1      539        540      -1     832     840      -8 AA     
##  7  2013     6     1      546        600     -14     850     910     -20 UA     
##  8  2013     6     1      551        600      -9     828     850     -22 AA     
##  9  2013     6     1      552        600      -8     647     655      -8 US     
## 10  2013     6     1      553        600      -7     700     711     -11 EV     
## # … with 55,358 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
# Histogram of every recorded June/July arrival delay, early arrivals included.
# coord_cartesian() only zooms the view; xlim(0, 500) would instead turn every
# delay outside [0, 500] into NA before binning, which silently removes all
# early arrivals from a plot that is meant to show the whole distribution.
ggplot(my_data, aes(x = arr_delay)) +
  geom_histogram(binwidth = 25, boundary = 100.5) +
  labs(
    title = "Arrival delay distribution of June and July",
    x = "arrival delay in minutes"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  ) +
  # NOTE(review): -100 is an assumed lower bound for the visible window; widen
  # it if the data contains earlier arrivals than that.
  coord_cartesian(xlim = c(-100, 500))

# June and July flights that arrived late. Rows with a missing arr_delay drop
# out as well, because filter() keeps only rows where the condition is TRUE.
my_data <- filter(flights, month %in% c(6, 7), arr_delay > 0)
print(my_data)
## # A tibble: 25,794 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     6     1      538        545      -7     925     922       3 B6     
##  2  2013     6     1      559        600      -1     658     656       2 US     
##  3  2013     6     1      624        600      24     727     720       7 MQ     
##  4  2013     6     1      644        642       2     824     819       5 EV     
##  5  2013     6     1      655        655       0     915     914       1 UA     
##  6  2013     6     1      706        630      36     844     803      41 EV     
##  7  2013     6     1      729        635      54     943     904      39 DL     
##  8  2013     6     1      729        730      -1    1048    1045       3 VX     
##  9  2013     6     1      732        735      -3     901     900       1 WN     
## 10  2013     6     1      735        710      25     944     925      19 WN     
## # … with 25,784 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
 # Histogram of the positive arrival delays, limited to the 0-500 minute range.
 ggplot(my_data, aes(x = arr_delay)) +
   geom_histogram(binwidth = 25, boundary = 100.5) +
   xlim(0, 500) +
   labs(
     title = "Arrival delay distribution of June and July",
     x = "arrival delay in minutes"
   ) +
   theme(
     plot.title = element_text(hjust = 0.5, size = rel(1.3)),
     axis.title = element_text(size = rel(1.1))
   )

# All flights of the year that have a recorded arrival delay.
my_data <- filter(flights, !is.na(arr_delay))
print(my_data)
## # A tibble: 327,346 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
# One bar per month; stat_summary(fun = "mean") sets each bar's height to the
# mean arr_delay of that month's flights. The axis and legend labels name what
# is actually drawn (month on x, a mean on y).
ggplot(
  my_data,
  aes(x = factor(month), y = arr_delay, fill = factor(month))
) +
  stat_summary(geom = "bar", fun = "mean") +
  labs(
    title = "Arrival delay distribution of year 2013",
    x = "month",
    y = "mean arrival delay in minutes",
    fill = "month"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )

# All flights of the year that arrived late (arr_delay > 0). Rows with a
# missing arr_delay are dropped by filter() too.
my_data <- flights %>%
  filter(arr_delay > 0)
print(my_data)
## # A tibble: 133,004 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      554        558      -4     740     728      12 UA     
##  5  2013     1     1      555        600      -5     913     854      19 B6     
##  6  2013     1     1      558        600      -2     753     745       8 AA     
##  7  2013     1     1      558        600      -2     924     917       7 UA     
##  8  2013     1     1      559        600      -1     941     910      31 AA     
##  9  2013     1     1      600        600       0     837     825      12 MQ     
## 10  2013     1     1      602        605      -3     821     805      16 MQ     
## # … with 132,994 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
# One bar per month for the flights currently held in my_data;
# stat_summary(fun = "mean") sets each bar's height to the mean arr_delay.
# The axis and legend labels name what is actually drawn.
ggplot(
  my_data,
  aes(x = factor(month), y = arr_delay, fill = factor(month))
) +
  stat_summary(geom = "bar", fun = "mean") +
  labs(
    title = "Arrival delay distribution of year 2013",
    x = "month",
    y = "mean arrival delay in minutes",
    fill = "month"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )

# Tag every June/July flight by its arrival delay: anything under 60 minutes
# (early and on-time arrivals included) is "short_delay", the rest "long_delay".
my_data <- flights %>%
  filter(month %in% c(6, 7), !is.na(arr_delay)) %>%
  mutate(
    delay_flag = if_else(arr_delay < 60, "short_delay", "long_delay")
  )

# Number of flights in each delay category.
ggplot(my_data, aes(x = delay_flag, fill = delay_flag)) +
  geom_bar() +
  labs(title = "Short delay vs Long delay") +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )

# Share of flights in each delay category: each bar's count over the total.
ggplot(my_data, aes(x = delay_flag, fill = delay_flag)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  labs(title = "Short delay vs Long delay by ratio", y = "Ratio") +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.3)),
    axis.title = element_text(size = rel(1.1))
  )

Reasoning: The first graph shows that the distribution of arrival delays in June and July is right-skewed, and that most delays fall within one hour; the last two graphs confirm this. The graph of 2013 delays by month suggests that there tend to be more delays in June and July. Weather is a key cause of delays, so I checked the weather records for both months in 2013, but nothing severe showed up. Air and ground congestion is another major cause: June and July are the peak vacation season, so air traffic control is probably dealing with a busier schedule, and if one scheduled flight pushes back from the gate late, it can delay the arrival of other flights.


Question b: Create a line graph of arrival delays vs departure delays for all flights departing from EWR on the first day of each month. Make sense of your result.

# Question 1b data: flights leaving EWR on the 1st of any month, keeping only
# rows where both the departure and the arrival delay are recorded.
my_data <- flights %>%
  filter(origin == "EWR", day == 1) %>%
  filter(!is.na(arr_delay), !is.na(dep_delay)) %>%
  print()
## # A tibble: 3,847 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      554        558      -4     740     728      12 UA     
##  3  2013     1     1      555        600      -5     913     854      19 B6     
##  4  2013     1     1      558        600      -2     923     937     -14 UA     
##  5  2013     1     1      559        600      -1     854     902      -8 UA     
##  6  2013     1     1      601        600       1     844     850      -6 B6     
##  7  2013     1     1      606        610      -4     858     910     -12 AA     
##  8  2013     1     1      607        607       0     858     915     -17 UA     
##  9  2013     1     1      608        600       8     807     735      32 MQ     
## 10  2013     1     1      615        615       0     833     842      -9 DL     
## # … with 3,837 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
# Smoothed trend line with arrival delay on x and departure delay on y.
ggplot(my_data, aes(x = arr_delay, y = dep_delay)) +
  geom_smooth() +
  labs(
    title = "Arrival vs Departure delay on the first days in 2013",
    x = "arrival delay in minutes",
    y = "departure delay in minutes"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = rel(1.5)),
    axis.title = element_text(size = rel(1.1))
  )

Reasoning: The graph shows a positive relationship between arrival delay and departure delay for flights leaving EWR on the first day of each month in 2013. This makes sense: when an arriving flight is delayed, the aircraft still needs about the same amount of time to be serviced, equipped, etc., and air traffic control also has to assign it a new slot, so the delay propagates like a chain reaction.


Question c: Find the flights that actually departed with the shortest travel distance. What is its origin and destination airport?

# Question 1c: the shortest-distance flight(s) among those that actually
# departed. Flights without a recorded departure time (dep_time is NA) are
# removed first; otherwise min(distance) can be set by a flight that never
# departed.
# NOTE(review): the printed output below this chunk was produced before the
# dep_time filter was added and has to be regenerated.
my_data <- flights %>%
  filter(!is.na(dep_time)) %>%
  filter(distance == min(distance)) %>%
  select(origin, dest, distance, everything()) %>%
  print()
## # A tibble: 1 × 19
##   origin dest  dista…¹  year month   day dep_t…² sched…³ dep_d…⁴ arr_t…⁵ sched…⁶
##   <chr>  <chr>   <dbl> <int> <int> <int>   <int>   <int>   <dbl>   <int>   <int>
## 1 EWR    LGA        17  2013     7    27      NA     106      NA      NA     245
## # … with 8 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, air_time <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   and abbreviated variable names ¹​distance, ²​dep_time, ³​sched_dep_time,
## #   ⁴​dep_delay, ⁵​arr_time, ⁶​sched_arr_time

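Answer: In the output above, the origin is EWR, the destination is LGA, and the distance is 17 miles. Note, however, that this flight has no recorded departure time (dep_time is NA), so it never actually departed; flights with a missing dep_time must be excluded to answer the question as asked.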


Question d: Create a new categorical variable with two labels. Flights with a travel distance shorter than 500 miles are marked as “short-distanced”, and otherwise “long-distanced”.

# Question 1d: flag flights under 500 miles as "short-distanced" and all
# others as "long-distanced", then move the new column to the front.
my_data <- flights %>%
  mutate(
    distance_flag = if_else(
      distance < 500, "short-distanced", "long-distanced"
    )
  ) %>%
  select(distance_flag, everything()) %>%
  print()
## # A tibble: 336,776 × 20
##    distance_…¹  year month   day dep_t…² sched…³ dep_d…⁴ arr_t…⁵ sched…⁶ arr_d…⁷
##    <chr>       <int> <int> <int>   <int>   <int>   <dbl>   <int>   <int>   <dbl>
##  1 long-dista…  2013     1     1     517     515       2     830     819      11
##  2 long-dista…  2013     1     1     533     529       4     850     830      20
##  3 long-dista…  2013     1     1     542     540       2     923     850      33
##  4 long-dista…  2013     1     1     544     545      -1    1004    1022     -18
##  5 long-dista…  2013     1     1     554     600      -6     812     837     -25
##  6 long-dista…  2013     1     1     554     558      -4     740     728      12
##  7 long-dista…  2013     1     1     555     600      -5     913     854      19
##  8 short-dist…  2013     1     1     557     600      -3     709     723     -14
##  9 long-dista…  2013     1     1     557     600      -3     838     846      -8
## 10 long-dista…  2013     1     1     558     600      -2     753     745       8
## # … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​distance_flag, ²​dep_time, ³​sched_dep_time, ⁴​dep_delay, ⁵​arr_time,
## #   ⁶​sched_arr_time, ⁷​arr_delay


Question e: Find the destination airport that has the longest average departure delay by creating a graph.

# Question 1e data: mean departure delay per destination (flights without a
# recorded dep_delay are ignored), keeping the nine largest means.
my_data <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  arrange(desc(mean_delay)) %>%
  slice_head(n = 9) %>%
  print()
## # A tibble: 9 × 2
##   dest  mean_delay
##   <chr>      <dbl>
## 1 CAE         35.6
## 2 TUL         34.9
## 3 OKC         30.6
## 4 BHM         29.7
## 5 TYS         28.5
## 6 JAC         26.5
## 7 DSM         26.2
## 8 RIC         23.6
## 9 ALB         23.6
# my_data already holds one mean per destination, so the values are drawn
# directly with geom_col() rather than being summarised a second time. Bars
# are ordered from the longest to the shortest mean delay, so the destination
# with the longest average departure delay is the leftmost bar.
ggplot(
  my_data,
  aes(x = reorder(dest, -mean_delay), y = mean_delay, fill = dest)
) +
  geom_col() +
  labs(
    x = "destination airport",
    y = "mean departure delay in minutes",
    fill = "dest"
  )

Answer: The destination airport CAE has the longest average departure delay.


Question f: Answer the question in (e) without creating a graph.

# Question 1f: mean departure delay per destination, largest first, so the
# answer is the first row of the printed table.
my_data <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(dest) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  arrange(desc(mean_delay))
print(my_data)
## # A tibble: 104 × 2
##    dest  mean_delay
##    <chr>      <dbl>
##  1 CAE         35.6
##  2 TUL         34.9
##  3 OKC         30.6
##  4 BHM         29.7
##  5 TYS         28.5
##  6 JAC         26.5
##  7 DSM         26.2
##  8 RIC         23.6
##  9 ALB         23.6
## 10 MSN         23.6
## # … with 94 more rows

Answer: The destination airport CAE has the longest average departure delay.


Question g: Find the carriers with the highest and the lowest average flight speed for all their flights in the data set.

# Question 1g: carriers with the highest and the lowest average flight speed.
# Per-flight speed uses true division; integer division (%/%) would truncate
# distance / air_time to a whole number before scaling, so every speed would
# come out as a multiple of 60.
# NOTE(review): the * 60 assumes distance is in miles and air_time in minutes,
# giving miles per hour -- confirm against the data set documentation.
# NOTE(review): the printed output below this chunk was produced by the
# previous per-carrier min/max computation and has to be regenerated.
my_data <- flights %>%
  filter(!is.na(air_time)) %>%
  mutate(speed = distance / air_time * 60) %>%
  group_by(carrier) %>%
  summarise(mean_speed = mean(speed)) %>%
  arrange(desc(mean_speed)) %>%
  # After sorting, the first row is the fastest carrier and the last row the
  # slowest; keep just those two.
  slice(c(1, n())) %>%
  print()
## # A tibble: 16 × 3
##    carrier min_speed max_speed
##    <chr>       <dbl>     <dbl>
##  1 9E             60       480
##  2 AA             60       540
##  3 AS            360       480
##  4 B6             60       540
##  5 DL            120       660
##  6 EV            120       600
##  7 F9            300       480
##  8 FL            240       480
##  9 HA            420       480
## 10 MQ            120       480
## 11 OO            240       360
## 12 UA             60       540
## 13 US             60       480
## 14 VX            360       480
## 15 WN            120       480
## 16 YV            120       420


Question h: Find flights on which weekday (from Monday to Sunday) had the longest departure delay on average.


2. Data set seattlepets


Question a: How many species are there in the data set? What are they?

# Question 2a: one row per species with the number of registered pets.
my_data <- seattlepets %>%
  count(species)
print(my_data)
## # A tibble: 4 × 2
##   species     n
##   <chr>   <int>
## 1 Cat     17294
## 2 Dog     35181
## 3 Goat       38
## 4 Pig         6
# The counts are already computed, so they are plotted directly with
# geom_col(); stat_summary() without a `fun` argument falls back to mean_se()
# and emits a "No summary function supplied" message.
ggplot(my_data, aes(x = species, y = n, fill = species)) +
  geom_col()

Answer: There are 4 species in total: cat, dog, goat, and pig.


Question b: What are the three most common pet names in Seattle?

  # Question 2b: how often each pet name occurs (unnamed pets excluded),
  # most frequent first.
  my_data <- seattlepets %>%
    filter(!is.na(animal_name)) %>%
    count(animal_name, sort = TRUE) %>%
    print()
## # A tibble: 13,929 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          439
##  2 Charlie       387
##  3 Luna          355
##  4 Bella         331
##  5 Max           270
##  6 Daisy         261
##  7 Molly         240
##  8 Jack          232
##  9 Lily          232
## 10 Stella        227
## # … with 13,919 more rows

Answer: The three most common pet names are Lucy, Charlie, and Luna.


Question c: What are the ten most common pet names for cats? What are the ten most common pet names for dogs? Write a code to print the result and their frequencies.

# Question 2c (cats): the ten most frequent cat names with their counts.
# count() already returns only animal_name and n, so no select() is needed.
my_data <- seattlepets %>%
  filter(species == "Cat", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(10) %>%
  print()
## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Luna          111
##  2 Lucy          102
##  3 Lily           86
##  4 Max            83
##  5 Bella          82
##  6 Charlie        81
##  7 Oliver         73
##  8 Jack           65
##  9 Sophie         59
## 10 Leo            54
# Question 2c (dogs): the ten most frequent dog names with their counts.
# count() already returns only animal_name and n, so no select() is needed.
my_data <- seattlepets %>%
  filter(species == "Dog", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  head(10) %>%
  print()
## # A tibble: 10 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          337
##  2 Charlie       306
##  3 Bella         249
##  4 Luna          244
##  5 Daisy         221
##  6 Cooper        189
##  7 Lola          187
##  8 Max           186
##  9 Molly         186
## 10 Stella        185


Question d: How many names appear more than 100 times in the data set excluding “NA”?

# Question 2d: names (unnamed pets excluded) that occur more than 100 times;
# the number of rows in the printed tibble is the answer.
name_counts <- count(filter(seattlepets, !is.na(animal_name)), animal_name)
my_data <- filter(name_counts, n > 100)
print(my_data)
## # A tibble: 56 × 2
##    animal_name     n
##    <chr>       <int>
##  1 Abby          115
##  2 Bailey        157
##  3 Bear          109
##  4 Bella         331
##  5 Buddy         218
##  6 Charlie       387
##  7 Chloe         173
##  8 Coco          147
##  9 Cooper        205
## 10 Daisy         261
## # … with 46 more rows

Answer: There are 56 names in total that appear more than 100 times.


Question e: For all names that appear more than 100 times in the data set, which has the highest “cat_to_dog” ratio? Which has the lowest? The “cat_to_dog” ratio can be computed this way - if a name appears 200 times, in which 150 are for cats and 50 are for dogs, the ratio is 150/50 = 3.

my_data <- (seattlepets) %>%
  filter(!is.na(animal_name)) %>%
  group_by(animal_name) %>%
  summarise(num=n()) %>%
  filter()