install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
install.packages("nycflights13")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(nycflights13)
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
#Q1: Find all flights on Jan 1. select, arrange, filter, mutate, transmutate, slice which one should we use?
flights %>% filter(month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q2: Try to find arrival delay (arr_delay) more than 120 minutes and show arr_delay in the fourth column
flights %>% filter(arr_delay >= 120) %>%
select(year:day, arr_delay, everything())
## # A tibble: 10,200 × 19
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 137 811 630 101 1047
## 2 2013 1 1 851 848 1835 853 1001
## 3 2013 1 1 123 957 733 144 1056
## 4 2013 1 1 145 1114 900 134 1447
## 5 2013 1 1 127 1505 1310 115 1638
## 6 2013 1 1 125 1525 1340 105 1831
## 7 2013 1 1 136 1549 1445 64 1912
## 8 2013 1 1 123 1558 1359 119 1718
## 9 2013 1 1 123 1732 1630 62 2028
## 10 2013 1 1 138 1803 1620 103 2008
## # … with 10,190 more rows, and 11 more variables: sched_arr_time <int>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q3: We need to find the longest arr_delay and its carrier, flight information. We need to think which function can rank data in descending order. Use arrange!
flights %>% arrange(desc(arr_delay)) %>%
select(carrier, flight, arr_delay, everything()) %>%
slice(1)
## # A tibble: 1 × 19
## carrier flight arr_delay year month day dep_time sched_dep_time dep_delay
## <chr> <int> <dbl> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 1272 2013 1 9 641 900 1301
## # … with 10 more variables: arr_time <int>, sched_arr_time <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
#Q4: Which carrier has the worst average arr_delay? its value? Use group_by, summarise/mutate and arrange
flights %>% group_by(carrier) %>%
# mutate(arr_delay_avg = mean(arr.delay,na.rm = TRUE )
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 16 × 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
#Q5: which hour should we take by avoiding the possible arr_delay (average arr_delay)? group_by(hour)
flights %>% group_by(hour) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% arrange(desc(arr_delay))
## # A tibble: 20 × 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
#Q6: Based on flights, select the first 100 data of year, month, day, hour, origin, dest, tailnum, carrier. Then Use left_join() to add airline names to answers in the above and save the joined resutls as ‘answer06’. Finally show the first 6 rows of data.
flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier") %>% head()
## # A tibble: 6 × 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
answer06 <- flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier") %>% head()
#Q7: Use left_join() to combine ‘answer06’ and ‘weather’, and then combine with ‘airports’. Hint: left_join(…, by = c(“dest” = “faa”))
answer06 %>% left_join(weather) %>%
left_join(airports, by = c("dest" = "faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 × 26
## year month day hour origin dest tailnum carrier name.x temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA United… 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA United… 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Americ… 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlu… 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delta … 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA United… 39.0 28.0 64.4
## # … with 14 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name.y <chr>,
## # lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
#Q8: Count the number of flights for dest = ALB, BDL and BTV in each month. (use n())
flights %>% filter(dest %in% c("ALB", "BDL", "BTV")) %>%
group_by(year, month, dest) %>%
summarise(count = n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 36 × 4
## # Groups: year, month [12]
## year month dest count
## <int> <int> <chr> <int>
## 1 2013 1 ALB 64
## 2 2013 1 BDL 37
## 3 2013 1 BTV 223
## 4 2013 2 ALB 58
## 5 2013 2 BDL 46
## 6 2013 2 BTV 189
## 7 2013 3 ALB 57
## 8 2013 3 BDL 62
## 9 2013 3 BTV 257
## 10 2013 4 ALB 13
## # … with 26 more rows
#Q9: Use carrier, month, origin to compute average departure delay time (dep_delay).
flights %>% select(carrier, month, origin, dep_delay) %>%
group_by(carrier, month, origin) %>%
mutate(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
arrange(desc(avg_dep_delay))
## # A tibble: 336,776 × 5
## # Groups: carrier, month, origin [399]
## carrier month origin dep_delay avg_dep_delay
## <chr> <int> <chr> <dbl> <dbl>
## 1 OO 1 LGA 67 67
## 2 OO 8 LGA -10 64
## 3 OO 8 LGA 154 64
## 4 OO 8 LGA 27 64
## 5 OO 8 LGA 85 64
## 6 OO 6 EWR -9 61
## 7 OO 6 EWR 131 61
## 8 HA 1 JFK -3 54.4
## 9 HA 1 JFK 9 54.4
## 10 HA 1 JFK 14 54.4
## # … with 336,766 more rows