library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
#p6 E4
ggplot(mpg,aes(x=hwy,y=cyl))+geom_point()
#p12, E2 ?mpg mpg #
#p12, E3
ggplot(mpg, aes(x=displ, y=hwy, colour=cty))+geom_point()
ggplot(mpg, aes(x=displ, y=hwy, size=cty))+geom_point()
Continuous variable can not be mapped. #p20, E1
geom_line()
## geom_line: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
geom_boxplot()
## geom_boxplot: outlier.colour = NULL, outlier.fill = NULL, outlier.shape = 19, outlier.size = 1.5, outlier.stroke = 0.5, outlier.alpha = NULL, notch = FALSE, notchwidth = 0.5, varwidth = FALSE, na.rm = FALSE
## stat_boxplot: na.rm = FALSE
## position_dodge2
geom_histogram()
## geom_bar: na.rm = FALSE
## stat_bin: binwidth = NULL, bins = NULL, na.rm = FALSE, pad = FALSE
## position_stack
geom_area()
## geom_area: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_stack
Histogram can be used to draw a line chart. #p21, E6
ggplot(mpg,aes(x=displ,y=hwy))+geom_point()+geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(mpg,aes(x=displ,y=hwy))+geom_smooth(mapping = aes(group=drv),se=FALSE)+geom_point()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) +
geom_point() +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(colour = drv)) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(colour = drv)) +
geom_smooth(aes(linetype = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(size = 4, color = "white") +
geom_point(aes(colour = drv))
#p49, E1
filter(flights,arr_delay >=120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 848 1835 853 1001
## 3 2013 1 1 957 733 144 1056
## 4 2013 1 1 1114 900 134 1447
## 5 2013 1 1 1505 1310 115 1638
## 6 2013 1 1 1525 1340 105 1831
## 7 2013 1 1 1549 1445 64 1912
## 8 2013 1 1 1558 1359 119 1718
## 9 2013 1 1 1732 1630 62 2028
## 10 2013 1 1 1803 1620 103 2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights,arr_delay >=120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 848 1835 853 1001
## 3 2013 1 1 957 733 144 1056
## 4 2013 1 1 1114 900 134 1447
## 5 2013 1 1 1505 1310 115 1638
## 6 2013 1 1 1525 1340 105 1831
## 7 2013 1 1 1549 1445 64 1912
## 8 2013 1 1 1558 1359 119 1718
## 9 2013 1 1 1732 1630 62 2028
## 10 2013 1 1 1803 1620 103 2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights,dest=="IAH"| dest=="HOU")
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, carrier %in% c("AA", "DL", "UA"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 600 -6 812
## 5 2013 1 1 554 558 -4 740
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 600 -1 941
## 10 2013 1 1 559 600 -1 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, month>=7, month<=9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, arr_delay>120, dep_delay<=0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 27 1419 1420 -1 1754
## 2 2013 10 7 1350 1350 0 1736
## 3 2013 10 7 1357 1359 -2 1858
## 4 2013 10 16 657 700 -3 1258
## 5 2013 11 1 658 700 -2 1329
## 6 2013 3 18 1844 1847 -3 39
## 7 2013 4 17 1635 1640 -5 2049
## 8 2013 4 18 558 600 -2 1149
## 9 2013 4 18 655 700 -5 1213
## 10 2013 5 22 1827 1830 -3 2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, dep_delay>=60, dep_delay-arr_delay>30)
## # A tibble: 1,844 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 2205 1720 285 46
## 2 2013 1 1 2326 2130 116 131
## 3 2013 1 3 1503 1221 162 1803
## 4 2013 1 3 1839 1700 99 2056
## 5 2013 1 3 1850 1745 65 2148
## 6 2013 1 3 1941 1759 102 2246
## 7 2013 1 3 1950 1845 65 2228
## 8 2013 1 3 2015 1915 60 2135
## 9 2013 1 3 2257 2000 177 45
## 10 2013 1 4 1917 1700 137 2135
## # ... with 1,834 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
summary(flights$dep_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 907 1401 1349 1744 2400 8255
filter(flights, dep_time<=600|dep_time==2400)
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#p50, E2, E3
filter(flights, between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#p51 E1, E2, E3, E4
arrange(flights, desc(is.na(dep_time)), dep_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43 40
## 2 2013 2 3 2022 2055 -33 2240
## 3 2013 11 10 1408 1440 -32 1549
## 4 2013 1 11 1900 1930 -30 2233
## 5 2013 1 29 1703 1730 -27 1947
## 6 2013 8 9 729 755 -26 1002
## 7 2013 10 23 1907 1932 -25 2143
## 8 2013 3 30 2030 2055 -25 2213
## 9 2013 3 2 1431 1455 -24 1601
## 10 2013 5 5 934 958 -24 1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
fastest_flights <- mutate(flights, mph = distance / air_time * 60)
fastest_flights <- select(
fastest_flights, mph, distance, air_time,
flight, origin, dest, year, month, day)
head(arrange(fastest_flights, desc(mph)))
## # A tibble: 6 x 9
## mph distance air_time flight origin dest year month day
## <dbl> <dbl> <dbl> <int> <chr> <chr> <int> <int> <int>
## 1 703. 762 65 1499 LGA ATL 2013 5 25
## 2 650. 1008 93 4667 EWR MSP 2013 7 2
## 3 648 594 55 4292 EWR GSP 2013 5 13
## 4 641. 748 70 3805 EWR BNA 2013 3 23
## 5 591. 1035 105 1902 LGA PBI 2013 1 12
## 6 564 1598 170 315 JFK SJU 2013 11 17
arrange(flights, desc(air_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 3 17 1337 1335 2 1937
## 2 2013 2 6 853 900 -7 1542
## 3 2013 3 15 1001 1000 1 1551
## 4 2013 3 17 1006 1000 6 1607
## 5 2013 3 16 1001 1000 1 1544
## 6 2013 2 5 900 900 0 1555
## 7 2013 11 12 936 930 6 1630
## 8 2013 3 14 958 1000 -2 1542
## 9 2013 11 20 1006 1000 6 1639
## 10 2013 3 15 1342 1335 7 1924
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, air_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 16 1355 1315 40 1442
## 2 2013 4 13 537 527 10 622
## 3 2013 12 6 922 851 31 1021
## 4 2013 2 3 2153 2129 24 2247
## 5 2013 2 5 1303 1315 -12 1342
## 6 2013 2 12 2123 2130 -7 2211
## 7 2013 3 2 1450 1500 -10 1547
## 8 2013 3 8 2026 1935 51 2131
## 9 2013 3 18 1456 1329 87 1533
## 10 2013 3 19 2226 2145 41 2305
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#p58, E4
rankme <- tibble(
x = c(10, 5, 1, 5, 5)
)
rankme <- mutate(rankme,
x_row_number = row_number(x),
x_min_rank = min_rank(x),
x_dense_rank = dense_rank(x)
)
arrange(rankme, x)
## # A tibble: 5 x 4
## x x_row_number x_min_rank x_dense_rank
## <dbl> <int> <int> <int>
## 1 1 1 1 1
## 2 5 2 2 2
## 3 5 3 2 2
## 4 5 4 2 2
## 5 10 5 5 3
flights_delayed <- mutate(flights,
dep_delay_min_rank = min_rank(desc(dep_delay)),
dep_delay_row_number = row_number(desc(dep_delay)),
dep_delay_dense_rank = dense_rank(desc(dep_delay))
)
flights_delayed <- filter(
flights_delayed,
!(dep_delay_min_rank > 10 | dep_delay_row_number > 10 |
dep_delay_dense_rank > 10)
)
flights_delayed <- arrange(flights_delayed, dep_delay_min_rank)
print(select(
flights_delayed, month, day, carrier, flight, dep_delay,
dep_delay_min_rank, dep_delay_row_number, dep_delay_dense_rank
),
n = Inf
)
## # A tibble: 10 x 8
## month day carrier flight dep_delay dep_delay_min_r~ dep_delay_row_n~
## <int> <int> <chr> <int> <dbl> <int> <int>
## 1 1 9 HA 51 1301 1 1
## 2 6 15 MQ 3535 1137 2 2
## 3 1 10 MQ 3695 1126 3 3
## 4 9 20 AA 177 1014 4 4
## 5 7 22 MQ 3075 1005 5 5
## 6 4 10 DL 2391 960 6 6
## 7 3 17 DL 2119 911 7 7
## 8 6 27 DL 2007 899 8 8
## 9 7 22 DL 2047 898 9 9
## 10 12 5 AA 172 896 10 10
## # ... with 1 more variable: dep_delay_dense_rank <int>
#p73, E4
cancelled_per_day <-
flights %>%
mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
group_by(year, month, day) %>%
summarise(
cancelled_num = sum(cancelled),
flights_num = n(),
)
ggplot(cancelled_per_day) +
geom_point(aes(x = flights_num, y = cancelled_num))
cancelled_and_delays <-
flights %>%
mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
group_by(year, month, day) %>%
summarise(
cancelled_prop = mean(cancelled),
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
avg_arr_delay = mean(arr_delay, na.rm = TRUE)
) %>%
ungroup()
ggplot(cancelled_and_delays) +
geom_point(aes(x = avg_dep_delay, y = cancelled_prop))
ggplot(cancelled_and_delays) +
geom_point(aes(x = avg_arr_delay, y = cancelled_prop))
#p73, E5
flights %>%
filter(!is.na(arr_delay)) %>%
group_by(origin, dest, carrier) %>%
summarise(
arr_delay = sum(arr_delay),
flights = n()
) %>%
group_by(origin, dest) %>%
mutate(
arr_delay_total = sum(arr_delay),
flights_total = sum(flights)
) %>%
ungroup() %>%
mutate(
arr_delay_others = (arr_delay_total - arr_delay) /
(flights_total - flights),
arr_delay_mean = arr_delay / flights,
arr_delay_diff = arr_delay_mean - arr_delay_others
) %>%
filter(is.finite(arr_delay_diff)) %>%
group_by(carrier) %>%
summarise(arr_delay_diff = mean(arr_delay_diff)) %>%
arrange(desc(arr_delay_diff))
## # A tibble: 15 x 2
## carrier arr_delay_diff
## <chr> <dbl>
## 1 OO 27.3
## 2 F9 17.3
## 3 EV 11.0
## 4 B6 6.41
## 5 FL 2.57
## 6 VX -0.202
## 7 AA -0.970
## 8 WN -1.27
## 9 UA -1.86
## 10 MQ -2.48
## 11 YV -2.81
## 12 9E -3.54
## 13 US -4.14
## 14 DL -10.2
## 15 AS -15.8
#p73, E7 The sort argument to count() sorts the results in order of n. You could use this anytime you would run count() followed by arrange(). #P75, E2
flights %>%
filter(!is.na(tailnum)) %>%
mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
group_by(tailnum) %>%
summarise(on_time = mean(on_time), n = n()) %>%
filter(min_rank(on_time) == 1)
## # A tibble: 110 x 3
## tailnum on_time n
## <chr> <dbl> <int>
## 1 N121DE 0 2
## 2 N136DL 0 1
## 3 N143DA 0 1
## 4 N17627 0 2
## 5 N240AT 0 5
## 6 N26906 0 1
## 7 N295AT 0 4
## 8 N302AS 0 1
## 9 N303AS 0 1
## 10 N32626 0 1
## # ... with 100 more rows
quantile(count(flights, tailnum)$n)
## 0% 25% 50% 75% 100%
## 1 23 54 110 2512
flights %>%
filter(!is.na(tailnum)) %>%
mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
group_by(tailnum) %>%
summarise(on_time = mean(on_time), n = n()) %>%
filter(n >= 20) %>%
filter(min_rank(on_time) == 1)
## # A tibble: 1 x 3
## tailnum on_time n
## <chr> <dbl> <int>
## 1 N988AT 0.189 37
flights %>%
group_by(tailnum) %>%
summarise(arr_delay = mean(arr_delay), n = n()) %>%
filter(n >= 20) %>%
filter(min_rank(desc(arr_delay)) == 1)
## # A tibble: 1 x 3
## tailnum arr_delay n
## <chr> <dbl> <int>
## 1 N203FR 59.1 41
#p75, E3
flights %>%
group_by(hour) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(arr_delay)
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 7 -5.30
## 2 5 -4.80
## 3 6 -3.38
## 4 9 -1.45
## 5 8 -1.11
## 6 10 0.954
## 7 11 1.48
## 8 12 3.49
## 9 13 6.54
## 10 14 9.20
## 11 23 11.8
## 12 15 12.3
## 13 16 12.6
## 14 18 14.8
## 15 22 16.0
## 16 17 16.0
## 17 19 16.7
## 18 20 16.7
## 19 21 18.4
## 20 1 NaN
#p75, E4
flights %>%
filter(arr_delay > 0) %>%
group_by(dest) %>%
mutate(
arr_delay_total = sum(arr_delay),
arr_delay_prop = arr_delay / arr_delay_total
) %>%
select(
dest, month, day, dep_time, carrier, flight,
arr_delay, arr_delay_prop
) %>%
arrange(dest, desc(arr_delay_prop))
## # A tibble: 133,004 x 8
## # Groups: dest [103]
## dest month day dep_time carrier flight arr_delay arr_delay_prop
## <chr> <int> <int> <int> <chr> <int> <dbl> <dbl>
## 1 ABQ 7 22 2145 B6 1505 153 0.0341
## 2 ABQ 12 14 2223 B6 65 149 0.0332
## 3 ABQ 10 15 2146 B6 65 138 0.0308
## 4 ABQ 7 23 2206 B6 1505 137 0.0305
## 5 ABQ 12 17 2220 B6 65 136 0.0303
## 6 ABQ 7 10 2025 B6 1505 126 0.0281
## 7 ABQ 7 30 2212 B6 1505 118 0.0263
## 8 ABQ 7 28 2038 B6 1505 117 0.0261
## 9 ABQ 12 8 2049 B6 65 114 0.0254
## 10 ABQ 9 2 2212 B6 1505 109 0.0243
## # ... with 132,994 more rows
flights %>%
filter(arr_delay > 0) %>%
group_by(dest, origin, carrier, flight) %>%
summarise(arr_delay = sum(arr_delay)) %>%
group_by(dest) %>%
mutate(
arr_delay_prop = arr_delay / sum(arr_delay)
) %>%
arrange(dest, desc(arr_delay_prop)) %>%
select(carrier, flight, origin, dest, arr_delay_prop)
## # A tibble: 8,834 x 5
## # Groups: dest [103]
## carrier flight origin dest arr_delay_prop
## <chr> <int> <chr> <chr> <dbl>
## 1 B6 1505 JFK ABQ 0.567
## 2 B6 65 JFK ABQ 0.433
## 3 B6 1191 JFK ACK 0.475
## 4 B6 1491 JFK ACK 0.414
## 5 B6 1291 JFK ACK 0.0898
## 6 B6 1195 JFK ACK 0.0208
## 7 EV 4309 EWR ALB 0.174
## 8 EV 4271 EWR ALB 0.137
## 9 EV 4117 EWR ALB 0.0951
## 10 EV 4088 EWR ALB 0.0865
## # ... with 8,824 more rows