library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
# Scatterplot from the mpg data with hwy on the x axis and cyl on the y axis.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = hwy, y = cyl))
# page 12 exercise 2
# displ against hwy with every point drawn in blue. The colour is given
# outside aes(), so it is a fixed value rather than a mapped variable.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(colour = "blue")
# Same axes as above, with the point colour mapped to cty.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = cty))
# page 21 exercise 6
# Points plus one smoother fitted to all of the data, drawn without a
# confidence band (se = FALSE).
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# One smoother per drv value (grouped only, not coloured). The smoothers are
# added before the points, so the points are drawn on top of them.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth(aes(group = drv), se = FALSE) +
  geom_point()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# colour = drv is mapped at the plot level, so both the points and the
# smoothers inherit it.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Only the points are coloured by drv; the smoother uses the plot-level
# aesthetics and is therefore fitted to all of the data at once.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Points coloured by drv, plus one smoother per drv value distinguished by
# line type.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv)) +
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Two point layers: size-4 white points first, then default-size points
# coloured by drv drawn on top of them.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "white", size = 4) +
  geom_point(mapping = aes(colour = drv))
# page 49 exercise 1
# Flights whose destination is IAH or HOU, spelled with an explicit `|`.
flights %>%
  filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same destination filter, spelled with %in%.
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights whose carrier code is AA, DL or UA.
flights %>%
  filter(carrier %in% c("AA", "DL", "UA"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 600 -6 812
## 5 2013 1 1 554 558 -4 740
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 600 -1 941
## 10 2013 1 1 559 600 -1 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights in months 7 through 9, spelled as two range comparisons.
flights %>%
  filter(month >= 7, month <= 9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same month filter, spelled with %in% over 7:9.
flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same month filter, spelled as a chain of equality tests joined with `|`.
flights %>%
  filter(month == 7 | month == 8 | month == 9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights with arr_delay above 120 whose dep_delay is zero or negative.
flights %>%
  filter(arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 27 1419 1420 -1 1754
## 2 2013 10 7 1350 1350 0 1736
## 3 2013 10 7 1357 1359 -2 1858
## 4 2013 10 16 657 700 -3 1258
## 5 2013 11 1 658 700 -2 1329
## 6 2013 3 18 1844 1847 -3 39
## 7 2013 4 17 1635 1640 -5 2049
## 8 2013 4 18 558 600 -2 1149
## 9 2013 4 18 655 700 -5 1213
## 10 2013 5 22 1827 1830 -3 2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights with dep_delay of at least 60 whose arr_delay is more than 30
# below their dep_delay.
flights %>%
  filter(dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 2205 1720 285 46
## 2 2013 1 1 2326 2130 116 131
## 3 2013 1 3 1503 1221 162 1803
## 4 2013 1 3 1839 1700 99 2056
## 5 2013 1 3 1850 1745 65 2148
## 6 2013 1 3 1941 1759 102 2246
## 7 2013 1 3 1950 1845 65 2228
## 8 2013 1 3 2015 1915 60 2135
## 9 2013 1 3 2257 2000 177 45
## 10 2013 1 4 1917 1700 137 2135
## # ... with 1,834 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Five-number summary (plus mean and NA count) of the dep_time column.
flights %>%
  pull(dep_time) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 907 1401 1349 1744 2400 8255
# Flights with dep_time up to and including 600. The value 2400 is matched
# separately because it would not satisfy the <= 600 test.
flights %>%
  filter(dep_time <= 600 | dep_time == 2400)
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Compare three ranking helpers on a small vector that contains ties.
rankme <- tibble(x = c(10, 5, 1, 5, 5)) %>%
  mutate(
    x_row_number = row_number(x),
    x_min_rank = min_rank(x),
    x_dense_rank = dense_rank(x)
  )
# Print the rows ordered by x so the tied values sit next to each other.
rankme %>%
  arrange(x)
## # A tibble: 5 x 4
## x x_row_number x_min_rank x_dense_rank
## <dbl> <int> <int> <int>
## 1 1 1 1 1
## 2 5 2 2 2
## 3 5 3 2 2
## 4 5 4 2 2
## 5 10 5 5 3
# Rank every flight by descending dep_delay with three different ranking
# helpers, keep the flights whose rank is at most 10 under all three, and
# order them by min_rank.
flights_delayed <- flights %>%
  mutate(
    dep_delay_min_rank = min_rank(desc(dep_delay)),
    dep_delay_row_number = row_number(desc(dep_delay)),
    dep_delay_dense_rank = dense_rank(desc(dep_delay))
  ) %>%
  filter(
    dep_delay_min_rank <= 10,
    dep_delay_row_number <= 10,
    dep_delay_dense_rank <= 10
  ) %>%
  arrange(dep_delay_min_rank)

# Print the identifying columns next to the three ranks; n = Inf prints
# every row instead of the default truncated view.
flights_delayed %>%
  select(
    month, day, carrier, flight, dep_delay,
    dep_delay_min_rank, dep_delay_row_number, dep_delay_dense_rank
  ) %>%
  print(n = Inf)
## # A tibble: 10 x 8
## month day carrier flight dep_delay dep_delay_min_r~ dep_delay_row_n~
## <int> <int> <chr> <int> <dbl> <int> <int>
## 1 1 9 HA 51 1301 1 1
## 2 6 15 MQ 3535 1137 2 2
## 3 1 10 MQ 3695 1126 3 3
## 4 9 20 AA 177 1014 4 4
## 5 7 22 MQ 3075 1005 5 5
## 6 4 10 DL 2391 960 6 6
## 7 3 17 DL 2119 911 7 7
## 8 6 27 DL 2007 899 8 8
## 9 7 22 DL 2047 898 9 9
## 10 12 5 AA 172 896 10 10
## # ... with 1 more variable: dep_delay_dense_rank <int>
# Per-day totals: the number of flights and how many of them count as
# cancelled. A flight is treated as cancelled when either its arrival delay
# or its departure delay is missing.
cancelled_per_day <-
  flights %>%
  mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_num = sum(cancelled),
    flights_num = n()
  ) %>%
  # summarise() only removes the last grouping level; drop the remaining
  # ones so the stored tibble is ungrouped, like cancelled_and_delays.
  ungroup()

# Cancelled flights per day plotted against total flights per day.
ggplot(cancelled_per_day) +
  geom_point(aes(x = flights_num, y = cancelled_num))
# Per-day share of cancelled flights together with the mean departure and
# arrival delay of that day (missing delays are ignored in the means). A
# flight is treated as cancelled when either delay is missing.
cancelled_and_delays <- flights %>%
  mutate(cancelled = is.na(arr_delay) | is.na(dep_delay)) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_prop = mean(cancelled),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ungroup()

# Share of cancelled flights against the mean departure delay of the day.
ggplot(cancelled_and_delays, aes(x = avg_dep_delay, y = cancelled_prop)) +
  geom_point()

# Share of cancelled flights against the mean arrival delay of the day.
ggplot(cancelled_and_delays, aes(x = avg_arr_delay, y = cancelled_prop)) +
  geom_point()
# page 73 exercise 5
# Mean arrival delay per carrier, largest first. Missing arrival delays are
# ignored when averaging.
flights %>%
  group_by(carrier) %>%
  summarise(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 x 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
# For each carrier: how its mean arrival delay on a route (origin, dest)
# compares with the mean arrival delay of all other carriers on that same
# route, averaged over the routes the carrier shares with someone else.
flights %>%
  filter(!is.na(arr_delay)) %>%
  # Per (origin, dest, carrier): summed delay and number of flights.
  group_by(origin, dest, carrier) %>%
  summarise(
    arr_delay = sum(arr_delay),
    flights = n()
  ) %>%
  # Per (origin, dest): the same totals across all carriers, repeated on
  # every carrier row of that route.
  group_by(origin, dest) %>%
  mutate(
    arr_delay_total = sum(arr_delay),
    flights_total = sum(flights)
  ) %>%
  ungroup() %>%
  # This carrier's mean delay versus the mean over everyone else's flights.
  mutate(
    arr_delay_mean = arr_delay / flights,
    arr_delay_others = (arr_delay_total - arr_delay) /
      (flights_total - flights),
    arr_delay_diff = arr_delay_mean - arr_delay_others
  ) %>%
  # With a single carrier on a route the "others" mean is 0 / 0 = NaN;
  # drop those rows.
  filter(is.finite(arr_delay_diff)) %>%
  # Unweighted average of the per-route differences for each carrier.
  group_by(carrier) %>%
  summarise(arr_delay_diff = mean(arr_delay_diff)) %>%
  arrange(desc(arr_delay_diff))
## # A tibble: 15 x 2
## carrier arr_delay_diff
## <chr> <dbl>
## 1 OO 27.3
## 2 F9 17.3
## 3 EV 11.0
## 4 B6 6.41
## 5 FL 2.57
## 6 VX -0.202
## 7 AA -0.970
## 8 WN -1.27
## 9 UA -1.86
## 10 MQ -2.48
## 11 YV -2.81
## 12 9E -3.54
## 13 US -4.14
## 14 DL -10.2
## 15 AS -15.8
# mean() collapses its input to one value, so it depends on grouping:
# x_mean is computed over all rows, x_mean_2 separately within each group.
tibble(x = seq_len(9), group = rep(letters[1:3], each = 3)) %>%
  mutate(x_mean = mean(x)) %>%
  group_by(group) %>%
  mutate(x_mean_2 = mean(x))
## # A tibble: 9 x 4
## # Groups: group [3]
## x group x_mean x_mean_2
## <int> <chr> <dbl> <dbl>
## 1 1 a 5 2
## 2 2 a 5 2
## 3 3 a 5 2
## 4 4 b 5 5
## 5 5 b 5 5
## 6 6 b 5 5
## 7 7 c 5 8
## 8 8 c 5 8
## 9 9 c 5 8
# Element-wise addition computed before (y) and after (z) grouping, so the
# two columns can be compared side by side.
tibble(x = seq_len(9), group = rep(letters[1:3], each = 3)) %>%
  mutate(y = x + 2) %>%
  group_by(group) %>%
  mutate(z = x + 2)
## # A tibble: 9 x 4
## # Groups: group [3]
## x group y z
## <int> <chr> <dbl> <dbl>
## 1 1 a 3 3
## 2 2 a 4 4
## 3 3 a 5 5
## 4 4 b 6 6
## 5 5 b 7 7
## 6 6 b 8 8
## 7 7 c 9 9
## 8 8 c 10 10
## 9 9 c 11 11
# Element-wise modulo computed before (y) and after (z) grouping, so the two
# columns can be compared side by side.
tibble(x = seq_len(9), group = rep(letters[1:3], each = 3)) %>%
  mutate(y = x %% 2) %>%
  group_by(group) %>%
  mutate(z = x %% 2)
## # A tibble: 9 x 4
## # Groups: group [3]
## x group y z
## <int> <chr> <dbl> <dbl>
## 1 1 a 1 1
## 2 2 a 0 0
## 3 3 a 1 1
## 4 4 b 0 0
## 5 5 b 1 1
## 6 6 b 0 0
## 7 7 c 1 1
## 8 8 c 0 0
## 9 9 c 1 1
# Element-wise log() computed before (y) and after (z) grouping, so the two
# columns can be compared side by side.
tibble(x = seq_len(9), group = rep(letters[1:3], each = 3)) %>%
  mutate(y = log(x)) %>%
  group_by(group) %>%
  mutate(z = log(x))
## # A tibble: 9 x 4
## # Groups: group [3]
## x group y z
## <int> <chr> <dbl> <dbl>
## 1 1 a 0 0
## 2 2 a 0.693 0.693
## 3 3 a 1.10 1.10
## 4 4 b 1.39 1.39
## 5 5 b 1.61 1.61
## 6 6 b 1.79 1.79
## 7 7 c 1.95 1.95
## 8 8 c 2.08 2.08
## 9 9 c 2.20 2.20
# Share of on-time flights per plane (tailnum), keeping the plane(s) with the
# lowest share. A flight counts as on time when it has an arrival time and
# its arrival delay is zero or negative.
# NOTE(review): for a row that has arr_time but a missing arr_delay, on_time
# is NA; the plane's mean is then NA and the final filter drops the plane.
# Confirm whether such rows exist and whether dropping is intended.
flights %>%
  filter(!is.na(tailnum)) %>%
  mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
  group_by(tailnum) %>%
  summarise(
    on_time = mean(on_time),
    n = n()
  ) %>%
  filter(min_rank(on_time) == 1)
## # A tibble: 110 x 3
## tailnum on_time n
## <chr> <dbl> <int>
## 1 N121DE 0 2
## 2 N136DL 0 1
## 3 N143DA 0 1
## 4 N17627 0 2
## 5 N240AT 0 5
## 6 N26906 0 1
## 7 N295AT 0 4
## 8 N302AS 0 1
## 9 N303AS 0 1
## 10 N32626 0 1
## # ... with 100 more rows
# Quantiles of the number of flights recorded per tailnum.
# NOTE(review): rows with a missing tailnum are not filtered out here, so
# they are counted as one group of their own -- confirm that is intended.
flights %>%
  count(tailnum) %>%
  pull(n) %>%
  quantile()
## 0% 25% 50% 75% 100%
## 1 23 54 110 2512
# Lowest on-time share per plane again, restricted to planes with at least
# 20 flights so that planes with only a handful of flights do not dominate.
# NOTE(review): for a row that has arr_time but a missing arr_delay, on_time
# is NA; the plane's mean is then NA and the final filter drops the plane.
# Confirm whether such rows exist and whether dropping is intended.
flights %>%
  filter(!is.na(tailnum)) %>%
  mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
  group_by(tailnum) %>%
  summarise(
    on_time = mean(on_time),
    n = n()
  ) %>%
  filter(n >= 20) %>%
  filter(min_rank(on_time) == 1)
## # A tibble: 1 x 3
## tailnum on_time n
## <chr> <dbl> <int>
## 1 N988AT 0.189 37
# Plane (tailnum) with the highest mean arrival delay among planes that have
# at least 20 flights with a recorded arrival delay.
flights %>%
  # Drop rows without a tail number or without an arrival delay up front.
  # Otherwise mean() returns NA for any plane with even one missing
  # arr_delay, min_rank() ranks that plane as NA, and the final filter
  # silently leaves it out of the comparison. As a consequence n counts only
  # flights that have an arrival delay.
  filter(!is.na(tailnum), !is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(arr_delay = mean(arr_delay), n = n()) %>%
  filter(n >= 20) %>%
  filter(min_rank(desc(arr_delay)) == 1)
## # A tibble: 1 x 3
## tailnum arr_delay n
## <chr> <dbl> <int>
## 1 N203FR 59.1 41
# Mean arrival delay for each value of hour, smallest mean first. Missing
# arrival delays are ignored when averaging.
flights %>%
  group_by(hour) %>%
  summarise(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(arr_delay)
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 7 -5.30
## 2 5 -4.80
## 3 6 -3.38
## 4 9 -1.45
## 5 8 -1.11
## 6 10 0.954
## 7 11 1.48
## 8 12 3.49
## 9 13 6.54
## 10 14 9.20
## 11 23 11.8
## 12 15 12.3
## 13 16 12.6
## 14 18 14.8
## 15 22 16.0
## 16 17 16.0
## 17 19 16.7
## 18 20 16.7
## 19 21 18.4
## 20 1 NaN
# For every flight with a positive arrival delay: its share of the total
# positive arrival delay of its destination, listed per destination with the
# largest share first. The result stays grouped by dest.
flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest) %>%
  # sum() is evaluated within each destination because of the grouping.
  mutate(arr_delay_prop = arr_delay / sum(arr_delay)) %>%
  select(
    dest, month, day, dep_time, carrier, flight,
    arr_delay, arr_delay_prop
  ) %>%
  arrange(dest, desc(arr_delay_prop))
## # A tibble: 133,004 x 8
## # Groups: dest [103]
## dest month day dep_time carrier flight arr_delay arr_delay_prop
## <chr> <int> <int> <int> <chr> <int> <dbl> <dbl>
## 1 ABQ 7 22 2145 B6 1505 153 0.0341
## 2 ABQ 12 14 2223 B6 65 149 0.0332
## 3 ABQ 10 15 2146 B6 65 138 0.0308
## 4 ABQ 7 23 2206 B6 1505 137 0.0305
## 5 ABQ 12 17 2220 B6 65 136 0.0303
## 6 ABQ 7 10 2025 B6 1505 126 0.0281
## 7 ABQ 7 30 2212 B6 1505 118 0.0263
## 8 ABQ 7 28 2038 B6 1505 117 0.0261
## 9 ABQ 12 8 2049 B6 65 114 0.0254
## 10 ABQ 9 2 2212 B6 1505 109 0.0243
## # ... with 132,994 more rows
# Same idea at the level of a flight number: sum the positive arrival delays
# per (dest, origin, carrier, flight), then express each sum as a share of
# its destination's total. The result stays grouped by dest.
flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest, origin, carrier, flight) %>%
  summarise(arr_delay = sum(arr_delay)) %>%
  # Regroup by destination only, so sum() below is the destination total.
  group_by(dest) %>%
  mutate(arr_delay_prop = arr_delay / sum(arr_delay)) %>%
  select(carrier, flight, origin, dest, arr_delay_prop) %>%
  arrange(dest, desc(arr_delay_prop))
## # A tibble: 8,834 x 5
## # Groups: dest [103]
## carrier flight origin dest arr_delay_prop
## <chr> <int> <chr> <chr> <dbl>
## 1 B6 1505 JFK ABQ 0.567
## 2 B6 65 JFK ABQ 0.433
## 3 B6 1191 JFK ACK 0.475
## 4 B6 1491 JFK ACK 0.414
## 5 B6 1291 JFK ACK 0.0898
## 6 B6 1195 JFK ACK 0.0208
## 7 EV 4309 EWR ALB 0.174
## 8 EV 4271 EWR ALB 0.137
## 9 EV 4117 EWR ALB 0.0951
## 10 EV 4088 EWR ALB 0.0865
## # ... with 8,824 more rows