library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)

exercise 4

ggplot(mpg, aes(x = hwy, y = cyl)) +
  geom_point()

# page 12:E2

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), colour = "blue")

ggplot(mpg, aes(x = displ, y = hwy, colour = cty)) +
  geom_point()

# page 21 exercise 6

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(group = drv), se = FALSE) +
  geom_point()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  ggplot(mpg, aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(aes(linetype = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(size = 4, color = "white") +
  geom_point(aes(colour = drv))

# page 49 exercise 1

filter(flights, dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      623            627        -4      933
##  4  2013     1     1      728            732        -4     1041
##  5  2013     1     1      739            739         0     1104
##  6  2013     1     1      908            908         0     1228
##  7  2013     1     1     1028           1026         2     1350
##  8  2013     1     1     1044           1045        -1     1352
##  9  2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      623            627        -4      933
##  4  2013     1     1      728            732        -4     1041
##  5  2013     1     1      739            739         0     1104
##  6  2013     1     1      908            908         0     1228
##  7  2013     1     1     1028           1026         2     1350
##  8  2013     1     1     1044           1045        -1     1352
##  9  2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, carrier %in% c("AA", "DL", "UA"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      554            600        -6      812
##  5  2013     1     1      554            558        -4      740
##  6  2013     1     1      558            600        -2      753
##  7  2013     1     1      558            600        -2      924
##  8  2013     1     1      558            600        -2      923
##  9  2013     1     1      559            600        -1      941
## 10  2013     1     1      559            600        -1      854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, month >= 7, month <= 9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, month %in% 7:9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, month == 7 | month == 8 | month == 9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1    27     1419           1420        -1     1754
##  2  2013    10     7     1350           1350         0     1736
##  3  2013    10     7     1357           1359        -2     1858
##  4  2013    10    16      657            700        -3     1258
##  5  2013    11     1      658            700        -2     1329
##  6  2013     3    18     1844           1847        -3       39
##  7  2013     4    17     1635           1640        -5     2049
##  8  2013     4    18      558            600        -2     1149
##  9  2013     4    18      655            700        -5     1213
## 10  2013     5    22     1827           1830        -3     2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     2205           1720       285       46
##  2  2013     1     1     2326           2130       116      131
##  3  2013     1     3     1503           1221       162     1803
##  4  2013     1     3     1839           1700        99     2056
##  5  2013     1     3     1850           1745        65     2148
##  6  2013     1     3     1941           1759       102     2246
##  7  2013     1     3     1950           1845        65     2228
##  8  2013     1     3     2015           1915        60     2135
##  9  2013     1     3     2257           2000       177       45
## 10  2013     1     4     1917           1700       137     2135
## # ... with 1,834 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
summary(flights$dep_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1     907    1401    1349    1744    2400    8255
filter(flights, dep_time <= 600 | dep_time == 2400)
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
rankme <- tibble(
  x = c(10, 5, 1, 5, 5)
)
rankme <- mutate(rankme,
  x_row_number = row_number(x),
  x_min_rank = min_rank(x),
  x_dense_rank = dense_rank(x)
)
arrange(rankme, x)
## # A tibble: 5 x 4
##       x x_row_number x_min_rank x_dense_rank
##   <dbl>        <int>      <int>        <int>
## 1     1            1          1            1
## 2     5            2          2            2
## 3     5            3          2            2
## 4     5            4          2            2
## 5    10            5          5            3
flights_delayed <- mutate(flights,
  dep_delay_min_rank = min_rank(desc(dep_delay)),
  dep_delay_row_number = row_number(desc(dep_delay)),
  dep_delay_dense_rank = dense_rank(desc(dep_delay))
)
flights_delayed <- filter(
  flights_delayed,
  !(dep_delay_min_rank > 10 | dep_delay_row_number > 10 |
    dep_delay_dense_rank > 10)
)
flights_delayed <- arrange(flights_delayed, dep_delay_min_rank)
print(select(
  flights_delayed, month, day, carrier, flight, dep_delay,
  dep_delay_min_rank, dep_delay_row_number, dep_delay_dense_rank
),
n = Inf
)
## # A tibble: 10 x 8
##    month   day carrier flight dep_delay dep_delay_min_r~ dep_delay_row_n~
##    <int> <int> <chr>    <int>     <dbl>            <int>            <int>
##  1     1     9 HA          51      1301                1                1
##  2     6    15 MQ        3535      1137                2                2
##  3     1    10 MQ        3695      1126                3                3
##  4     9    20 AA         177      1014                4                4
##  5     7    22 MQ        3075      1005                5                5
##  6     4    10 DL        2391       960                6                6
##  7     3    17 DL        2119       911                7                7
##  8     6    27 DL        2007       899                8                8
##  9     7    22 DL        2047       898                9                9
## 10    12     5 AA         172       896               10               10
## # ... with 1 more variable: dep_delay_dense_rank <int>

page 73 E4

cancelled_per_day <-
  flights %>%
  mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_num = sum(cancelled),
    flights_num = n(),
  )
ggplot(cancelled_per_day) +
  geom_point(aes(x = flights_num, y = cancelled_num))

cancelled_and_delays <-
  flights %>%
  mutate(cancelled = (is.na(arr_delay) | is.na(dep_delay))) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_prop = mean(cancelled),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ungroup()
ggplot(cancelled_and_delays) +
  geom_point(aes(x = avg_dep_delay, y = cancelled_prop))

ggplot(cancelled_and_delays) +
  geom_point(aes(x = avg_arr_delay, y = cancelled_prop))

# page 73 E5

flights %>%
  group_by(carrier) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 x 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93
flights %>%
  filter(!is.na(arr_delay)) %>%
  # Total delay by carrier within each origin, dest
  group_by(origin, dest, carrier) %>%
  summarise(
    arr_delay = sum(arr_delay),
    flights = n()
  ) %>%
  # Total delay within each origin dest
  group_by(origin, dest) %>%
  mutate(
    arr_delay_total = sum(arr_delay),
    flights_total = sum(flights)
  ) %>%
  # average delay of each carrier - average delay of other carriers
  ungroup() %>%
  mutate(
    arr_delay_others = (arr_delay_total - arr_delay) /
      (flights_total - flights),
    arr_delay_mean = arr_delay / flights,
    arr_delay_diff = arr_delay_mean - arr_delay_others
  ) %>%
  # remove NaN values (when there is only one carrier)
  filter(is.finite(arr_delay_diff)) %>%
  # average over all airports it flies to
  group_by(carrier) %>%
  summarise(arr_delay_diff = mean(arr_delay_diff)) %>%
  arrange(desc(arr_delay_diff))
## # A tibble: 15 x 2
##    carrier arr_delay_diff
##    <chr>            <dbl>
##  1 OO              27.3  
##  2 F9              17.3  
##  3 EV              11.0  
##  4 B6               6.41 
##  5 FL               2.57 
##  6 VX              -0.202
##  7 AA              -0.970
##  8 WN              -1.27 
##  9 UA              -1.86 
## 10 MQ              -2.48 
## 11 YV              -2.81 
## 12 9E              -3.54 
## 13 US              -4.14 
## 14 DL             -10.2  
## 15 AS             -15.8

page 73 E7

tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
) %>%
  mutate(x_mean = mean(x)) %>%
  group_by(group) %>%
  mutate(x_mean_2 = mean(x))
## # A tibble: 9 x 4
## # Groups:   group [3]
##       x group x_mean x_mean_2
##   <int> <chr>  <dbl>    <dbl>
## 1     1 a          5        2
## 2     2 a          5        2
## 3     3 a          5        2
## 4     4 b          5        5
## 5     5 b          5        5
## 6     6 b          5        5
## 7     7 c          5        8
## 8     8 c          5        8
## 9     9 c          5        8
tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
) %>%
  mutate(y = x + 2) %>%
  group_by(group) %>%
  mutate(z = x + 2)
## # A tibble: 9 x 4
## # Groups:   group [3]
##       x group     y     z
##   <int> <chr> <dbl> <dbl>
## 1     1 a         3     3
## 2     2 a         4     4
## 3     3 a         5     5
## 4     4 b         6     6
## 5     5 b         7     7
## 6     6 b         8     8
## 7     7 c         9     9
## 8     8 c        10    10
## 9     9 c        11    11
tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
) %>%
  mutate(y = x %% 2) %>%
  group_by(group) %>%
  mutate(z = x %% 2)
## # A tibble: 9 x 4
## # Groups:   group [3]
##       x group     y     z
##   <int> <chr> <dbl> <dbl>
## 1     1 a         1     1
## 2     2 a         0     0
## 3     3 a         1     1
## 4     4 b         0     0
## 5     5 b         1     1
## 6     6 b         0     0
## 7     7 c         1     1
## 8     8 c         0     0
## 9     9 c         1     1
tibble(
  x = 1:9,
  group = rep(c("a", "b", "c"), each = 3)
) %>%
  mutate(y = log(x)) %>%
  group_by(group) %>%
  mutate(z = log(x))
## # A tibble: 9 x 4
## # Groups:   group [3]
##       x group     y     z
##   <int> <chr> <dbl> <dbl>
## 1     1 a     0     0    
## 2     2 a     0.693 0.693
## 3     3 a     1.10  1.10 
## 4     4 b     1.39  1.39 
## 5     5 b     1.61  1.61 
## 6     6 b     1.79  1.79 
## 7     7 c     1.95  1.95 
## 8     8 c     2.08  2.08 
## 9     9 c     2.20  2.20

page 75 E2

flights %>%
  filter(!is.na(tailnum)) %>%
  mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
  group_by(tailnum) %>%
  summarise(on_time = mean(on_time), n = n()) %>%
  filter(min_rank(on_time) == 1)
## # A tibble: 110 x 3
##    tailnum on_time     n
##    <chr>     <dbl> <int>
##  1 N121DE        0     2
##  2 N136DL        0     1
##  3 N143DA        0     1
##  4 N17627        0     2
##  5 N240AT        0     5
##  6 N26906        0     1
##  7 N295AT        0     4
##  8 N302AS        0     1
##  9 N303AS        0     1
## 10 N32626        0     1
## # ... with 100 more rows
quantile(count(flights, tailnum)$n)
##   0%  25%  50%  75% 100% 
##    1   23   54  110 2512
flights %>%
  filter(!is.na(tailnum)) %>%
  mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
  group_by(tailnum) %>%
  summarise(on_time = mean(on_time), n = n()) %>%
  filter(n >= 20) %>%
  filter(min_rank(on_time) == 1)
## # A tibble: 1 x 3
##   tailnum on_time     n
##   <chr>     <dbl> <int>
## 1 N988AT    0.189    37
flights %>%
  group_by(tailnum) %>%
  summarise(arr_delay = mean(arr_delay), n = n()) %>%
  filter(n >= 20) %>%
  filter(min_rank(desc(arr_delay)) == 1)
## # A tibble: 1 x 3
##   tailnum arr_delay     n
##   <chr>       <dbl> <int>
## 1 N203FR       59.1    41

page 75 E3

flights %>%
  group_by(hour) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(arr_delay)
## # A tibble: 20 x 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1     7    -5.30 
##  2     5    -4.80 
##  3     6    -3.38 
##  4     9    -1.45 
##  5     8    -1.11 
##  6    10     0.954
##  7    11     1.48 
##  8    12     3.49 
##  9    13     6.54 
## 10    14     9.20 
## 11    23    11.8  
## 12    15    12.3  
## 13    16    12.6  
## 14    18    14.8  
## 15    22    16.0  
## 16    17    16.0  
## 17    19    16.7  
## 18    20    16.7  
## 19    21    18.4  
## 20     1   NaN

page 75 E4

flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest) %>%
  mutate(
    arr_delay_total = sum(arr_delay),
    arr_delay_prop = arr_delay / arr_delay_total
  ) %>%
  select(
    dest, month, day, dep_time, carrier, flight,
    arr_delay, arr_delay_prop
  ) %>%
  arrange(dest, desc(arr_delay_prop))
## # A tibble: 133,004 x 8
## # Groups:   dest [103]
##    dest  month   day dep_time carrier flight arr_delay arr_delay_prop
##    <chr> <int> <int>    <int> <chr>    <int>     <dbl>          <dbl>
##  1 ABQ       7    22     2145 B6        1505       153         0.0341
##  2 ABQ      12    14     2223 B6          65       149         0.0332
##  3 ABQ      10    15     2146 B6          65       138         0.0308
##  4 ABQ       7    23     2206 B6        1505       137         0.0305
##  5 ABQ      12    17     2220 B6          65       136         0.0303
##  6 ABQ       7    10     2025 B6        1505       126         0.0281
##  7 ABQ       7    30     2212 B6        1505       118         0.0263
##  8 ABQ       7    28     2038 B6        1505       117         0.0261
##  9 ABQ      12     8     2049 B6          65       114         0.0254
## 10 ABQ       9     2     2212 B6        1505       109         0.0243
## # ... with 132,994 more rows
flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest, origin, carrier, flight) %>%
  summarise(arr_delay = sum(arr_delay)) %>%
  group_by(dest) %>%
  mutate(
    arr_delay_prop = arr_delay / sum(arr_delay)
  ) %>%
  arrange(dest, desc(arr_delay_prop)) %>%
  select(carrier, flight, origin, dest, arr_delay_prop)
## # A tibble: 8,834 x 5
## # Groups:   dest [103]
##    carrier flight origin dest  arr_delay_prop
##    <chr>    <int> <chr>  <chr>          <dbl>
##  1 B6        1505 JFK    ABQ           0.567 
##  2 B6          65 JFK    ABQ           0.433 
##  3 B6        1191 JFK    ACK           0.475 
##  4 B6        1491 JFK    ACK           0.414 
##  5 B6        1291 JFK    ACK           0.0898
##  6 B6        1195 JFK    ACK           0.0208
##  7 EV        4309 EWR    ALB           0.174 
##  8 EV        4271 EWR    ALB           0.137 
##  9 EV        4117 EWR    ALB           0.0951
## 10 EV        4088 EWR    ALB           0.0865
## # ... with 8,824 more rows