Datasets can be transformed with dplyr, which can be loaded with the tidyverse package or by itself.
Data can be subsetted with filter(). The following examples use the flights dataset. The code makes use of the pipe symbol, %>%. This can be thought of as a replacement for “then”.
> library(nycflights13) #contains flights dataset
> library(tidyverse) #contains dplyr
> names(flights) #column names [1] "year" "month" "day" "dep_time"
[5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
[9] "arr_delay" "carrier" "flight" "tailnum"
[13] "origin" "dest" "air_time" "distance"
[17] "hour" "minute" "time_hour"
> #flights dataset THEN filter for Jan 1st
> #wrapping the whole statement in parentheses will
> #display it as well as save it as Jan1
> #otherwise it will only save
> (Jan1 <-flights %>% filter(month==1,day==1))# A tibble: 842 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
The typical comparison operators can be used, >,>=,<,<=,!=(not equal),==(equal).
# A tibble: 21,838 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 7 49 2359 50 531 444
2 2013 1 7 454 500 -6 637 648
3 2013 1 7 523 525 -2 758 820
4 2013 1 7 531 540 -9 827 850
5 2013 1 7 536 540 -4 1020 1017
6 2013 1 7 544 530 14 822 829
7 2013 1 7 545 600 -15 646 709
8 2013 1 7 552 600 -8 843 904
9 2013 1 7 554 600 -6 708 715
10 2013 1 7 555 600 -5 741 801
# ... with 21,828 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
You can also use the logical operators & for "and", | for "or" and ! for "not".
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 11 1 5 2359 6 352 345
2 2013 11 1 35 2250 105 123 2356
3 2013 11 1 455 500 -5 641 651
4 2013 11 1 539 545 -6 856 827
5 2013 11 1 542 545 -3 831 855
6 2013 11 1 549 600 -11 912 923
7 2013 11 1 550 600 -10 705 659
8 2013 11 1 554 600 -6 659 701
9 2013 11 1 554 600 -6 826 827
10 2013 11 1 554 600 -6 749 751
# ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
We can achieve the same results with this alternate method:
> # x %in% y format
> #selects every row where x is equal to a value in y
> (nov_dec <-flights %>% filter(month %in% c(11,12)))# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 11 1 5 2359 6 352 345
2 2013 11 1 35 2250 105 123 2356
3 2013 11 1 455 500 -5 641 651
4 2013 11 1 539 545 -6 856 827
5 2013 11 1 542 545 -3 831 855
6 2013 11 1 549 600 -11 912 923
7 2013 11 1 550 600 -10 705 659
8 2013 11 1 554 600 -6 659 701
9 2013 11 1 554 600 -6 826 827
10 2013 11 1 554 600 -6 749 751
# ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
De Morgan’s law:
To find flights that weren’t delayed by more than 2 hours either of the following could be used:
filter() will exlcude NA values.
# A tibble: 1 x 1
x
<dbl>
1 8
# A tibble: 2 x 1
x
<dbl>
1 NA
2 8
> library(tidyverse)
> library(nycflights13)
>
> #arr_delay in minutes
> flights %>% filter(arr_delay>=120) %>%
+ select(year, month, day, flight, arr_delay) %>%
+ arrange(desc(arr_delay))# A tibble: 10,200 x 5
year month day flight arr_delay
<int> <int> <int> <int> <dbl>
1 2013 1 9 51 1272
2 2013 6 15 3535 1127
3 2013 1 10 3695 1109
4 2013 9 20 177 1007
5 2013 7 22 3075 989
6 2013 4 10 2391 931
7 2013 3 17 2119 915
8 2013 7 22 2047 895
9 2013 12 5 172 878
10 2013 5 3 3744 875
# ... with 10,190 more rows
# A tibble: 9,313 x 5
year month day flight dest
<int> <int> <int> <int> <chr>
1 2013 1 1 1545 IAH
2 2013 1 1 1714 IAH
3 2013 1 1 496 IAH
4 2013 1 1 473 IAH
5 2013 1 1 1479 IAH
6 2013 1 1 1220 IAH
7 2013 1 1 1004 IAH
8 2013 1 1 455 IAH
9 2013 1 1 1086 IAH
10 2013 1 1 1461 IAH
# ... with 9,303 more rows
> flights %>% filter(carrier %in% c("UA","AA","DL")) %>%
+ select(year, month, day, flight, carrier) # A tibble: 139,504 x 5
year month day flight carrier
<int> <int> <int> <int> <chr>
1 2013 1 1 1545 UA
2 2013 1 1 1714 UA
3 2013 1 1 1141 AA
4 2013 1 1 461 DL
5 2013 1 1 1696 UA
6 2013 1 1 301 AA
7 2013 1 1 194 UA
8 2013 1 1 1124 UA
9 2013 1 1 707 AA
10 2013 1 1 1187 UA
# ... with 139,494 more rows
# A tibble: 86,326 x 4
year month day flight
<int> <int> <int> <int>
1 2013 7 1 915
2 2013 7 1 1503
3 2013 7 1 234
4 2013 7 1 1371
5 2013 7 1 185
6 2013 7 1 165
7 2013 7 1 415
8 2013 7 1 425
9 2013 7 1 1183
10 2013 7 1 623
# ... with 86,316 more rows
> flights %>% filter(arr_delay>120 & dep_delay<=0) %>%
+ select(year, month, day, dep_delay, arr_delay) %>%
+ arrange(desc(dep_delay))# A tibble: 29 x 5
year month day dep_delay arr_delay
<int> <int> <int> <dbl> <dbl>
1 2013 10 7 0 130
2 2013 5 23 0 128
3 2013 7 1 0 140
4 2013 1 27 -1 124
5 2013 7 7 -1 147
6 2013 7 28 -1 129
7 2013 10 7 -2 124
8 2013 11 1 -2 194
9 2013 4 18 -2 179
10 2013 6 14 -2 132
# ... with 19 more rows
> flights %>% filter(dep_delay>=60 & arr_delay<(dep_delay-30)) %>%
+ select(year, month, day, dep_delay, arr_delay)# A tibble: 1,844 x 5
year month day dep_delay arr_delay
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 285 246
2 2013 1 1 116 73
3 2013 1 3 162 128
4 2013 1 3 99 66
5 2013 1 3 65 28
6 2013 1 3 102 67
7 2013 1 3 65 1
8 2013 1 3 60 24
9 2013 1 3 177 141
10 2013 1 4 137 105
# ... with 1,834 more rows
> #midnight is recorded as 2400. 12:01 AM is recorded as 1.
> #6:00 AM is recorded as 600. <=600 will capture 12:01 AM to 6:00 AM
> flights %>% filter(dep_time==2400 | dep_time<=600) %>%
+ select(year, month, day, flight, dep_time)# A tibble: 9,373 x 5
year month day flight dep_time
<int> <int> <int> <int> <int>
1 2013 1 1 1545 517
2 2013 1 1 1714 533
3 2013 1 1 1141 542
4 2013 1 1 725 544
5 2013 1 1 461 554
6 2013 1 1 1696 554
7 2013 1 1 507 555
8 2013 1 1 5708 557
9 2013 1 1 79 557
10 2013 1 1 301 558
# ... with 9,363 more rows
between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?between() is a shortcut for x >= left & x <= right# A tibble: 86,326 x 4
year month day flight
<int> <int> <int> <int>
1 2013 7 1 915
2 2013 7 1 1503
3 2013 7 1 234
4 2013 7 1 1371
5 2013 7 1 185
6 2013 7 1 165
7 2013 7 1 415
8 2013 7 1 425
9 2013 7 1 1183
10 2013 7 1 623
# ... with 86,316 more rows
dep_time? What other variables are missing? What might these rows represent?arr_time and the delay variables are missing# A tibble: 8,255 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 NA 1630 NA NA 1815
2 2013 1 1 NA 1935 NA NA 2240
3 2013 1 1 NA 1500 NA NA 1825
4 2013 1 1 NA 600 NA NA 901
5 2013 1 2 NA 1540 NA NA 1747
6 2013 1 2 NA 1620 NA NA 1746
7 2013 1 2 NA 1355 NA NA 1459
8 2013 1 2 NA 1420 NA NA 1644
9 2013 1 2 NA 1321 NA NA 1536
10 2013 1 2 NA 1545 NA NA 1910
# ... with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)[1] 1
[1] TRUE
[1] FALSE
[1] NA
[1] NaN
arrange() can be used to to change the order of rows.
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
desc can be used for descending order
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 9 641 900 1301 1242 1530
2 2013 6 15 1432 1935 1137 1607 2120
3 2013 1 10 1121 1635 1126 1239 1810
4 2013 9 20 1139 1845 1014 1457 2210
5 2013 7 22 845 1600 1005 1044 1815
6 2013 4 10 1100 1900 960 1342 2211
7 2013 3 17 2321 810 911 135 1020
8 2013 6 27 959 1900 899 1236 2226
9 2013 7 22 2257 759 898 121 1026
10 2013 12 5 756 1700 896 1058 2020
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Missing values are always sorted at the end unless you ask for them to be moved up.
# A tibble: 3 x 1
x
<dbl>
1 4
2 9
3 NA
# A tibble: 3 x 1
x
<dbl>
1 9
2 4
3 NA
# A tibble: 3 x 1
x
<dbl>
1 NA
2 4
3 9
arrange() to sort all missing values to the start?# A tibble: 3 x 1
x
<dbl>
1 NA
2 4
3 9
flights to find the most delayed flights. Find the flights that left earliest.> #most delayed
> flights %>% arrange(desc(dep_delay)) %>%
+ select(year, month, day, flight, dep_delay)# A tibble: 336,776 x 5
year month day flight dep_delay
<int> <int> <int> <int> <dbl>
1 2013 1 9 51 1301
2 2013 6 15 3535 1137
3 2013 1 10 3695 1126
4 2013 9 20 177 1014
5 2013 7 22 3075 1005
6 2013 4 10 2391 960
7 2013 3 17 2119 911
8 2013 6 27 2007 899
9 2013 7 22 2047 898
10 2013 12 5 172 896
# ... with 336,766 more rows
# A tibble: 336,776 x 5
year month day flight dep_time
<int> <int> <int> <int> <int>
1 2013 1 13 22 1
2 2013 1 31 530 1
3 2013 11 13 1503 1
4 2013 12 16 839 1
5 2013 12 20 1503 1
6 2013 12 26 1503 1
7 2013 12 30 839 1
8 2013 2 11 530 1
9 2013 2 24 608 1
10 2013 3 8 739 1
# ... with 336,766 more rows
flights to find the fastest flights.> flights %>% mutate(speed=distance/air_time*60) %>%
+ arrange(desc(speed)) %>%
+ select(year, month, day, flight, speed)# A tibble: 336,776 x 5
year month day flight speed
<int> <int> <int> <int> <dbl>
1 2013 5 25 1499 703.
2 2013 7 2 4667 650.
3 2013 5 13 4292 648
4 2013 3 23 3805 641.
5 2013 1 12 1902 591.
6 2013 11 17 315 564
7 2013 2 21 707 557.
8 2013 11 17 936 556.
9 2013 11 16 347 554.
10 2013 11 16 1503 554.
# ... with 336,766 more rows
> #longest
> flights %>% arrange(desc(distance)) %>%
+ select(year, month, day, flight, distance, origin, dest)# A tibble: 336,776 x 7
year month day flight distance origin dest
<int> <int> <int> <int> <dbl> <chr> <chr>
1 2013 1 1 51 4983 JFK HNL
2 2013 1 2 51 4983 JFK HNL
3 2013 1 3 51 4983 JFK HNL
4 2013 1 4 51 4983 JFK HNL
5 2013 1 5 51 4983 JFK HNL
6 2013 1 6 51 4983 JFK HNL
7 2013 1 7 51 4983 JFK HNL
8 2013 1 8 51 4983 JFK HNL
9 2013 1 9 51 4983 JFK HNL
10 2013 1 10 51 4983 JFK HNL
# ... with 336,766 more rows
> #shortest
> flights %>% arrange(distance) %>%
+ select(year, month, day, flight, distance, origin, dest)# A tibble: 336,776 x 7
year month day flight distance origin dest
<int> <int> <int> <int> <dbl> <chr> <chr>
1 2013 7 27 1632 17 EWR LGA
2 2013 1 3 3833 80 EWR PHL
3 2013 1 4 4193 80 EWR PHL
4 2013 1 4 4502 80 EWR PHL
5 2013 1 4 4645 80 EWR PHL
6 2013 1 5 4193 80 EWR PHL
7 2013 1 6 4619 80 EWR PHL
8 2013 1 7 4619 80 EWR PHL
9 2013 1 8 4619 80 EWR PHL
10 2013 1 9 4619 80 EWR PHL
# ... with 336,766 more rows
select() allows you to choose columns.
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# A tibble: 336,776 x 16
dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
<int> <int> <dbl> <int> <int> <dbl> <chr>
1 517 515 2 830 819 11 UA
2 533 529 4 850 830 20 UA
3 542 540 2 923 850 33 AA
4 544 545 -1 1004 1022 -18 B6
5 554 600 -6 812 837 -25 DL
6 554 558 -4 740 728 12 UA
7 555 600 -5 913 854 19 B6
8 557 600 -3 709 723 -14 EV
9 557 600 -3 838 846 -8 B6
10 558 600 -2 753 745 8 AA
# ... with 336,766 more rows, and 9 more variables: flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
select() can also be used with helper functions such as
The entire list can be viewed with ?select.
Some examples using the iris dataset are provided below.
> # convert iris datset to tible for viewing
> iris <- as_tibble(iris)
> #select columns that don't end in Width
> iris %>% select(!ends_with("Width"))# A tibble: 150 x 3
Sepal.Length Petal.Length Species
<dbl> <dbl> <fct>
1 5.1 1.4 setosa
2 4.9 1.4 setosa
3 4.7 1.3 setosa
4 4.6 1.5 setosa
5 5 1.4 setosa
6 5.4 1.7 setosa
7 4.6 1.4 setosa
8 5 1.5 setosa
9 4.4 1.4 setosa
10 4.9 1.5 setosa
# ... with 140 more rows
> #starts with Petal and ends with Width
> iris %>% select(starts_with("Petal") & ends_with("Width"))# A tibble: 150 x 1
Petal.Width
<dbl>
1 0.2
2 0.2
3 0.2
4 0.2
5 0.2
6 0.4
7 0.3
8 0.2
9 0.2
10 0.1
# ... with 140 more rows
> #starts with Petal or ends with Width
> iris %>% select(starts_with("Petal") | ends_with("Width"))# A tibble: 150 x 3
Petal.Length Petal.Width Sepal.Width
<dbl> <dbl> <dbl>
1 1.4 0.2 3.5
2 1.4 0.2 3
3 1.3 0.2 3.2
4 1.5 0.2 3.1
5 1.4 0.2 3.6
6 1.7 0.4 3.9
7 1.4 0.3 3.4
8 1.5 0.2 3.4
9 1.4 0.2 2.9
10 1.5 0.1 3.1
# ... with 140 more rows
> #starts with Petal and does not end with Width
> iris %>% select(starts_with("Petal") & !ends_with("Width"))# A tibble: 150 x 1
Petal.Length
<dbl>
1 1.4
2 1.4
3 1.3
4 1.5
5 1.4
6 1.7
7 1.4
8 1.5
9 1.4
10 1.5
# ... with 140 more rows
If you want to move columns to the front everything() is useful.
# A tibble: 1 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.1 3.5 1.4 0.2 setosa
# A tibble: 150 x 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.1 3.5 1.4 0.2
2 setosa 4.9 3 1.4 0.2
3 setosa 4.7 3.2 1.3 0.2
4 setosa 4.6 3.1 1.5 0.2
5 setosa 5 3.6 1.4 0.2
6 setosa 5.4 3.9 1.7 0.4
7 setosa 4.6 3.4 1.4 0.3
8 setosa 5 3.4 1.5 0.2
9 setosa 4.4 2.9 1.4 0.2
10 setosa 4.9 3.1 1.5 0.1
# ... with 140 more rows
> #move Species to the front, keep everything but Sepal Length
> iris %>% select(Species, everything(),-Sepal.Length)# A tibble: 150 x 4
Species Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl>
1 setosa 3.5 1.4 0.2
2 setosa 3 1.4 0.2
3 setosa 3.2 1.3 0.2
4 setosa 3.1 1.5 0.2
5 setosa 3.6 1.4 0.2
6 setosa 3.9 1.7 0.4
7 setosa 3.4 1.4 0.3
8 setosa 3.4 1.5 0.2
9 setosa 2.9 1.4 0.2
10 setosa 3.1 1.5 0.1
# ... with 140 more rows
dep_time, dep_delay, arr_time, and arr_delay from flights.> flights %>% select(4,6,7,9)
> flights %>% select(dep_time, dep_delay, arr_time, arr_delay)
> flights %>% select(dep_time:arr_delay,-sched_dep_time,-sched_arr_time)
> flights %>% select(-(year:day),-(carrier:time_hour),
+ -sched_dep_time,-sched_arr_time)
> flights %>% select(starts_with("dep")|starts_with("arr"))
> flights %>% select((ends_with("time")|ends_with("delay"))
+ &!starts_with("sched")&!starts_with("air"))
> flights %>% select(matches("^(dep|arr)_(time|delay)$"))
> flights %>% select(all_of(c("dep_time", "dep_delay",
+ "arr_time", "arr_delay")))# A tibble: 336,776 x 4
dep_time dep_delay arr_time arr_delay
<int> <dbl> <int> <dbl>
1 517 2 830 11
2 533 4 850 20
3 542 2 923 33
4 544 -1 1004 -18
5 554 -6 812 -25
6 554 -4 740 12
7 555 -5 913 19
8 557 -3 709 -14
9 557 -3 838 -8
10 558 -2 753 8
# ... with 336,766 more rows
select() call?.everything().# A tibble: 336,776 x 19
day year month dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 1 2013 1 517 515 2 830 819
2 1 2013 1 533 529 4 850 830
3 1 2013 1 542 540 2 923 850
4 1 2013 1 544 545 -1 1004 1022
5 1 2013 1 554 600 -6 812 837
6 1 2013 1 554 558 -4 740 728
7 1 2013 1 555 600 -5 913 854
8 1 2013 1 557 600 -3 709 723
9 1 2013 1 557 600 -3 838 846
10 1 2013 1 558 600 -2 753 745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
all_of or any_of do? Why might it be helpful in conjunction with the following vector?.> vars <- c("year","month","day","dep_delay","arr_delay","made_up")
>
> #any_of doesn't check for missing variables
> flights %>% select(any_of(vars))# A tibble: 336,776 x 5
year month day dep_delay arr_delay
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 2 11
2 2013 1 1 4 20
3 2013 1 1 2 33
4 2013 1 1 -1 -18
5 2013 1 1 -6 -25
6 2013 1 1 -4 12
7 2013 1 1 -5 19
8 2013 1 1 -3 -14
9 2013 1 1 -3 -8
10 2013 1 1 -2 8
# ... with 336,766 more rows
> #With all_of if any of the variables in the character vector is missing
> #an error is produced.
> flights %>% select(all_of(vars))select(flights,contains("TIME")) How do the select helpers deal with case by default? How can you change the default?.ignore.case=FALSE.# A tibble: 336,776 x 6
dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
<int> <int> <int> <int> <dbl> <dttm>
1 517 515 830 819 227 2013-01-01 05:00:00
2 533 529 850 830 227 2013-01-01 05:00:00
3 542 540 923 850 160 2013-01-01 05:00:00
4 544 545 1004 1022 183 2013-01-01 05:00:00
5 554 600 812 837 116 2013-01-01 06:00:00
6 554 558 740 728 150 2013-01-01 05:00:00
7 555 600 913 854 158 2013-01-01 06:00:00
8 557 600 709 723 53 2013-01-01 06:00:00
9 557 600 838 846 140 2013-01-01 06:00:00
10 558 600 753 745 138 2013-01-01 06:00:00
# ... with 336,766 more rows
# A tibble: 336,776 x 0
rename() can be used to rename columns
# A tibble: 336,776 x 19
year month day DeptTime sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
You can add new columns with mutate(). The new variables are always placed at the end.
> #smaller dataset
> flights_sml <- select(flights, year:day,ends_with("delay"),
+ distance,air_time)
> #add gain and speed
> flights_sml %>% mutate(gain=arr_delay-dep_delay,
+ speed=distance/air_time*60)# A tibble: 336,776 x 9
year month day dep_delay arr_delay distance air_time gain speed
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 9 370.
2 2013 1 1 4 20 1416 227 16 374.
3 2013 1 1 2 33 1089 160 31 408.
4 2013 1 1 -1 -18 1576 183 -17 517.
5 2013 1 1 -6 -25 762 116 -19 394.
6 2013 1 1 -4 12 719 150 16 288.
7 2013 1 1 -5 19 1065 158 24 404.
8 2013 1 1 -3 -14 229 53 -11 259.
9 2013 1 1 -3 -8 944 140 -5 405.
10 2013 1 1 -2 8 733 138 10 319.
# ... with 336,766 more rows
You can also refer to columns that were just created.
> #can refer to columns just created
> flights_sml %>% mutate(gain=arr_delay-dep_delay,
+ hours=air_time/60, gain_per_hour=gain/hours)# A tibble: 336,776 x 10
year month day dep_delay arr_delay distance air_time gain hours
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 9 3.78
2 2013 1 1 4 20 1416 227 16 3.78
3 2013 1 1 2 33 1089 160 31 2.67
4 2013 1 1 -1 -18 1576 183 -17 3.05
5 2013 1 1 -6 -25 762 116 -19 1.93
6 2013 1 1 -4 12 719 150 16 2.5
7 2013 1 1 -5 19 1065 158 24 2.63
8 2013 1 1 -3 -14 229 53 -11 0.883
9 2013 1 1 -3 -8 944 140 -5 2.33
10 2013 1 1 -2 8 733 138 10 2.3
# ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
transmute() will only keep the new variables.
> # only keep new variables use transmute
> flights %>% transmute(gain=arr_delay-dep_delay, hours=air_time/60,
+ gain_per_hour=gain/hours)# A tibble: 336,776 x 3
gain hours gain_per_hour
<dbl> <dbl> <dbl>
1 9 3.78 2.38
2 16 3.78 4.23
3 31 2.67 11.6
4 -17 3.05 -5.57
5 -19 1.93 -9.83
6 16 2.5 6.4
7 24 2.63 9.11
8 -11 0.883 -12.5
9 -5 2.33 -2.14
10 10 2.3 4.35
# ... with 336,766 more rows
You can create new variables with the standard arithmetic operators, +,-,*,/,^, or with modular operators,%/%(integer division) and %%(remainder).
# A tibble: 336,776 x 3
dep_time hour minute
<int> <dbl> <dbl>
1 517 5 17
2 533 5 33
3 542 5 42
4 544 5 44
5 554 5 54
6 554 5 54
7 555 5 55
8 557 5 57
9 557 5 57
10 558 5 58
# ... with 336,766 more rows
You can also used logs, such as log(),log2(),and log(10)
log2() is easy to interpret because a difference of 1 is equal to a doubling on the original scale.
[1] 5.643856
[1] 6.643856
Offsets like lead() and lag() could be useful in calculating changes.
[1] 1 2 3 4 5 6 7 8 9 10
[1] NA 1 2 3 4 5 6 7 8 9
[1] 2 3 4 5 6 7 8 9 10 NA
The following cumulative calculations are available:
[1] 1 2 3 4 5 6 7 8 9 10
[1] 1 3 6 10 15 21 28 36 45 55
[1] 1.000000 1.000000 1.333333 1.750000 2.200000 2.666667 3.142857 3.625000
[9] 4.111111 4.600000
[1] 1 1 1 1 1 1 1 1 1 1
[1] 1 2 3 4 5 6 7 8 9 10
[1] 1 2 6 24 120 720 5040 40320 362880
[10] 3628800
The package RcppRoll can be used for rolling aggregates.
min_rank provides a ranking with the smallest values given to the smallest ranks. desc(x) gives the largest values to the smallest ranks.
[1] 1 2 2 NA 4 5
[1] 5 3 3 NA 2 1
row_number(): equivalent to rank(ties.method = "first")min_rank(): equivalent to rank(ties.method = "min")dense_rank(): like min_rank(), but with no gaps between rankspercent_rank(): a number between 0 and 1 computed by rescaling min_rank to [0, 1]cume_dist(): a cumulative distribution function. Proportion of all values less than or equal to the current rank.ntile(): a rough rank, which breaks the input vector into n buckets. The size of the buckets may differ by up to one, larger buckets have lower rank.[1] 5 1 4 2 3 NA
[1] 5 1 4 2 2 NA
[1] 4 1 3 2 2 NA
[1] 1.00 0.00 0.75 0.25 0.25 NA
[1] 1.0 0.2 0.8 0.6 0.6 NA
[1] 2 1 2 1 1 NA
[1] 1 1 1 2 2 2 3 3
dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.%%1440, the remainder when dividing by 1440.> #issue with midnight being 2400
> flights %>% select(flight,dep_time,sched_dep_time) %>%
+ mutate(dep_time_min=(dep_time%/%100)*60+dep_time%%100,
+ sched_dep_time_min=(sched_dep_time%/%100)*60+sched_dep_time%%100) %>%
+ arrange(desc(dep_time_min))# A tibble: 336,776 x 5
flight dep_time sched_dep_time dep_time_min sched_dep_time_min
<int> <int> <int> <dbl> <dbl>
1 839 2400 2359 1440 1439
2 745 2400 2359 1440 1439
3 1503 2400 2359 1440 1439
4 1503 2400 2359 1440 1439
5 1816 2400 2250 1440 1370
6 1503 2400 2359 1440 1439
7 1503 2400 2359 1440 1439
8 2379 2400 1700 1440 1020
9 727 2400 2359 1440 1439
10 739 2400 2359 1440 1439
# ... with 336,766 more rows
> #longest time is 1 minute before midnight
> flights %>% select(flight,dep_time,sched_dep_time) %>%
+ mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+ sched_dep_time_min=(sched_dep_time%/%100*60+sched_dep_time%%100)%%1440) %>%
+ arrange(desc(dep_time_min))# A tibble: 336,776 x 5
flight dep_time sched_dep_time dep_time_min sched_dep_time_min
<int> <int> <int> <dbl> <dbl>
1 727 2359 2359 1439 1439
2 727 2359 2359 1439 1439
3 701 2359 2130 1439 1290
4 727 2359 2359 1439 1439
5 739 2359 2359 1439 1439
6 739 2359 2359 1439 1439
7 5811 2359 2110 1439 1270
8 839 2359 2359 1439 1439
9 1503 2359 2359 1439 1439
10 839 2359 2359 1439 1439
# ... with 336,766 more rows
air_time with arr_time - dep_time. What do you see? What do you expect to see?.air_time_diff is the difference between air_time and the calculation using arrival and departure times.
Problem 1 - crossing over midnight adds 1440 to air_time_diff.
Problem 2 - Timezone changes could add 60, 120, or 180 to air_time_diff.
Problem 3 - We can see from the plot that there are many differences other than 60, 120, 180, or 1440.
air_time does not include time spent on the runway, which results in Problem 3. Time on the runway would be included in arr_time and dep_time.
> flights_time <- flights %>%
+ mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+ arr_time_min=(arr_time%/%100*60+arr_time%%100)%%1440,
+ time_diff=arr_time_min-dep_time_min,
+ air_time_diff=air_time-time_diff) %>%
+ select(dep_time_min,arr_time_min,time_diff,air_time,air_time_diff) %>%
+ arrange(desc(air_time_diff))
>
> flights_time# A tibble: 336,776 x 5
dep_time_min arr_time_min time_diff air_time air_time_diff
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1075 34 -1041 648 1689
2 1270 1 -1269 340 1609
3 1302 32 -1270 336 1606
4 1338 24 -1314 292 1606
5 1305 23 -1282 324 1606
6 1426 133 -1293 313 1606
7 1376 64 -1312 294 1606
8 1406 91 -1315 290 1605
9 1358 81 -1277 328 1605
10 1300 31 -1269 336 1605
# ... with 336,766 more rows
> #plot of air_time_diff
> ggplot(flights_time, aes(x = air_time_diff)) +
+ geom_histogram(binwidth = 1, fill="firebrick") +theme_light()dep-time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?> FlightsDepTime<- flights %>%
+ mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+ sched_dep_time_min=(sched_dep_time%/%100*60+sched_dep_time%%100)%%1440,
+ dep_delay_diff=dep_delay-dep_time_min+sched_dep_time_min) %>%
+ select(dep_time_min,sched_dep_time_min,dep_delay,dep_delay_diff)
>
> FlightsDepTime# A tibble: 336,776 x 4
dep_time_min sched_dep_time_min dep_delay dep_delay_diff
<dbl> <dbl> <dbl> <dbl>
1 317 315 2 0
2 333 329 4 0
3 342 340 2 0
4 344 345 -1 0
5 354 360 -6 0
6 354 358 -4 0
7 355 360 -5 0
8 357 360 -3 0
9 357 360 -3 0
10 358 360 -2 0
# ... with 336,766 more rows
> ggplot(filter(FlightsDepTime, dep_delay_diff > 0),
+ aes(y = sched_dep_time_min, x = dep_delay_diff)) +
+ geom_point(color="firebrick")+theme_light()min_rank(): equivalent to rank(ties.method = “min”)> flights %>% mutate(dep_rank=min_rank(desc(dep_delay))) %>%
+ select(flight,dep_delay,dep_rank) %>% arrange(desc(dep_delay))# A tibble: 336,776 x 3
flight dep_delay dep_rank
<int> <dbl> <int>
1 51 1301 1
2 3535 1137 2
3 3695 1126 3
4 177 1014 4
5 3075 1005 5
6 2391 960 6
7 2119 911 7
8 2007 899 8
9 2047 898 9
10 172 896 10
# ... with 336,766 more rows
Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
length
[1] 2 4 6 5 7 9 8 10 12 11
[1] 2 4 6 5 7 9 8 10 12 11
?Trig to find out.summarize() can be used to obtain a single datapoint or to generate data based on a grouping.
# A tibble: 1 x 1
delay
<dbl>
1 12.6
To get the mean departure delay for each day the data has to be grouped first.
> # group to obtain daily mean
> flights %>% group_by(year,month,day) %>%
+ summarize(delay=mean(dep_delay,na.rm=TRUE))# A tibble: 365 x 4
# Groups: year, month [12]
year month day delay
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ... with 355 more rows
The mean arrival delay at first increases with average distance and then it decreases. Flights can likely make up time in the air during longer distances.
> #count, avg. distance, and avg. arrival delay by destination
> delays <- flights %>%
+ group_by(dest) %>%
+ summarize(count=n(),dist=mean(distance,na.rm=TRUE),
+ delay=mean(arr_delay,na.rm=TRUE)) %>%
+ filter(count>20, dest !="HNL")
>
> #plotted
> ggplot(data=delays, mapping=aes(x=dist,y=delay))+
+ geom_point(aes(size=count),alpha=1/3, color='firebrick')+
+ geom_smooth(se=FALSE)Missing values must be removed or they will populate the entire column.
# A tibble: 365 x 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 NA
2 2013 1 2 NA
3 2013 1 3 NA
4 2013 1 4 NA
5 2013 1 5 NA
6 2013 1 6 NA
7 2013 1 7 NA
8 2013 1 8 NA
9 2013 1 9 NA
10 2013 1 10 NA
# ... with 355 more rows
Fortunately it is an easy fix.
> #with NAs removed
> flights %>%
+ group_by(year,month,day)%>%
+ summarize(mean=mean(dep_delay, na.rm=TRUE))# A tibble: 365 x 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ... with 355 more rows
Or you could filter out the NA values before summarizing.
> #filter NA values first
> not_cancelled <-flights %>%
+ filter(!is.na(dep_delay), !is.na(arr_delay))
> not_cancelled %>% group_by(year,month,day)%>%
+ summarize(mean=mean(dep_delay))# A tibble: 365 x 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 11.4
2 2013 1 2 13.7
3 2013 1 3 10.9
4 2013 1 4 8.97
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.56
9 2013 1 9 2.30
10 2013 1 10 2.84
# ... with 355 more rows
If you want to omit all rows that have missing values you can use na.omit.
Including counts is helpful, especially if there’s a chance that some calculations mught be based on a very small number of observations.
n()sum(!is.na(x))In the following plot we can see that some planes have an average delay of 300 minutes.
> #average arrival delay
> delays<- not_cancelled %>%
+ group_by(tailnum) %>%
+ summarize(delay=mean(arr_delay))
>
> ggplot(data=delays,mapping=aes(x=delay))+
+ geom_freqpoly(binwidth=10, color='firebrick')However, when count is included we can see that the high variation takes place with a small number of flights.
> delays <-not_cancelled %>%
+ group_by(tailnum) %>%
+ summarize(delay=mean(arr_delay,na.rm=TRUE),n=n())
>
> ggplot(data=delays,mapping=aes(x=n, y=delay))+
+ geom_point(alpha=1/10, color='firebrick')There is much less variation when the small counts are removed.
> #filter out small counts
> delays %>%
+ filter(n>25) %>%
+ ggplot(mapping=aes(x=n, y=delay))+
+ geom_point(alpha=1/10, color='firebrick')This can also be demonstrated with the Batting dataset.
> #convert Batting dataset to tibble
> batting <- as_tibble(Lahman::Batting)
>
> #gather batting average and at bats
> batters<- batting %>%
+ group_by(playerID) %>%
+ summarize(
+ ba=sum(H, na.rm=TRUE)/sum(AB, na.rm=TRUE),
+ ab=sum(AB, na.rm=TRUE)
+ )
>
> #filter and chart
> batters %>%
+ filter(ab>100) %>%
+ ggplot(mapping=aes(x=ab, y=ba))+
+ geom_point()+geom_smooth(se=FALSE)If you didn’t filter for very low count, you would see that the best batting averages were related to luck.
# A tibble: 19,689 x 3
playerID ba ab
<chr> <dbl> <int>
1 abramge01 1 1
2 alanirj01 1 1
3 alberan01 1 1
4 banisje01 1 1
5 bartocl01 1 1
6 bassdo01 1 1
7 birasst01 1 2
8 bruneju01 1 1
9 burnscb01 1 1
10 cammaer01 1 1
# ... with 19,679 more rows
There are also many other useful summary functions, such as:
You can also combine aggregation with subsetting, as shown below:
> not_cancelled %>%
+ group_by(year, month, day) %>%
+ summarize(
+ #average delay:
+ avg_delay1 = mean(arr_delay),
+ #average positive delay
+ avg_delay2 = mean(arr_delay[arr_delay>0])
+ )# A tibble: 365 x 5
# Groups: year, month [12]
year month day avg_delay1 avg_delay2
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 12.7 32.5
2 2013 1 2 12.7 32.0
3 2013 1 3 5.73 27.7
4 2013 1 4 -1.93 28.3
5 2013 1 5 -1.53 22.6
6 2013 1 6 4.24 24.4
7 2013 1 7 -4.95 27.8
8 2013 1 8 -3.23 20.8
9 2013 1 9 -0.264 25.6
10 2013 1 10 -5.90 27.3
# ... with 355 more rows
Using standard deviation:
> #distance to some destinations is more variable
> not_cancelled %>%
+ group_by(dest) %>%
+ summarize(distance_sd=sd(distance)) %>%
+ arrange(desc(distance_sd))# A tibble: 104 x 2
dest distance_sd
<chr> <dbl>
1 EGE 10.5
2 SAN 10.4
3 SFO 10.2
4 HNL 10.0
5 SEA 9.98
6 LAS 9.91
7 PDX 9.87
8 PHX 9.86
9 LAX 9.66
10 IND 9.46
# ... with 94 more rows
Measures of rank:
> #first and last flights of the day
> not_cancelled %>%
+ group_by(year, month, day) %>%
+ summarize(
+ first=min(dep_time),
+ last=max(dep_time)
+ )# A tibble: 365 x 5
# Groups: year, month [12]
year month day first last
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
Measures of position:
> #first and last flights of the day
> not_cancelled %>%
+ group_by(year, month, day) %>%
+ summarize(
+ first_dep=first(dep_time),
+ last_dep=last(dep_time)
+ )# A tibble: 365 x 5
# Groups: year, month [12]
year month day first_dep last_dep
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
The first and last flights can be retrieved in several ways. Another is provided below:
> #first and last flight of the day
> not_cancelled %>%
+ group_by(year, month, day) %>%
+ #highest dep_time get lowest rank
+ mutate(r=min_rank(desc(dep_time))) %>%
+ # %in% range(r) displays highest and lowest
+ filter(r %in% range(r)) %>%
+ select(year,month,day,dep_time,r)# A tibble: 770 x 5
# Groups: year, month, day [365]
year month day dep_time r
<int> <int> <int> <int> <int>
1 2013 1 1 517 831
2 2013 1 1 2356 1
3 2013 1 2 42 928
4 2013 1 2 2354 1
5 2013 1 3 32 900
6 2013 1 3 2349 1
7 2013 1 4 25 908
8 2013 1 4 2358 1
9 2013 1 4 2358 1
10 2013 1 5 14 717
# ... with 760 more rows
Distinct counts:
> # destinations by most distinct carriers
> not_cancelled %>%
+ group_by(dest) %>%
+ summarize(carriers=n_distinct(carrier))%>%
+ arrange(desc(carriers))# A tibble: 104 x 2
dest carriers
<chr> <int>
1 ATL 7
2 BOS 7
3 CLT 7
4 ORD 7
5 TPA 7
6 AUS 6
7 DCA 6
8 DTW 6
9 IAD 6
10 MSP 6
# ... with 94 more rows
A simple count:
# A tibble: 104 x 2
dest n
<chr> <int>
1 ABQ 254
2 ACK 264
3 ALB 418
4 ANC 8
5 ATL 16837
6 AUS 2411
7 AVL 261
8 BDL 412
9 BGR 358
10 BHM 269
# ... with 94 more rows
A weighting can be applied to a count. In this case we can view total miles flown for each plane:
# A tibble: 4,037 x 2
tailnum n
<chr> <dbl>
1 D942DN 3418
2 N0EGMQ 239143
3 N10156 109664
4 N102UW 25722
5 N103US 24619
6 N104UW 24616
7 N10575 139903
8 N105UW 23618
9 N107US 21677
10 N108UW 32070
# ... with 4,027 more rows
You can also use logical values, where TRUE=1 and FALSE=0. Here we can find the number of flights that left before 5 AM:
# A tibble: 365 x 4
# Groups: year, month [12]
year month day n_early
<int> <int> <int> <int>
1 2013 1 1 0
2 2013 1 2 3
3 2013 1 3 4
4 2013 1 4 3
5 2013 1 5 3
6 2013 1 6 2
7 2013 1 7 2
8 2013 1 8 1
9 2013 1 9 3
10 2013 1 10 3
# ... with 355 more rows
Proportions are also obtainable. Here is the proportion of flights delayed by more than an hour:
# A tibble: 365 x 4
# Groups: year, month [12]
year month day hour_perc
<int> <int> <int> <dbl>
1 2013 1 1 0.0722
2 2013 1 2 0.0851
3 2013 1 3 0.0567
4 2013 1 4 0.0396
5 2013 1 5 0.0349
6 2013 1 6 0.0470
7 2013 1 7 0.0333
8 2013 1 8 0.0213
9 2013 1 9 0.0202
10 2013 1 10 0.0183
# ... with 355 more rows
Grouping by multiple variables allows you to progressively summarize each level.
> #total flights per day
> daily <- flights %>% group_by(year,month,day)
> (per_day <- daily %>% summarize(flights=n()))# A tibble: 365 x 4
# Groups: year, month [12]
year month day flights
<int> <int> <int> <int>
1 2013 1 1 842
2 2013 1 2 943
3 2013 1 3 914
4 2013 1 4 915
5 2013 1 5 720
6 2013 1 6 832
7 2013 1 7 933
8 2013 1 8 899
9 2013 1 9 902
10 2013 1 10 932
# ... with 355 more rows
# A tibble: 12 x 3
# Groups: year [1]
year month flights
<int> <int> <int>
1 2013 1 27004
2 2013 2 24951
3 2013 3 28834
4 2013 4 28330
5 2013 5 28796
6 2013 6 28243
7 2013 7 29425
8 2013 8 29327
9 2013 9 27574
10 2013 10 28889
11 2013 11 27268
12 2013 12 28135
# A tibble: 1 x 2
year flights
<int> <int>
1 2013 336776
Grouping can be removed with ungroup()
# A tibble: 1 x 1
flights
<int>
1 336776
A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.
A flight is always 10 minutes late.
A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.
99% of the time a flight is on time. 1% of the time it’s 2 hours late.
Which is more important: arrival delay or departure delay?
> #1. destinations with average arrival delay <=-30 minutes (30 min early)
> #>10% of the time.
> flights %>% group_by(dest) %>% summarize(n=n(),
+ min_perc=mean(arr_delay<=-30, na.rm=TRUE)) %>%
+ filter(min_perc>0.10) %>%
+ arrange(desc(min_perc))# A tibble: 27 x 3
dest n min_perc
<chr> <int> <dbl>
1 ANC 8 0.25
2 SNA 825 0.208
3 ILM 110 0.178
4 SAT 686 0.178
5 LGB 668 0.174
6 ABQ 254 0.173
7 SEA 3923 0.171
8 OAK 312 0.165
9 HNL 707 0.164
10 SFO 13331 0.161
# ... with 17 more rows
> #2. destinations with average arrival delay >=30 minutes (30 min late)
> #>25% of the time.
> flights %>% group_by(dest) %>% summarize(n=n(),
+ min_perc=mean(arr_delay>=30, na.rm=TRUE)) %>%
+ filter(min_perc>0.25) %>%
+ arrange(desc(min_perc))# A tibble: 12 x 3
dest n min_perc
<chr> <int> <dbl>
1 CAE 116 0.481
2 OKC 346 0.390
3 TUL 315 0.384
4 JAC 25 0.381
5 TYS 631 0.308
6 DSM 569 0.275
7 BHM 297 0.275
8 MSN 572 0.273
9 RIC 2454 0.264
10 CRW 138 0.261
11 SMF 284 0.259
12 GSP 849 0.254
> #3. destinations with highest standard deviation of arrival delay
> flights %>% group_by(dest) %>% summarize(n=n(),
+ arr_delay_sd=sd(arr_delay, na.rm=TRUE)) %>%
+ arrange(desc(arr_delay_sd))# A tibble: 105 x 3
dest n arr_delay_sd
<chr> <int> <dbl>
1 HNL 707 60.8
2 TUL 315 60.3
3 TVC 101 59.3
4 CAK 864 58.3
5 TYS 631 58.0
6 SAT 686 57.8
7 DSM 569 56.9
8 MSN 572 56.8
9 BHM 297 56.2
10 CVG 3941 55.1
# ... with 95 more rows
> #4 average departure delay by airport
> flights %>% group_by(origin) %>% summarize(n=n(),
+ dep_delay_mean=mean(dep_delay, na.rm=TRUE)) # A tibble: 3 x 3
origin n dep_delay_mean
<chr> <int> <dbl>
1 EWR 120835 15.1
2 JFK 111279 12.1
3 LGA 104662 10.3
> #5 Planes with greatest average departure delay
> flights %>% group_by(tailnum) %>% summarize(n=n(),
+ dep_delay_mean=mean(dep_delay, na.rm=TRUE)) %>%
+ filter(n>5) %>%
+ arrange(desc(dep_delay_mean))# A tibble: 3,590 x 3
tailnum n dep_delay_mean
<chr> <int> <dbl>
1 N665MQ 6 177
2 N276AT 6 84.8
3 N652SW 6 79.5
4 N919FJ 6 78
5 N396SW 7 69.7
6 N354AT 10 65.1
7 N337AT 13 63.2
8 N388SW 7 59.3
9 N261AT 9 59
10 N550NW 7 58.6
# ... with 3,580 more rows
not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum,wt=distance) (without using count()).> not_cancelled <- flights %>%
+ filter(!is.na(dep_delay),!is.na(arr_delay))
>
> (x <- not_cancelled %>% count(dest))# A tibble: 104 x 2
dest n
<chr> <int>
1 ABQ 254
2 ACK 264
3 ALB 418
4 ANC 8
5 ATL 16837
6 AUS 2411
7 AVL 261
8 BDL 412
9 BGR 358
10 BHM 269
# ... with 94 more rows
# A tibble: 104 x 2
dest n
<chr> <int>
1 ABQ 254
2 ACK 264
3 ALB 418
4 ANC 8
5 ATL 16837
6 AUS 2411
7 AVL 261
8 BDL 412
9 BGR 358
10 BHM 269
# ... with 94 more rows
[1] TRUE
# A tibble: 4,037 x 2
tailnum n
<chr> <dbl>
1 D942DN 3418
2 N0EGMQ 239143
3 N10156 109664
4 N102UW 25722
5 N103US 24619
6 N104UW 24616
7 N10575 139903
8 N105UW 23618
9 N107US 21677
10 N108UW 32070
# ... with 4,027 more rows
# A tibble: 4,037 x 2
tailnum n
<chr> <dbl>
1 D942DN 3418
2 N0EGMQ 239143
3 N10156 109664
4 N102UW 25722
5 N103US 24619
6 N104UW 24616
7 N10575 139903
8 N105UW 23618
9 N107US 21677
10 N108UW 32070
# ... with 4,027 more rows
[1] TRUE
(is.na(dep_delay) | is.na(arr_delay) is suboptimal. Why? Which is the most important column?> #Some flights depart with no air time recorded
> flights %>% filter(is.na(air_time)) %>%
+ select(flight, dep_time, arr_time, dep_delay, arr_delay, air_time)# A tibble: 9,430 x 6
flight dep_time arr_time dep_delay arr_delay air_time
<int> <int> <int> <dbl> <dbl> <dbl>
1 4525 1525 1934 -5 NA NA
2 3806 1528 2002 29 NA NA
3 4413 1740 2158 -5 NA NA
4 1228 1807 2251 29 NA NA
5 3325 1939 29 59 NA NA
6 4333 1952 2358 22 NA NA
7 4204 2016 NA 46 NA NA
8 4308 NA NA NA NA NA
9 791 NA NA NA NA NA
10 1925 NA NA NA NA NA
# ... with 9,420 more rows
> #Flights that did not depart
> flights %>% filter(is.na(dep_time)) %>%
+ select(flight, dep_time, arr_time, dep_delay, arr_delay, air_time)# A tibble: 8,255 x 6
flight dep_time arr_time dep_delay arr_delay air_time
<int> <int> <int> <dbl> <dbl> <dbl>
1 4308 NA NA NA NA NA
2 791 NA NA NA NA NA
3 1925 NA NA NA NA NA
4 125 NA NA NA NA NA
5 4352 NA NA NA NA NA
6 4406 NA NA NA NA NA
7 4434 NA NA NA NA NA
8 4935 NA NA NA NA NA
9 3849 NA NA NA NA NA
10 133 NA NA NA NA NA
# ... with 8,245 more rows
> (cancelled_per_day <- flights %>%
+ mutate(cancelled = is.na(dep_time)) %>%
+ group_by(year, month, day) %>%
+ summarize(cancelled_num = sum(cancelled),flights_num = n()))# A tibble: 365 x 5
# Groups: year, month [12]
year month day cancelled_num flights_num
<int> <int> <int> <int> <int>
1 2013 1 1 4 842
2 2013 1 2 8 943
3 2013 1 3 10 914
4 2013 1 4 6 915
5 2013 1 5 3 720
6 2013 1 6 1 832
7 2013 1 7 3 933
8 2013 1 8 4 899
9 2013 1 9 5 902
10 2013 1 10 3 932
# ... with 355 more rows
> ggplot(cancelled_per_day) +
+ geom_point(aes(x = flights_num, y = cancelled_num), color="firebrick")+
+ theme_light()> (cancelled_and_delays <- flights %>%
+ mutate(cancelled = is.na(dep_time)) %>%
+ group_by(year, month, day) %>%
+ summarize(cancelled_prop = mean(cancelled),
+ avg_dep_delay = mean(dep_delay, na.rm = TRUE),
+ avg_arr_delay = mean(arr_delay, na.rm = TRUE)))# A tibble: 365 x 6
# Groups: year, month [12]
year month day cancelled_prop avg_dep_delay avg_arr_delay
<int> <int> <int> <dbl> <dbl> <dbl>
1 2013 1 1 0.00475 11.5 12.7
2 2013 1 2 0.00848 13.9 12.7
3 2013 1 3 0.0109 11.0 5.73
4 2013 1 4 0.00656 8.95 -1.93
5 2013 1 5 0.00417 5.73 -1.53
6 2013 1 6 0.00120 7.15 4.24
7 2013 1 7 0.00322 5.42 -4.95
8 2013 1 8 0.00445 2.55 -3.23
9 2013 1 9 0.00554 2.28 -0.264
10 2013 1 10 0.00322 2.84 -5.90
# ... with 355 more rows
> ggplot(cancelled_and_delays) +
+ geom_point(aes(x = avg_dep_delay, y = cancelled_prop), color="firebrick")+
+ theme_light()> flights %>%
+ group_by(carrier) %>%
+ summarize(arr_delay_mean = mean(arr_delay, na.rm = TRUE)) %>%
+ arrange(desc(arr_delay_mean)) %>% left_join(airlines)# A tibble: 16 x 3
carrier arr_delay_mean name
<chr> <dbl> <chr>
1 F9 21.9 Frontier Airlines Inc.
2 FL 20.1 AirTran Airways Corporation
3 EV 15.8 ExpressJet Airlines Inc.
4 YV 15.6 Mesa Airlines Inc.
5 OO 11.9 SkyWest Airlines Inc.
6 MQ 10.8 Envoy Air
7 WN 9.65 Southwest Airlines Co.
8 B6 9.46 JetBlue Airways
9 9E 7.38 Endeavor Air Inc.
10 UA 3.56 United Air Lines Inc.
11 US 2.13 US Airways Inc.
12 VX 1.76 Virgin America
13 DL 1.64 Delta Air Lines Inc.
14 AA 0.364 American Airlines Inc.
15 HA -6.92 Hawaiian Airlines Inc.
16 AS -9.93 Alaska Airlines Inc.
> flights %>%
+ select(tailnum, year, month,day, arr_delay) %>%
+ filter(!is.na(arr_delay)) %>%
+ #sort by plane and then day
+ arrange(tailnum, year, month, day) %>%
+ group_by(tailnum) %>%
+ # cumulative number of flights delayed over one hour
+ mutate(cum_hr_delays = cumsum(arr_delay > 60)) %>%
+ # count the number of flights == 0
+ summarize(total_flights = sum(cum_hr_delays < 1)) %>%
+ arrange(desc(total_flights))# A tibble: 4,037 x 2
tailnum total_flights
<chr> <int>
1 N717TW 119
2 N705TW 97
3 N765US 97
4 N3758Y 96
5 N12125 94
6 N320AA 94
7 N13110 91
8 N744P 90
9 N37456 88
10 N709TW 88
# ... with 4,027 more rows
sort argument to count() do? When might you use it?.# A tibble: 105 x 2
dest n
<chr> <int>
1 ORD 17283
2 ATL 17215
3 LAX 16174
4 BOS 15508
5 MCO 14082
6 CLT 14064
7 SFO 13331
8 FLL 12055
9 MIA 11728
10 DCA 9705
# ... with 95 more rows
The recently launched dplyr 1.0.0 includes a new across() function which allows you to perform the same operation across many columns. See the article:
https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/
> starwars %>%
+ group_by(species) %>%
+ filter(n() > 1) %>%
+ summarize(across(c(sex, gender, homeworld), n_distinct))# A tibble: 9 x 4
species sex gender homeworld
<chr> <int> <int> <int>
1 Droid 1 2 3
2 Gungan 1 1 1
3 Human 2 2 16
4 Kaminoan 2 2 1
5 Mirialan 1 1 1
6 Twi'lek 2 2 1
7 Wookiee 1 1 1
8 Zabrak 1 1 2
9 <NA> 1 1 3
> starwars %>%
+ group_by(homeworld) %>%
+ filter(n() > 1) %>%
+ summarize(across(where(is.numeric), mean, na.rm = TRUE), n = n())# A tibble: 10 x 5
homeworld height mass birth_year n
<chr> <dbl> <dbl> <dbl> <int>
1 Alderaan 176. 64 43 3
2 Corellia 175 78.5 25 2
3 Coruscant 174. 50 91 3
4 Kamino 208. 83.1 31.5 3
5 Kashyyyk 231 124 200 2
6 Mirial 168 53.1 49 2
7 Naboo 175. 64.2 55 11
8 Ryloth 179 55 48 2
9 Tatooine 170. 85.4 54.6 10
10 <NA> 139. 82 334. 10
In addition to summarize() you can also use mutate() and filter() with groups.
> #9 highest arr_delay by day
> flights_sml %>%
+ group_by(year,month,day) %>%
+ filter(rank(desc(arr_delay))<10)# A tibble: 3,306 x 7
# Groups: year, month, day [365]
year month day dep_delay arr_delay distance air_time
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 853 851 184 41
2 2013 1 1 290 338 1134 213
3 2013 1 1 260 263 266 46
4 2013 1 1 157 174 213 60
5 2013 1 1 216 222 708 121
6 2013 1 1 255 250 589 115
7 2013 1 1 285 246 1085 146
8 2013 1 1 192 191 199 44
9 2013 1 1 379 456 1092 222
10 2013 1 2 224 207 550 94
# ... with 3,296 more rows
> # Only destinations with >10,000 flights
> (popular_dests <- flights %>%
+ group_by(dest) %>%
+ filter(n()>10000))# A tibble: 131,440 x 19
# Groups: dest [9]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 542 540 2 923 850
2 2013 1 1 554 600 -6 812 837
3 2013 1 1 554 558 -4 740 728
4 2013 1 1 555 600 -5 913 854
5 2013 1 1 557 600 -3 838 846
6 2013 1 1 558 600 -2 753 745
7 2013 1 1 558 600 -2 924 917
8 2013 1 1 558 600 -2 923 937
9 2013 1 1 559 559 0 702 706
10 2013 1 1 600 600 0 851 858
# ... with 131,430 more rows, and 11 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> #proportion delay by destination
> popular_dests %>%
+ filter(arr_delay>0) %>%
+ mutate(prop_delay = arr_delay/sum(arr_delay)) %>%
+ select(year:day, dest, arr_delay, prop_delay)# A tibble: 50,245 x 6
# Groups: dest [9]
year month day dest arr_delay prop_delay
<int> <int> <int> <chr> <dbl> <dbl>
1 2013 1 1 MIA 33 0.000235
2 2013 1 1 ORD 12 0.0000424
3 2013 1 1 FLL 19 0.0000938
4 2013 1 1 ORD 8 0.0000283
5 2013 1 1 LAX 7 0.0000344
6 2013 1 1 ATL 12 0.0000400
7 2013 1 1 ORD 32 0.000113
8 2013 1 1 SFO 14 0.0000682
9 2013 1 1 MIA 5 0.0000356
10 2013 1 1 LAX 29 0.000143
# ... with 50,235 more rows
tailnum) has the worst on-time record?> worst_time <- flights %>% group_by(tailnum) %>%
+ summarize(avg_arr_delay=mean(arr_delay, na.rm=TRUE),
+ med_arr_delay=median(arr_delay, na.rm=TRUE),
+ n=n()) %>%
+ filter(n>20)
>
> #top 5 median arr_delay
> worst_time %>% arrange(desc(med_arr_delay)) %>% head(5)# A tibble: 5 x 4
tailnum avg_arr_delay med_arr_delay n
<chr> <dbl> <dbl> <int>
1 N203FR 59.1 21 41
2 N988AT 44.3 21 37
3 N353AT 41.2 17 21
4 N731SA 27.4 17 25
5 N980AT 35.2 17 47
# A tibble: 5 x 4
tailnum avg_arr_delay med_arr_delay n
<chr> <dbl> <dbl> <int>
1 N203FR 59.1 21 41
2 N645MQ 51 6.5 25
3 N956AT 47.6 5.5 36
4 N988AT 44.3 21 37
5 N521VA 42.2 -8 27
> flights %>%
+ group_by(hour) %>%
+ summarize(avg_arr_delay = mean(arr_delay, na.rm = TRUE),
+ med_arr_delay = median(arr_delay, na.rm = TRUE)) %>%
+ arrange(avg_arr_delay) %>% head(5)# A tibble: 5 x 3
hour avg_arr_delay med_arr_delay
<dbl> <dbl> <dbl>
1 7 -5.30 -10
2 5 -4.80 -8
3 6 -3.38 -8
4 9 -1.45 -8
5 8 -1.11 -8
> flightsx <- flights %>% filter(!is.na(arr_delay),arr_delay>0) %>%
+ group_by(dest) %>%
+ mutate(tot_arr_delay = sum(arr_delay),
+ prop_arr_delay=arr_delay/tot_arr_delay) %>%
+ select(dest,flight,tot_arr_delay,prop_arr_delay)
>
> flightsx %>% group_by(dest,flight) %>%
+ summarize(tot_prop=sum(prop_arr_delay))# A tibble: 8,505 x 3
# Groups: dest [103]
dest flight tot_prop
<chr> <int> <dbl>
1 ABQ 65 0.433
2 ABQ 1505 0.567
3 ACK 1191 0.475
4 ACK 1195 0.0208
5 ACK 1291 0.0898
6 ACK 1491 0.414
7 ALB 3260 0.0116
8 ALB 3264 0.000418
9 ALB 3811 0.0625
10 ALB 3817 0.0205
# ... with 8,495 more rows
lag() explore how the delay of a flight is related to the delay of the immediately preceding flight.> lagged_delays <- flights %>%
+ arrange(origin, month, day, dep_time) %>%
+ group_by(origin) %>%
+ mutate(dep_delay_lag = lag(dep_delay)) %>%
+ filter(!is.na(dep_delay), !is.na(dep_delay_lag))
>
> lagged_delays %>% select(origin, month, day,
+ dep_time, dep_delay,dep_delay_lag)# A tibble: 327,649 x 6
# Groups: origin [3]
origin month day dep_time dep_delay dep_delay_lag
<chr> <int> <int> <int> <dbl> <dbl>
1 EWR 1 1 554 -4 2
2 EWR 1 1 555 -5 -4
3 EWR 1 1 558 -2 -5
4 EWR 1 1 559 -1 -2
5 EWR 1 1 601 1 -1
6 EWR 1 1 606 -4 1
7 EWR 1 1 607 0 -4
8 EWR 1 1 608 8 0
9 EWR 1 1 615 0 8
10 EWR 1 1 622 -8 0
# ... with 327,639 more rows
> lagged_delays %>% group_by(dep_delay_lag) %>%
+ summarize(mean_dep_delay=mean(dep_delay)) %>%
+ ggplot(mapping=aes(y=mean_dep_delay,x=dep_delay_lag))+
+ geom_point(color="firebrick")+
+ scale_x_continuous(breaks = seq(0, 1500, by = 200)) +
+ labs(y = "Departure Delay", x = "Previous Departure Delay")> (flightsy <- flights %>% filter(!is.na(air_time)) %>%
+ mutate(speed=distance/air_time*60) %>%
+ group_by(dest,flight) %>%
+ mutate(avg_speed=mean(speed),max_speed=max(speed), sd_speed=sd(speed),
+ num_sd=(speed-avg_speed)/sd_speed) %>%
+ select(dest,flight,avg_speed,max_speed,sd_speed,num_sd) %>%
+ arrange(desc(num_sd)))# A tibble: 327,346 x 6
# Groups: dest, flight [11,421]
dest flight avg_speed max_speed sd_speed num_sd
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 ATL 1499 407. 703. 36.9 8.03
2 MSP 4667 396. 650. 33.4 7.61
3 BOS 2132 300. 526. 36.6 6.17
4 CVG 4687 356. 551. 32.7 5.96
5 BOS 2142 297. 480 32.1 5.71
6 PIT 5486 344. 502. 28.6 5.53
7 BUF 2002 326. 475. 28.1 5.30
8 PBI 1902 425. 591. 32.1 5.19
9 ROC 30 309. 453. 27.8 5.18
10 BNA 3805 391. 641. 48.7 5.14
# ... with 327,336 more rows
> #plot the number of standard deviations
> ggplot(flightsy, aes(x=num_sd))+
+ geom_histogram(binwidth=0.1, fill="firebrick")> #zoom in
> ggplot(flightsy, aes(x=num_sd))+
+ geom_histogram(binwidth=0.1, fill="firebrick")+
+ coord_cartesian(ylim=c(0,5))