library(nycflights13)
library(tidyverse)

Filter

jan1<-filter(flights, month==1, day==1)

(dec25<-filter(flights, month==12, day==25)) #wrap in parentheses to save and print variable

## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    25      456            500        -4      649            651
##  2  2013    12    25      524            515         9      805            814
##  3  2013    12    25      542            540         2      832            850
##  4  2013    12    25      546            550        -4     1022           1027
##  5  2013    12    25      556            600        -4      730            745
##  6  2013    12    25      557            600        -3      743            752
##  7  2013    12    25      557            600        -3      818            831
##  8  2013    12    25      559            600        -1      855            856
##  9  2013    12    25      559            600        -1      849            855
## 10  2013    12    25      600            600         0      850            846
## # ... with 709 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#filter(flights, month ==11|month==12)  # Doesn't work! Instead use:
filter(flights, month %in% c(11, 12)) # filters multiple values - month==11 OR 12

## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

filter(flights, !(arr_delay>120 | dep_delay>120)) #filters to flights NOT delayed by more than 120 mins

## # A tibble: 316,050 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 316,040 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

filter(flights, arr_delay<=120, dep_delay<=120) #alternate

## # A tibble: 316,050 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 316,040 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#missing values
df<- tibble(x=c(1, NA, 3))
is.na(df) # determine any missing values

##          x
## [1,] FALSE
## [2,]  TRUE
## [3,] FALSE

filter(df, x>1) #will exclude NA values

## # A tibble: 1 x 1
##       x
##   <dbl>
## 1     3

filter(df, is.na(x)|x>1) # explicitly retain NA values

## # A tibble: 2 x 1
##       x
##   <dbl>
## 1    NA
## 2     3

Arrange

arrange(flights, year, month, day) #sorts ascending, subsequent selections break ties

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

arrange(flights, desc(arr_delay)) #sorts descending

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     7    22     2257            759       898      121           1026
##  9  2013    12     5      756           1700       896     1058           2020
## 10  2013     5     3     1133           2055       878     1250           2215
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

#Note: missing values always sorted at the end

Select

select(flights, year, month, day) #select by name

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

select(flights, year:day) #select all columns between year and day (inclusive)

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

select(flights, -(year:day)) #select all columns except those between year and day (inclusive)

## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
##  1      517            515         2      830            819        11 UA     
##  2      533            529         4      850            830        20 UA     
##  3      542            540         2      923            850        33 AA     
##  4      544            545        -1     1004           1022       -18 B6     
##  5      554            600        -6      812            837       -25 DL     
##  6      554            558        -4      740            728        12 UA     
##  7      555            600        -5      913            854        19 B6     
##  8      557            600        -3      709            723       -14 EV     
##  9      557            600        -3      838            846        -8 B6     
## 10      558            600        -2      753            745         8 AA     
## # ... with 336,766 more rows, and 9 more variables: flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

#Helper functions
?select

## starting httpd help server ... done

select(flights, starts_with("dep")) #starts with

## # A tibble: 336,776 x 2
##    dep_time dep_delay
##       <int>     <dbl>
##  1      517         2
##  2      533         4
##  3      542         2
##  4      544        -1
##  5      554        -6
##  6      554        -4
##  7      555        -5
##  8      557        -3
##  9      557        -3
## 10      558        -2
## # ... with 336,766 more rows

select(flights, ends_with("delay")) #ends with

## # A tibble: 336,776 x 2
##    dep_delay arr_delay
##        <dbl>     <dbl>
##  1         2        11
##  2         4        20
##  3         2        33
##  4        -1       -18
##  5        -6       -25
##  6        -4        12
##  7        -5        19
##  8        -3       -14
##  9        -3        -8
## 10        -2         8
## # ... with 336,766 more rows

select(flights, contains("arr")) #contains

## # A tibble: 336,776 x 4
##    arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl> <chr>  
##  1      830            819        11 UA     
##  2      850            830        20 UA     
##  3      923            850        33 AA     
##  4     1004           1022       -18 B6     
##  5      812            837       -25 DL     
##  6      740            728        12 UA     
##  7      913            854        19 B6     
##  8      709            723       -14 EV     
##  9      838            846        -8 B6     
## 10      753            745         8 AA     
## # ... with 336,766 more rows

select(flights, matches("(.)\\1")) #matches variable with repeated  characters

## # A tibble: 336,776 x 4
##    arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl> <chr>  
##  1      830            819        11 UA     
##  2      850            830        20 UA     
##  3      923            850        33 AA     
##  4     1004           1022       -18 B6     
##  5      812            837       -25 DL     
##  6      740            728        12 UA     
##  7      913            854        19 B6     
##  8      709            723       -14 EV     
##  9      838            846        -8 B6     
## 10      753            745         8 AA     
## # ... with 336,766 more rows

select(flights, num_range("x", 1:3))

## # A tibble: 336,776 x 0

rename(flights, tail_num= tailnum) #changes tailnum to tail_num

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

select(flights, time_hour, air_time, everything()) #move selected vars to start, then everything else

## # A tibble: 336,776 x 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00      227  2013     1     1      517            515
##  2 2013-01-01 05:00:00      227  2013     1     1      533            529
##  3 2013-01-01 05:00:00      160  2013     1     1      542            540
##  4 2013-01-01 05:00:00      183  2013     1     1      544            545
##  5 2013-01-01 06:00:00      116  2013     1     1      554            600
##  6 2013-01-01 05:00:00      150  2013     1     1      554            558
##  7 2013-01-01 06:00:00      158  2013     1     1      555            600
##  8 2013-01-01 06:00:00       53  2013     1     1      557            600
##  9 2013-01-01 06:00:00      140  2013     1     1      557            600
## 10 2013-01-01 06:00:00      138  2013     1     1      558            600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>

Mutate

#reduce dataset for easier visibility
(flights_sml<- select(flights,
                     year:day,
                     ends_with("delay"),
                     distance,
                     air_time))

## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # ... with 336,766 more rows

#Mutate Adds new column to the end of the dataset
mutate(flights_sml, 
       gain = arr_delay - dep_delay, 
       speed = distance/air_time*60)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227     9  370.
##  2  2013     1     1         4        20     1416      227    16  374.
##  3  2013     1     1         2        33     1089      160    31  408.
##  4  2013     1     1        -1       -18     1576      183   -17  517.
##  5  2013     1     1        -6       -25      762      116   -19  394.
##  6  2013     1     1        -4        12      719      150    16  288.
##  7  2013     1     1        -5        19     1065      158    24  404.
##  8  2013     1     1        -3       -14      229       53   -11  259.
##  9  2013     1     1        -3        -8      944      140    -5  405.
## 10  2013     1     1        -2         8      733      138    10  319.
## # ... with 336,766 more rows

#If you only want to keep new variables, use transmute instead
transmute(flights_sml, 
       gain = arr_delay - dep_delay, 
       speed = distance/air_time*60)

## # A tibble: 336,776 x 2
##     gain speed
##    <dbl> <dbl>
##  1     9  370.
##  2    16  374.
##  3    31  408.
##  4   -17  517.
##  5   -19  394.
##  6    16  288.
##  7    24  404.
##  8   -11  259.
##  9    -5  405.
## 10    10  319.
## # ... with 336,766 more rows

#Modular arithmetic (%/% and %%)
#Allows you to break integers into pieces
transmute(flights,
          dep_time,
          hour=dep_time%/%100,
          minute=dep_time%% 100
          )

## # A tibble: 336,776 x 3
##    dep_time  hour minute
##       <int> <dbl>  <dbl>
##  1      517     5     17
##  2      533     5     33
##  3      542     5     42
##  4      544     5     44
##  5      554     5     54
##  6      554     5     54
##  7      555     5     55
##  8      557     5     57
##  9      557     5     57
## 10      558     5     58
## # ... with 336,766 more rows

Offsets

#lead and lag allow you to refer to leading or lagging values.
#They are most useful in conjunction with group_by()
(x<- 1:10)

##  [1]  1  2  3  4  5  6  7  8  9 10

lag(x)

##  [1] NA  1  2  3  4  5  6  7  8  9

lead(x)

##  [1]  2  3  4  5  6  7  8  9 10 NA

Cumulative aggregates

##  [1]  1  2  3  4  5  6  7  8  9 10

cumsum(x)

##  [1]  1  3  6 10 15 21 28 36 45 55

cummean(x)

##  [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5

Ranking

# min_rank - the default gives the smallest values the smallest ranks.
y<-c(2, 1, 2, NA, 3, 4)
min_rank(y)

## [1]  2  1  2 NA  4  5

min_rank(desc(y))

## [1]  3  5  3 NA  2  1

# other rankings
row_number(y)

## [1]  2  1  3 NA  4  5

dense_rank(y)

## [1]  2  1  2 NA  3  4

percent_rank(y)

## [1] 0.25 0.00 0.25   NA 0.75 1.00

cume_dist(y)

## [1] 0.6 0.2 0.6  NA 0.8 1.0

Grouped summaries

# summarise collapses a data frame to a single row
summarise(flights, 
          delay = mean(dep_delay, na.rm=TRUE))

## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6

by_day<- group_by(flights, year, month, day)
summarise(by_day, delay=mean(dep_delay, na.rm=TRUE))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

Pipe

# Option 1
by_dest<-group_by(flights, dest)
delay<- summarise(by_dest, 
                  count = n(),
                  dist = mean(distance, na.rm = TRUE),
                  delay = mean(arr_delay, na.rm = TRUE))
delay<- filter(delay, count>20, dest !="HNL")

ggplot(data = delay, mapping = aes(x=dist, y=delay))+
  geom_point(aes(size = count), alpha = 1/3)+
  geom_smooth(se=FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Option 2 - simplified with %>% 
delays<- flights %>% 
  group_by(dest) %>% 
  summarise(count=n(),
            dist = mean(distance, na.rm = TRUE),
            delay = mean(arr_delay, na.rm = TRUE)) %>% 
  filter(count>20, dest != "HNL") %>% 
  ggplot(data = delay, mapping = aes(x=dist, y=delay))+
  geom_point(aes(size = count), alpha = 1/3)+
  geom_smooth(se=FALSE)

delays

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Missing values

#If there's any missing value in the input, the output will be a missing value
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean=mean(dep_delay))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1    NA
##  2  2013     1     2    NA
##  3  2013     1     3    NA
##  4  2013     1     4    NA
##  5  2013     1     5    NA
##  6  2013     1     6    NA
##  7  2013     1     7    NA
##  8  2013     1     8    NA
##  9  2013     1     9    NA
## 10  2013     1    10    NA
## # ... with 355 more rows

#All aggregate functions have an na.rm argument to remove missing values prior to computation
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean=mean(dep_delay, na.rm=TRUE))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

# Where missing values represent cancelled flights, remove the cancelled flights instead
not_cancelled<- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(mean=mean(dep_delay))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ... with 355 more rows

Counts

# planes with highest average delays
delays<- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(delay=mean(arr_delay))

ggplot(data=delays, mapping = aes(x=delay))+
  geom_freqpoly(binwidth=10)

# instead, look at number of flights v ave delay
# there is much greater variation in ave delays when there are few flights. 
# variation decreases as sample size increases
delays<- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(delay=mean(arr_delay, na.rm=TRUE),
            n=n())

ggplot(data=delays, mapping = aes(x=n, y=delay))+
  geom_point(alpha=1/10)

# it's often useful to filter out the outlier groups so you can see more of the pattern
delays %>% 
  filter(n>25) %>% 
  ggplot(mapping = aes(x=n, y=delay))+
  geom_point(alpha=1/10)

# variation decreases with more data points, and there is a positive correlation between skill and opportunity to play

batting<- as_tibble(Lahman::Batting)

batters<- batting %>% 
  group_by(playerID) %>% 
  summarise(
    ba=sum(H, na.rm=TRUE)/sum(AB, na.rm=TRUE),
    ab=sum(AB, na.rm=TRUE)
  )

batters %>% 
  filter(ab>100) %>% 
  ggplot(mapping=aes(x=ab, y=ba))+
  geom_point()+
  geom_smooth(se=FALSE)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#This has implications for ranking. If you naively sort desc(ba), the pople with the best ba are lucky, not skilled

batters %>% arrange(desc(ba))

## # A tibble: 19,898 x 3
##    playerID     ba    ab
##    <chr>     <dbl> <int>
##  1 abramge01     1     1
##  2 alanirj01     1     1
##  3 alberan01     1     1
##  4 banisje01     1     1
##  5 bartocl01     1     1
##  6 bassdo01      1     1
##  7 birasst01     1     2
##  8 bruneju01     1     1
##  9 burnscb01     1     1
## 10 cammaer01     1     1
## # ... with 19,888 more rows

#refer http://bit.ly/Bayesbbal and http://bit.ly/notsortavg

Summary functions

#mean
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    #avg delay:
    avg_delay1=mean(arr_delay),
    #avg positive delay:
    avg_delay2=mean(arr_delay[arr_delay>0])
    )

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day avg_delay1 avg_delay2
##    <int> <int> <int>      <dbl>      <dbl>
##  1  2013     1     1     12.7         32.5
##  2  2013     1     2     12.7         32.0
##  3  2013     1     3      5.73        27.7
##  4  2013     1     4     -1.93        28.3
##  5  2013     1     5     -1.53        22.6
##  6  2013     1     6      4.24        24.4
##  7  2013     1     7     -4.95        27.8
##  8  2013     1     8     -3.23        20.8
##  9  2013     1     9     -0.264       25.6
## 10  2013     1    10     -5.90        27.3
## # ... with 355 more rows

#Measures of spread
#standard deviation=sd(x), interquartile range= IQR(x), median absolute deviation=mad(x)
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(distance_sd=sd(distance)) %>% 
  arrange(desc(distance_sd))

## # A tibble: 104 x 2
##    dest  distance_sd
##    <chr>       <dbl>
##  1 EGE         10.5 
##  2 SAN         10.4 
##  3 SFO         10.2 
##  4 HNL         10.0 
##  5 SEA          9.98
##  6 LAS          9.91
##  7 PDX          9.87
##  8 PHX          9.86
##  9 LAX          9.66
## 10 IND          9.46
## # ... with 94 more rows

#Measures of rank
#min(x), quantile(x, 0.25), max(x)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(first = min(dep_time),
            last = max(dep_time)
  )

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # ... with 355 more rows

#Measures of position
# first(x), nth(x, 2), last(x)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(first(dep_time),
            last(dep_time),
            nth(dep_time, 2)
  )

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 6
## # Groups:   year, month [12]
##     year month   day `first(dep_time)` `last(dep_time)` `nth(dep_time, 2)`
##    <int> <int> <int>             <int>            <int>              <int>
##  1  2013     1     1               517             2356                533
##  2  2013     1     2                42             2354                126
##  3  2013     1     3                32             2349                 50
##  4  2013     1     4                25             2358                106
##  5  2013     1     5                14             2357                 37
##  6  2013     1     6                16             2355                458
##  7  2013     1     7                49             2359                454
##  8  2013     1     8               454             2351                524
##  9  2013     1     9                 2             2252                  8
## 10  2013     1    10                 3             2320                 16
## # ... with 355 more rows

#Filter on ranks
not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r= min_rank(desc(dep_time))) %>% 
  filter(r %in% range(r))

## # A tibble: 770 x 20
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1     2356           2359        -3      425            437
##  3  2013     1     2       42           2359        43      518            442
##  4  2013     1     2     2354           2359        -5      413            437
##  5  2013     1     3       32           2359        33      504            442
##  6  2013     1     3     2349           2359       -10      434            445
##  7  2013     1     4       25           2359        26      505            442
##  8  2013     1     4     2358           2359        -1      429            437
##  9  2013     1     4     2358           2359        -1      436            445
## 10  2013     1     5       14           2359        15      503            445
## # ... with 760 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   r <int>

#count number of distinct values
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))

## # A tibble: 104 x 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # ... with 94 more rows

#simple counts
not_cancelled %>% 
  count(dest)

## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

#add weight var to sum total distance flown
not_cancelled %>% 
  count(tailnum, wt = distance)

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows

#counts and proportions of logical values e.g.sum(x>10), mean(y==0)
#when used with numeric functions, sums the number of true occurrences 

#How many flights left before 5am
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(n_early = sum(dep_time<500))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day n_early
##    <int> <int> <int>   <int>
##  1  2013     1     1       0
##  2  2013     1     2       3
##  3  2013     1     3       4
##  4  2013     1     4       3
##  5  2013     1     5       3
##  6  2013     1     6       2
##  7  2013     1     7       2
##  8  2013     1     8       1
##  9  2013     1     9       3
## 10  2013     1    10       3
## # ... with 355 more rows

#What proportion of flights are delayed more than an hour?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(hour_perc = mean(arr_delay>60))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day hour_perc
##    <int> <int> <int>     <dbl>
##  1  2013     1     1    0.0722
##  2  2013     1     2    0.0851
##  3  2013     1     3    0.0567
##  4  2013     1     4    0.0396
##  5  2013     1     5    0.0349
##  6  2013     1     6    0.0470
##  7  2013     1     7    0.0333
##  8  2013     1     8    0.0213
##  9  2013     1     9    0.0202
## 10  2013     1    10    0.0183
## # ... with 355 more rows

Grouping multiple variables

#when you group multiple variables, each summary peels off one level of grouping 
daily<- group_by(flights, year, month, day)

(per_day<-summarise(daily, flights=n()))

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows

(per_month<- summarise(per_day, flights=sum(flights)))

## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

## # A tibble: 12 x 3
## # Groups:   year [1]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135

(per_year<- summarise(per_month, flights=sum(flights)))

## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776

Ungrouping

# To remove grouping and return to operations on ungrouped data
daily %>% 
  ungroup() %>% 
  summarise(flights=n())

## # A tibble: 1 x 1
##   flights
##     <int>
## 1  336776

Grouped Mutates (and Filters)

#grouping is most useful in conjunction with summarise, butyu can also do convenient operations with mutate() and filter()


#find the worst members of each group
flights_sml %>% 
  group_by(year, month, day) %>% 
  filter(rank(desc(arr_delay))<10)

## # A tibble: 3,306 x 7
## # Groups:   year, month, day [365]
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1       853       851      184       41
##  2  2013     1     1       290       338     1134      213
##  3  2013     1     1       260       263      266       46
##  4  2013     1     1       157       174      213       60
##  5  2013     1     1       216       222      708      121
##  6  2013     1     1       255       250      589      115
##  7  2013     1     1       285       246     1085      146
##  8  2013     1     1       192       191      199       44
##  9  2013     1     1       379       456     1092      222
## 10  2013     1     2       224       207      550       94
## # ... with 3,296 more rows

#find all groups bigger than a threshold
popular_dests<-flights %>% 
  group_by(dest) %>% 
  filter(n()>365)

#standardise to compute per group metrics
popular_dests %>% 
  filter(arr_delay>0) %>% 
  mutate(prop_delay=arr_delay/sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)

## # A tibble: 131,106 x 6
## # Groups:   dest [77]
##     year month   day dest  arr_delay prop_delay
##    <int> <int> <int> <chr>     <dbl>      <dbl>
##  1  2013     1     1 IAH          11  0.000111 
##  2  2013     1     1 IAH          20  0.000201 
##  3  2013     1     1 MIA          33  0.000235 
##  4  2013     1     1 ORD          12  0.0000424
##  5  2013     1     1 FLL          19  0.0000938
##  6  2013     1     1 ORD           8  0.0000283
##  7  2013     1     1 LAX           7  0.0000344
##  8  2013     1     1 DFW          31  0.000282 
##  9  2013     1     1 ATL          12  0.0000400
## 10  2013     1     1 DTW          16  0.000116 
## # ... with 131,096 more rows

Notes

Filter

Arrange

Select

Mutate

Offsets

Cumulative aggregates

Ranking

Grouped summaries

Pipe

Missing values

Counts

Summary functions

Grouping multiple variables

Ungrouping

Grouped Mutates (and Filters)