##Load the dplyr package.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##Load the data file nycflights
library(nycflights13)
##filter: Filter the rows based on condition(s).
#where clause in select sql command.
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
head(filter(flights,month==11,day==3,carrier=='AA'))
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013    11     3      538            545        -7      824
## 2  2013    11     3      556            600        -4      900
## 3  2013    11     3      604            610        -6      844
## 4  2013    11     3      624            629        -5      907
## 5  2013    11     3      625            630        -5      736
## 6  2013    11     3      653            655        -2      925
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
##slice: Retrieve the set of rows by indices
slice(flights,1:10)
## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
##arrange: Sorting the rows based on conditions.
head(arrange(flights,year,month,day,desc(air_time)))
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      857            900        -3     1516
## 2  2013     1     1     1344           1344         0     2005
## 3  2013     1     1     1029           1030        -1     1427
## 4  2013     1     1      702            700         2     1058
## 5  2013     1     1      646            645         1     1023
## 6  2013     1     1      745            745         0     1135
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
##select: Selects the specified columns to display
head(select(flights,carrier))
## # A tibble: 6 × 1
##   carrier
##     <chr>
## 1      UA
## 2      UA
## 3      AA
## 4      B6
## 5      DL
## 6      UA
##rename: to rename a column name(like alias)
head(rename(flights,airlines_carrier=carrier))
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   airlines_carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
##distinct:to retrieve distinct values of a column
distinct(select(flights,carrier))
## # A tibble: 16 × 1
##    carrier
##      <chr>
## 1       UA
## 2       AA
## 3       B6
## 4       DL
## 5       EV
## 6       MQ
## 7       US
## 8       WN
## 9       VX
## 10      FL
## 11      AS
## 12      9E
## 13      F9
## 14      HA
## 15      YV
## 16      OO
##To add a new column based on existing columns
head(mutate(flights,new_col=arr_delay-dep_delay))
## # A tibble: 6 × 20
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, new_col <dbl>
head(transmute(flights,new_col=arr_delay-dep_delay))
## # A tibble: 6 × 1
##   new_col
##     <dbl>
## 1       9
## 2      16
## 3      31
## 4     -17
## 5     -19
## 6      16
##Summarise: It is like groupby. it only genereate
#one row.
summarise(flights,avg_air_time=mean(air_time,na.rm=T))
## # A tibble: 1 × 1
##   avg_air_time
##          <dbl>
## 1     150.6865
summarise(flights,total_time=sum(air_time,na.rm=T))
## # A tibble: 1 × 1
##   total_time
##        <dbl>
## 1   49326610
##To draw a sample of size 10 from a data set
sample_n(flights,10)
## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     5     1323           1325        -2     1358
## 2   2013     7    27     1626           1605        21     1920
## 3   2013    10    26     1603           1600         3     1830
## 4   2013    10    19     1010           1020       -10     1204
## 5   2013     5     1     2145           2145         0       31
## 6   2013     3    26     1526           1530        -4     1858
## 7   2013    10     2     1449           1455        -6     1735
## 8   2013    11     8     1114           1114         0     1355
## 9   2013     7    21     1357           1359        -2     1522
## 10  2013     3    30      837            830         7     1200
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
##To draw a fraction of rows from a data set
sample_frac(flights,.1)
## # A tibble: 33,678 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013    12     5     1858           1419       279     2134
## 2   2013    10    14      611            610         1      806
## 3   2013     9     7     1331           1340        -9     1448
## 4   2013     4    14     1953           2000        -7     2201
## 5   2013    12    30     1556           1600        -4     1914
## 6   2013    11    10     1300           1300         0     1421
## 7   2013    11    11     1508           1500         8     1810
## 8   2013    11    13      829            835        -6      951
## 9   2013    10    11     1920           1829        51     2035
## 10  2013    10    17     1805           1759         6     2050
## # ... with 33,668 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
##Pipe operator
df<-flights
r<-df %>% filter(month==11)%>%sample_n(size=5)%>%arrange(desc(arr_time))
r
## # A tibble: 5 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013    11     1     1327           1330        -3     1602
## 2  2013    11    26     1257           1300        -3     1534
## 3  2013    11    11      819            830       -11      958
## 4  2013    11    12      732            736        -4      903
## 5  2013    11    30     2119           2120        -1       12
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>