##Load the dplyr package.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##Load the data file nycflights
library(nycflights13)
##filter: Filter the rows based on condition(s).
#where clause in select sql command.
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
head(filter(flights,month==11,day==3,carrier=='AA'))
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 3 538 545 -7 824
## 2 2013 11 3 556 600 -4 900
## 3 2013 11 3 604 610 -6 844
## 4 2013 11 3 624 629 -5 907
## 5 2013 11 3 625 630 -5 736
## 6 2013 11 3 653 655 -2 925
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##slice: Retrieve the set of rows by indices
slice(flights,1:10)
## # A tibble: 10 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##arrange: Sorting the rows based on conditions.
head(arrange(flights,year,month,day,desc(air_time)))
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 857 900 -3 1516
## 2 2013 1 1 1344 1344 0 2005
## 3 2013 1 1 1029 1030 -1 1427
## 4 2013 1 1 702 700 2 1058
## 5 2013 1 1 646 645 1 1023
## 6 2013 1 1 745 745 0 1135
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##select: Selects the specified columns to display
head(select(flights,carrier))
## # A tibble: 6 × 1
## carrier
## <chr>
## 1 UA
## 2 UA
## 3 AA
## 4 B6
## 5 DL
## 6 UA
##rename: to rename a column name(like alias)
head(rename(flights,airlines_carrier=carrier))
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # airlines_carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##distinct:to retrieve distinct values of a column
distinct(select(flights,carrier))
## # A tibble: 16 × 1
## carrier
## <chr>
## 1 UA
## 2 AA
## 3 B6
## 4 DL
## 5 EV
## 6 MQ
## 7 US
## 8 WN
## 9 VX
## 10 FL
## 11 AS
## 12 9E
## 13 F9
## 14 HA
## 15 YV
## 16 OO
##To add a new column based on existing columns
head(mutate(flights,new_col=arr_delay-dep_delay))
## # A tibble: 6 × 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, new_col <dbl>
head(transmute(flights,new_col=arr_delay-dep_delay))
## # A tibble: 6 × 1
## new_col
## <dbl>
## 1 9
## 2 16
## 3 31
## 4 -17
## 5 -19
## 6 16
##Summarise: It is like groupby. it only genereate
#one row.
summarise(flights,avg_air_time=mean(air_time,na.rm=T))
## # A tibble: 1 × 1
## avg_air_time
## <dbl>
## 1 150.6865
summarise(flights,total_time=sum(air_time,na.rm=T))
## # A tibble: 1 × 1
## total_time
## <dbl>
## 1 49326610
##To draw a sample of size 10 from a data set
sample_n(flights,10)
## # A tibble: 10 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 5 1323 1325 -2 1358
## 2 2013 7 27 1626 1605 21 1920
## 3 2013 10 26 1603 1600 3 1830
## 4 2013 10 19 1010 1020 -10 1204
## 5 2013 5 1 2145 2145 0 31
## 6 2013 3 26 1526 1530 -4 1858
## 7 2013 10 2 1449 1455 -6 1735
## 8 2013 11 8 1114 1114 0 1355
## 9 2013 7 21 1357 1359 -2 1522
## 10 2013 3 30 837 830 7 1200
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##To draw a fraction of rows from a data set
sample_frac(flights,.1)
## # A tibble: 33,678 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 5 1858 1419 279 2134
## 2 2013 10 14 611 610 1 806
## 3 2013 9 7 1331 1340 -9 1448
## 4 2013 4 14 1953 2000 -7 2201
## 5 2013 12 30 1556 1600 -4 1914
## 6 2013 11 10 1300 1300 0 1421
## 7 2013 11 11 1508 1500 8 1810
## 8 2013 11 13 829 835 -6 951
## 9 2013 10 11 1920 1829 51 2035
## 10 2013 10 17 1805 1759 6 2050
## # ... with 33,668 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##Pipe operator
df<-flights
r<-df %>% filter(month==11)%>%sample_n(size=5)%>%arrange(desc(arr_time))
r
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 1327 1330 -3 1602
## 2 2013 11 26 1257 1300 -3 1534
## 3 2013 11 11 819 830 -11 958
## 4 2013 11 12 732 736 -4 903
## 5 2013 11 30 2119 2120 -1 12
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>