Functions

DataSet %>%
filter()
group_by()
arrange()
…

filter() to select cases based on their values.
arrange() to reorder the cases.
select() and rename() to select variables based on their names.
mutate() and transmute() to add new variables that are functions of existing variables.
summarise() to condense multiple values to a single value.
sample_n() and sample_frac() to take random samples.

References: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html Data Table: https://github.com/Rdatatable/data.table/wiki

library(nycflights13)
library(dplyr)
head(flights)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

filter()

filter(flights, month == 1, year == 2013)

## # A tibble: 27,004 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 26,994 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Its the same:

flights[flights$month == 1 & flights$day == 1, ]

flights %>%
  filter(dep_time > 500 | dep_time < 545)

## # A tibble: 328,521 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

arrange()

arrange(flights, year, month, day)

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Use desc() to order a column in descending order:

arrange(flights, desc(arr_delay))

select()

# Select columns by name
select(flights, year, month, day)
#Select all columns between year and day (inclusive)
select(flights, year:day)
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
# Select columns that starts with string
 flights %>% 
  select(starts_with("dep"))

rename()

#You can rename variables with select() by using named arguments:
select(flights, tail_num = tailnum)
#Just to rename:
rename(flights, tail_num = tailnum)

mutate()

#To create NEW COLUMNS
mutate(flights,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
#To filter and ADD COLUMNS
df <- select(flights, year:dep_time)
mutate(df, "year", 2)

#New Column adding +10 to a column
mutate(df, year + 10)

sample()

# to take a random sample of rows
sample_n(flights, 10)
sample_frac(flights, 0.01)

group_by()

by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))
delay <- filter(delay, count > 20, dist < 2000)

#Group and convert 
group_by(df, month = as.factor(month))

#Group and bin
group_by(df, day_binned = cut(day, 3))

summarise()

summarise() with aggregate functions, which take a vector of values and return a single number. There are many useful examples of such functions in base R like min(), max(), mean(), sum(), sd(), median(), and IQR(). dplyr provides a handful of others:

n(): the number of observations in the current group
n_distinct(x):the number of unique values in x.
first(x), last(x) and nth(x, n) - these work similarly to x[1], x[length(x)], and x[n] but give you more control over the result if the value is missing.

You can use summarise function to extract two results. Like summarise(newcolumName1 = mean(Column), newcolumnName2 = n())

bind_rows

This function bind all the rows together if the columns are equal.

newData <- bind_rows(df, df, df)

Dplyr: Data Manipulation

el Sanju