Functions

DataSet %>%
filter()
group_by()
arrange()

  • filter() to select cases based on their values.
  • arrange() to reorder the cases.
  • select() and rename() to select variables based on their names.
  • mutate() and transmute() to add new variables that are functions of existing variables.
  • summarise() to condense multiple values to a single value.
  • sample_n() and sample_frac() to take random samples.

References: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html Data Table: https://github.com/Rdatatable/data.table/wiki

library(nycflights13)
library(dplyr)
head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

filter()

filter(flights, month == 1, year == 2013)
## # A tibble: 27,004 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 26,994 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Its the same:

flights[flights$month == 1 & flights$day == 1, ]
flights %>%
  filter(dep_time > 500 | dep_time < 545)
## # A tibble: 328,521 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

arrange()

arrange(flights, year, month, day)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Use desc() to order a column in descending order:

arrange(flights, desc(arr_delay))

select()

# Select columns by name
select(flights, year, month, day)
#Select all columns between year and day (inclusive)
select(flights, year:day)
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
# Select columns that starts with string
 flights %>% 
  select(starts_with("dep"))

rename()

#You can rename variables with select() by using named arguments:
select(flights, tail_num = tailnum)
#Just to rename:
rename(flights, tail_num = tailnum)

mutate()

#To create NEW COLUMNS
mutate(flights,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
#To filter and ADD COLUMNS
df <- select(flights, year:dep_time)
mutate(df, "year", 2)

#New Column adding +10 to a column
mutate(df, year + 10)

sample()

# to take a random sample of rows
sample_n(flights, 10)
sample_frac(flights, 0.01)

group_by()

by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))
delay <- filter(delay, count > 20, dist < 2000)

#Group and convert 
group_by(df, month = as.factor(month))

#Group and bin
group_by(df, day_binned = cut(day, 3))

summarise()

summarise() with aggregate functions, which take a vector of values and return a single number. There are many useful examples of such functions in base R like min(), max(), mean(), sum(), sd(), median(), and IQR(). dplyr provides a handful of others:

  • n(): the number of observations in the current group
  • n_distinct(x):the number of unique values in x.
  • first(x), last(x) and nth(x, n) - these work similarly to x[1], x[length(x)], and x[n] but give you more control over the result if the value is missing.

You can use summarise function to extract two results. Like summarise(newcolumName1 = mean(Column), newcolumnName2 = n())

bind_rows

This function bind all the rows together if the columns are equal.

newData <- bind_rows(df, df, df)