knitr::opts_chunk$set(echo = TRUE)

1. 处理观测值

1. Extract rows that meet logical criteria.

library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
iris %>% glimpse()
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...
iris %>% dplyr::filter(Sepal.Length > 7) %>% 
  datatable()

2. Remove rows with duplicate values.

dplyr::distinct(iris, Species) %>% 
  datatable()

3. Randomly select fraction of rows.

dplyr::sample_frac(iris, 0.5) %>% 
  datatable()

4. Randomly select size rows.

iris1 <- iris %>% 
  mutate(id = seq(150)) %>%
  select(id,everything())

sample_n(iris1, 10) %>% 
  datatable()

5. Select rows by position.

slice(iris1,10:15) %>% 
  datatable()

6. Select and order top n entries (by group if grouped data).

iris1 %>% top_n(10,Sepal.Length) %>% 
  datatable()
iris1 %>% 
  arrange(desc(Sepal.Length)) %>% 
  slice(1:10) %>% 
  datatable()

7. Logical and boolean operators

Logical and boolean operators to use with filter()< <= is.na() %in% | xor() > >= !is.na() ! &

mpg %>% datatable()
mpg %>% filter(displ < 2) %>% 
  datatable()
mpg %>% mutate(id = c(1:232,NA,NA)) %>% 
  filter(is.na(id)) %>% 
  datatable()
mpg %>% 
  filter(drv == "f"|drv == "4") %>% 
  datatable()

8. Order rows by values of a column or columns

Order rows by values of a column or columns (low to high), use with desc() to order from high to low.

mtcars %>% arrange(mpg) %>% 
  datatable()
mtcars %>% arrange(desc(mpg)) %>% 
  datatable()

9. **Add one or more rows* to a table.

faithful %>% head()
##   eruptions waiting
## 1     3.600      79
## 2     1.800      54
## 3     3.333      74
## 4     2.283      62
## 5     4.533      85
## 6     2.883      55
faithful %>% add_case(eruptions = 1,waiting = 1) %>% tail() %>% datatable()

2. 处理变量

1. pull: Extract column values as a vector.

Choose by name or index. However,Select: Extract columns as a table.

dplyr::pull(iris1,Sepal.Length) %>% str()
##  num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
iris1 %>% select(Sepal.Length) %>% str()
## 'data.frame':    150 obs. of  1 variable:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...

2. Use these helpers with select():

  • starts_with(): Starts with a prefix.

  • ends_with(): Ends with a suffix.

  • contains(): Contains a literal string.

  • matches(): Matches a regular expression.

  • num_range(): Matches a numerical range like x01, x02, x03.

  • one_of(): Matches variable names in a character vector.

  • everything(): Matches all variables.

  • last_col(): Select last variable, possibly with an offset.

names(iris1)
## [1] "id"           "Sepal.Length" "Sepal.Width"  "Petal.Length"
## [5] "Petal.Width"  "Species"
iris1 %>% select(contains("Se")) %>% head() %>% datatable()
iris1 %>% select(ends_with("th")) %>% head() %>% datatable()
iris1 %>% select(starts_with("Se")) %>% head() %>% datatable()
iris1 %>% select(id:Sepal.Length) %>% head() %>% datatable()
iris1 %>% select(matches(".t.")) %>% head %>% datatable()

3. Compute new column(s):mutate()

# Compute new column(s)
mutate(mtcars,gpm = 1/mpg) %>% head() %>% datatable()
# Apply funs to every column. 
mutate_all(faithful, funs(log(.),log2(.))) %>% head() %>% datatable()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
mutate_if(iris,is.numeric,list(log = log)) %>% head() %>% datatable()
# Apply funs to specific columns. 
iris %>% mutate_at(vars(-Species),list(log = log)) %>% head() %>% datatable()

4. Add new column(s).

add_column(mtcars, new = 1:32) %>% head() %>% datatable()

5. Rename columns.

rename(iris, Length = Sepal.Length) %>% head() %>% datatable()

3. 总结摘要

1. summarise functions

These apply summary functions to columns to create a new table of summarise statistics. Summarise functions take vectors as input and return one value (see back).

mtcars %>% summarise(avg = mean(mpg))
##        avg
## 1 20.09062
# Count number of rows in each group defined by the variables
iris %>% count(Species)
## # A tibble: 3 x 2
##   Species        n
##   <fct>      <int>
## 1 setosa        50
## 2 versicolor    50
## 3 virginica     50
iris %>% summarise_all(list(length = length)) # list function
##   Sepal.Length_length Sepal.Width_length Petal.Length_length
## 1                 150                150                 150
##   Petal.Width_length Species_length
## 1                150            150
by_species <- iris %>%
  group_by(Species)

# The _at() variants directly support strings:
starwars %>% head() %>% datatable()
starwars %>%
  summarise_at(vars("height", "mass"), mean, na.rm = TRUE)
## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1   174.  97.3
starwars %>%
  summarise_at(vars(height:mass), mean, na.rm = TRUE)
## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1   174.  97.3
starwars %>%
  summarise_if(is.numeric, mean, na.rm = TRUE)
## # A tibble: 1 x 3
##   height  mass birth_year
##    <dbl> <dbl>      <dbl>
## 1   174.  97.3       87.6
by_species %>%
  summarise_all(list(min, max)) %>% datatable()
# Note how the new variables include the function name, in order to keep things distinct. Passing purrr-style lambdas often creates better default names:

by_species %>%
  summarise_all(list(~min(.), ~max(.)))
## # A tibble: 3 x 9
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               4.3             2.3              1               0.1
## 2 versic~              4.9             2                3               1  
## 3 virgin~              4.9             2.2              4.5             1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## #   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
# When that's not good enough, you can also supply the names explicitly:
by_species %>%
  summarise_all(list(min = min, max = max)) # 更直接
## # A tibble: 3 x 9
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               4.3             2.3              1               0.1
## 2 versic~              4.9             2                3               1  
## 3 virgin~              4.9             2.2              4.5             1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## #   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
# When there's only one function in the list, it modifies existing variables in place. Give it a name to create new variables instead:
by_species %>% summarise_all(list(median))
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa              5           3.4         1.5          0.2
## 2 versicolor          5.9         2.8         4.35         1.3
## 3 virginica           6.5         3           5.55         2
by_species %>% summarise_all(list(med = median))
## # A tibble: 3 x 5
##   Species Sepal.Length_med Sepal.Width_med Petal.Length_med Petal.Width_med
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               5               3.4             1.5              0.2
## 2 versic~              5.9             2.8             4.35             1.3
## 3 virgin~              6.5             3               5.55             2
by_species %>% summarise_all(list(Q3 = quantile), probs = 0.75)
## # A tibble: 3 x 5
##   Species    Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
##   <fct>                <dbl>          <dbl>           <dbl>          <dbl>
## 1 setosa                 5.2           3.68            1.58            0.3
## 2 versicolor             6.3           3               4.6             1.5
## 3 virginica              6.9           3.18            5.88            2.3

4. 分组摘要

1. Use group_by() to create a “grouped” copy of a table.

dplyr functions will manipulate each “group” separately and then combine the results.

mtcars %>% 
  group_by(cyl) %>% 
  summarise(mpg.avg = mean(mpg))
## # A tibble: 3 x 2
##     cyl mpg.avg
##   <dbl>   <dbl>
## 1     4    26.7
## 2     6    19.7
## 3     8    15.1
g_iris <- group_by(iris, Species)
g_iris %>% glimpse()
## Observations: 150
## Variables: 5
## Groups: Species [3]
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...

5. 向量函数

1. Offsets

dplyr::lag(1:10) # Offset elements by 1
##  [1] NA  1  2  3  4  5  6  7  8  9
dplyr::lead(1:10) # Offset elements by -1
##  [1]  2  3  4  5  6  7  8  9 10 NA

2. Cumulative Aggregation

dplyr::cumall(1:10)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
dplyr::cumany(1:10)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# as the input vector.
x <- c(1, 3, 5, 2, 2)
cummean(x)
## [1] 1.00 2.00 3.00 2.75 2.60
cumsum(x) / seq_along(x)
## [1] 1.00 2.00 3.00 2.75 2.60
# `cumall()` and `cumany()` return logicals
cumall(x < 5)
## [1]  TRUE  TRUE FALSE FALSE FALSE
cumany(x == 3)
## [1] FALSE  TRUE  TRUE  TRUE  TRUE
# `cumall()` vs. `cumany()`
df <- data.frame(
  date = as.Date("2020-01-01") + 0:6,
  balance = c(100, 50, 25, -25, -50, 30, 120)
)
df
##         date balance
## 1 2020-01-01     100
## 2 2020-01-02      50
## 3 2020-01-03      25
## 4 2020-01-04     -25
## 5 2020-01-05     -50
## 6 2020-01-06      30
## 7 2020-01-07     120
# all rows after first overdraft
df %>% filter(cumany(balance < 0))
##         date balance
## 1 2020-01-04     -25
## 2 2020-01-05     -50
## 3 2020-01-06      30
## 4 2020-01-07     120
# all rows until first overdraft
df %>% filter(cumall(!(balance < 0)))
##         date balance
## 1 2020-01-01     100
## 2 2020-01-02      50
## 3 2020-01-03      25
cummax(1:10)
##  [1]  1  2  3  4  5  6  7  8  9 10
cummin(1:10)
##  [1] 1 1 1 1 1 1 1 1 1 1
cummean(1:10)
##  [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
cumsum(1:10)
##  [1]  1  3  6 10 15 21 28 36 45 55
cumprod(1:10)
##  [1]       1       2       6      24     120     720    5040   40320
##  [9]  362880 3628800

3. Rankings

x <- c(5, 1, 3, 2, 2, NA)
row_number(x)
## [1]  5  1  4  2  3 NA
min_rank(x)
## [1]  5  1  4  2  2 NA
dense_rank(x)
## [1]  4  1  3  2  2 NA
percent_rank(x)
## [1] 1.00 0.00 0.75 0.25 0.25   NA
cume_dist(x)
## [1] 1.0 0.2 0.8 0.6 0.6  NA
ntile(x, 2)
## [1]  2  1  2  1  1 NA
ntile(runif(100), 10)
##   [1]  9  5  5  7 10  5  8  2  8  9 10  2  6 10  3 10  8  1  2 10  7  2  6
##  [24]  1  6  4  9  5  6  9  7  9  6  4  9  8  3  5 10  1  7  6 10  7  6  9
##  [47]  8  3  8  7  6  1  4  3  1  7  8  5  4  7  9  1  2  2  2  2  1 10  6
##  [70]  6  3 10  3  5  8  3  7  2  8  2  5  1  4  5  4  5  4  3  9  4  9 10
##  [93]  4  7  3  1  3  1  8  4
# row_number can be used with single table verbs without specifying x
# (for data frames and databases that support windowing)
mutate(mtcars, row_number() == 1L)
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb row_number() == 1L
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4               TRUE
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4              FALSE
## 3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1              FALSE
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1              FALSE
## 5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2              FALSE
## 6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1              FALSE
## 7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4              FALSE
## 8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2              FALSE
## 9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2              FALSE
## 10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4              FALSE
## 11 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4              FALSE
## 12 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3              FALSE
## 13 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3              FALSE
## 14 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3              FALSE
## 15 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4              FALSE
## 16 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4              FALSE
## 17 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4              FALSE
## 18 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1              FALSE
## 19 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2              FALSE
## 20 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1              FALSE
## 21 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1              FALSE
## 22 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2              FALSE
## 23 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2              FALSE
## 24 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4              FALSE
## 25 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2              FALSE
## 26 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1              FALSE
## 27 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2              FALSE
## 28 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2              FALSE
## 29 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4              FALSE
## 30 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6              FALSE
## 31 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8              FALSE
## 32 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2              FALSE
mtcars %>% filter(between(row_number(), 1, 10))
##     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 4  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 5  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 6  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 7  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 8  24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 9  22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 10 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4

4. Math

100 %/%6
## [1] 16
100%%6
## [1] 4
near(2,sqrt(2)^2)
## [1] TRUE

5. Misc

map(iris,class)
## $Sepal.Length
## [1] "numeric"
## 
## $Sepal.Width
## [1] "numeric"
## 
## $Petal.Length
## [1] "numeric"
## 
## $Petal.Width
## [1] "numeric"
## 
## $Species
## [1] "factor"
iris1$Species <- as.character(iris1$Species)
iris1 %>% mutate(Species1 = case_when(
  Species == "versicolor" ~ "versi",
  Species == "virginica" ~ "virgi",
  TRUE ~ Species)) %>% datatable()

6. na_if function

na_if(1:5, 5:1) # 用NA替换任何等于y的值
## [1]  1  2 NA  4  5
x <- c(1, -1, 0, 10)
100 / x
## [1]  100 -100  Inf   10
100 / na_if(x, 0)
## [1]  100 -100   NA   10
y <- c("abc", "def", "", "ghi")
na_if(y, "")
## [1] "abc" "def" NA    "ghi"
# na_if is particularly useful inside mutate, and is meant for use with vectors rather than entire data frames
starwars %>%
  select(name, eye_color) %>%
  mutate(eye_color = na_if(eye_color, "unknown")) %>% datatable()
# na_if can also be used with scoped variants of mutate like mutate_if to mutate multiple columns
starwars %>%
  mutate_if(is.character, list(~na_if(., "unknown"))) %>% datatable()

6. Summary函数总结

1. summarise() applies summary functions to columns to create a new table.

Summary functions take vectors as input and return single values as output

mtcars %>% glimpse()
## Observations: 32
## Variables: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
mtcars %>% count(cyl,gear)
## # A tibble: 8 x 3
##     cyl  gear     n
##   <dbl> <dbl> <int>
## 1     4     3     1
## 2     4     4     8
## 3     4     5     2
## 4     6     3     2
## 5     6     4     4
## 6     6     5     1
## 7     8     3    12
## 8     8     5     2
mtcars %>% 
  group_by(cyl) %>% 
  summarise(num = n())
## # A tibble: 3 x 2
##     cyl   num
##   <dbl> <int>
## 1     4    11
## 2     6     7
## 3     8    14
mtcars %>% 
  group_by(cyl) %>% 
  summarise(n_distinct(gear))
## # A tibble: 3 x 2
##     cyl `n_distinct(gear)`
##   <dbl>              <int>
## 1     4                  3
## 2     6                  3
## 3     8                  2
sum(!is.na(c(1,2,NA))) # of non-NA’s
## [1] 2
sum(is.na(c(1:10,NA)))
## [1] 1
mtcars %>% group_by(cyl) %>% 
  summarise(first(disp))
## # A tibble: 3 x 2
##     cyl `first(disp)`
##   <dbl>         <dbl>
## 1     4           108
## 2     6           160
## 3     8           360
mtcars %>% group_by(cyl) %>% 
  summarise(last(disp))
## # A tibble: 3 x 2
##     cyl `last(disp)`
##   <dbl>        <dbl>
## 1     4          121
## 2     6          145
## 3     8          301
quantile(1:10)
##    0%   25%   50%   75%  100% 
##  1.00  3.25  5.50  7.75 10.00

7. 行名

1. Tidy data does not use rownames

Tidy data does not use rownames, which store a variable outside of the columns. To work with the rownames, first move them into a column.

a <- rownames_to_column(iris, var = "C")
a %>% datatable()
column_to_rownames(a, var = "C") %>% datatable()

8. 组合表格

1. Combine Variables

  1. Use a Mutating Join to join one table to columns from another, matching values with the rows that they correspond to. Each join retains a different combination of values from the tables.
x <- tibble::tibble(A = c('a','b','c'),
                    B = c('t','u','v'),
                    C = 1:3)

y <- tibble(A = c('a', 'b','d'),
            B = c('t','u','w'),
            D = 3:1)
x 
## # A tibble: 3 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 a     t         1
## 2 b     u         2
## 3 c     v         3
y
## # A tibble: 3 x 3
##   A     B         D
##   <chr> <chr> <int>
## 1 a     t         3
## 2 b     u         2
## 3 d     w         1
# 左连接
left_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 3 x 4
##   A     B         C     D
##   <chr> <chr> <int> <int>
## 1 a     t         1     3
## 2 b     u         2     2
## 3 c     v         3    NA
left_join(x,y,by = c("A","B"))
## # A tibble: 3 x 4
##   A     B         C     D
##   <chr> <chr> <int> <int>
## 1 a     t         1     3
## 2 b     u         2     2
## 3 c     v         3    NA
# 右连接
right_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 3 x 4
##   A     B         C     D
##   <chr> <chr> <int> <int>
## 1 a     t         1     3
## 2 b     u         2     2
## 3 d     w        NA     1
# 内连接
inner_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 2 x 4
##   A     B         C     D
##   <chr> <chr> <int> <int>
## 1 a     t         1     3
## 2 b     u         2     2
# 全连接
full_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 4 x 4
##   A     B         C     D
##   <chr> <chr> <int> <int>
## 1 a     t         1     3
## 2 b     u         2     2
## 3 c     v         3    NA
## 4 d     w        NA     1
left_join(x, y, by = "A")
## # A tibble: 3 x 5
##   A     B.x       C B.y       D
##   <chr> <chr> <int> <chr> <int>
## 1 a     t         1 t         3
## 2 b     u         2 u         2
## 3 c     v         3 <NA>     NA
right_join(x, y, by = "A")
## # A tibble: 3 x 5
##   A     B.x       C B.y       D
##   <chr> <chr> <int> <chr> <int>
## 1 a     t         1 t         3
## 2 b     u         2 u         2
## 3 d     <NA>     NA w         1
right_join(x,y,by = "B")
## # A tibble: 3 x 5
##   A.x   B         C A.y       D
##   <chr> <chr> <int> <chr> <int>
## 1 a     t         1 a         3
## 2 b     u         2 b         2
## 3 <NA>  w        NA d         1
left_join(x, y, by = c("C" = "D")) # to match on columns that have different names in each table.
## # A tibble: 3 x 5
##   A.x   B.x       C A.y   B.y  
##   <chr> <chr> <int> <chr> <chr>
## 1 a     t         1 d     w    
## 2 b     u         2 b     u    
## 3 c     v         3 a     t
# Use suffix to specify the suffix to give to unmatched columns that have the same name in both tables.
left_join(x, y, by = c("C" = "D"), suffix = c("1", "2"))
## # A tibble: 3 x 5
##   A1    B1        C A2    B2   
##   <chr> <chr> <int> <chr> <chr>
## 1 a     t         1 d     w    
## 2 b     u         2 b     u    
## 3 c     v         3 a     t

2. Combine Cases

x <- tibble(A = letters[1:3],
            B = c('t','u','v'),
            C = 1:3)
x
## # A tibble: 3 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 a     t         1
## 2 b     u         2
## 3 c     v         3
y <- tibble(A = c('c','d'),
            B = c('v','w'),
            C = 3:4)

bind_rows(x,y)
## # A tibble: 5 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 a     t         1
## 2 b     u         2
## 3 c     v         3
## 4 c     v         3
## 5 d     w         4
intersect(x,y)
## # A tibble: 1 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 c     v         3
union(x,y)
## # A tibble: 4 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 a     t         1
## 2 b     u         2
## 3 c     v         3
## 4 d     w         4
setdiff(x,y)
## # A tibble: 2 x 3
##   A     B         C
##   <chr> <chr> <int>
## 1 a     t         1
## 2 b     u         2
setequal(x,y) #  test whether two data sets contain the exact same rows (in any order)
## [1] FALSE