knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
iris %>% glimpse()
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...
iris %>% dplyr::filter(Sepal.Length > 7) %>%
datatable()
dplyr::distinct(iris, Species) %>%
datatable()
dplyr::sample_frac(iris, 0.5) %>%
datatable()
iris1 <- iris %>%
mutate(id = seq(150)) %>%
select(id,everything())
sample_n(iris1, 10) %>%
datatable()
slice(iris1,10:15) %>%
datatable()
iris1 %>% top_n(10,Sepal.Length) %>%
datatable()
iris1 %>%
arrange(desc(Sepal.Length)) %>%
slice(1:10) %>%
datatable()
Logical and boolean operators to use with filter()< <= is.na() %in% | xor() > >= !is.na() ! &
mpg %>% datatable()
mpg %>% filter(displ < 2) %>%
datatable()
mpg %>% mutate(id = c(1:232,NA,NA)) %>%
filter(is.na(id)) %>%
datatable()
mpg %>%
filter(drv == "f"|drv == "4") %>%
datatable()
Order rows by values of a column or columns (low to high), use with desc() to order from high to low.
mtcars %>% arrange(mpg) %>%
datatable()
mtcars %>% arrange(desc(mpg)) %>%
datatable()
faithful %>% head()
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
faithful %>% add_case(eruptions = 1,waiting = 1) %>% tail() %>% datatable()
pull: Extract column values as a vector.Choose by name or index. However,Select: Extract columns as a table.
dplyr::pull(iris1,Sepal.Length) %>% str()
## num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
iris1 %>% select(Sepal.Length) %>% str()
## 'data.frame': 150 obs. of 1 variable:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
starts_with(): Starts with a prefix.
ends_with(): Ends with a suffix.
contains(): Contains a literal string.
matches(): Matches a regular expression.
num_range(): Matches a numerical range like x01, x02, x03.
one_of(): Matches variable names in a character vector.
everything(): Matches all variables.
last_col(): Select last variable, possibly with an offset.
names(iris1)
## [1] "id" "Sepal.Length" "Sepal.Width" "Petal.Length"
## [5] "Petal.Width" "Species"
iris1 %>% select(contains("Se")) %>% head() %>% datatable()
iris1 %>% select(ends_with("th")) %>% head() %>% datatable()
iris1 %>% select(starts_with("Se")) %>% head() %>% datatable()
iris1 %>% select(id:Sepal.Length) %>% head() %>% datatable()
iris1 %>% select(matches(".t.")) %>% head %>% datatable()
# Compute new column(s)
mutate(mtcars,gpm = 1/mpg) %>% head() %>% datatable()
# Apply funs to every column.
mutate_all(faithful, funs(log(.),log2(.))) %>% head() %>% datatable()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
mutate_if(iris,is.numeric,list(log = log)) %>% head() %>% datatable()
# Apply funs to specific columns.
iris %>% mutate_at(vars(-Species),list(log = log)) %>% head() %>% datatable()
add_column(mtcars, new = 1:32) %>% head() %>% datatable()
rename(iris, Length = Sepal.Length) %>% head() %>% datatable()
These apply summary functions to columns to create a new table of summarise statistics. Summarise functions take vectors as input and return one value (see back).
mtcars %>% summarise(avg = mean(mpg))
## avg
## 1 20.09062
# Count number of rows in each group defined by the variables
iris %>% count(Species)
## # A tibble: 3 x 2
## Species n
## <fct> <int>
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
iris %>% summarise_all(list(length = length)) # list function
## Sepal.Length_length Sepal.Width_length Petal.Length_length
## 1 150 150 150
## Petal.Width_length Species_length
## 1 150 150
by_species <- iris %>%
group_by(Species)
# The _at() variants directly support strings:
starwars %>% head() %>% datatable()
starwars %>%
summarise_at(vars("height", "mass"), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174. 97.3
starwars %>%
summarise_at(vars(height:mass), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174. 97.3
starwars %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
## # A tibble: 1 x 3
## height mass birth_year
## <dbl> <dbl> <dbl>
## 1 174. 97.3 87.6
by_species %>%
summarise_all(list(min, max)) %>% datatable()
# Note how the new variables include the function name, in order to keep things distinct. Passing purrr-style lambdas often creates better default names:
by_species %>%
summarise_all(list(~min(.), ~max(.)))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 4.3 2.3 1 0.1
## 2 versic~ 4.9 2 3 1
## 3 virgin~ 4.9 2.2 4.5 1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## # Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
# When that's not good enough, you can also supply the names explicitly:
by_species %>%
summarise_all(list(min = min, max = max)) # 更直接
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 4.3 2.3 1 0.1
## 2 versic~ 4.9 2 3 1
## 3 virgin~ 4.9 2.2 4.5 1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## # Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
# When there's only one function in the list, it modifies existing variables in place. Give it a name to create new variables instead:
by_species %>% summarise_all(list(median))
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5 3.4 1.5 0.2
## 2 versicolor 5.9 2.8 4.35 1.3
## 3 virginica 6.5 3 5.55 2
by_species %>% summarise_all(list(med = median))
## # A tibble: 3 x 5
## Species Sepal.Length_med Sepal.Width_med Petal.Length_med Petal.Width_med
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5 3.4 1.5 0.2
## 2 versic~ 5.9 2.8 4.35 1.3
## 3 virgin~ 6.5 3 5.55 2
by_species %>% summarise_all(list(Q3 = quantile), probs = 0.75)
## # A tibble: 3 x 5
## Species Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.2 3.68 1.58 0.3
## 2 versicolor 6.3 3 4.6 1.5
## 3 virginica 6.9 3.18 5.88 2.3
group_by() to create a “grouped” copy of a table.dplyr functions will manipulate each “group” separately and then combine the results.
mtcars %>%
group_by(cyl) %>%
summarise(mpg.avg = mean(mpg))
## # A tibble: 3 x 2
## cyl mpg.avg
## <dbl> <dbl>
## 1 4 26.7
## 2 6 19.7
## 3 8 15.1
g_iris <- group_by(iris, Species)
g_iris %>% glimpse()
## Observations: 150
## Variables: 5
## Groups: Species [3]
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, s...
dplyr::lag(1:10) # Offset elements by 1
## [1] NA 1 2 3 4 5 6 7 8 9
dplyr::lead(1:10) # Offset elements by -1
## [1] 2 3 4 5 6 7 8 9 10 NA
dplyr::cumall(1:10)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
dplyr::cumany(1:10)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# as the input vector.
x <- c(1, 3, 5, 2, 2)
cummean(x)
## [1] 1.00 2.00 3.00 2.75 2.60
cumsum(x) / seq_along(x)
## [1] 1.00 2.00 3.00 2.75 2.60
# `cumall()` and `cumany()` return logicals
cumall(x < 5)
## [1] TRUE TRUE FALSE FALSE FALSE
cumany(x == 3)
## [1] FALSE TRUE TRUE TRUE TRUE
# `cumall()` vs. `cumany()`
df <- data.frame(
date = as.Date("2020-01-01") + 0:6,
balance = c(100, 50, 25, -25, -50, 30, 120)
)
df
## date balance
## 1 2020-01-01 100
## 2 2020-01-02 50
## 3 2020-01-03 25
## 4 2020-01-04 -25
## 5 2020-01-05 -50
## 6 2020-01-06 30
## 7 2020-01-07 120
# all rows after first overdraft
df %>% filter(cumany(balance < 0))
## date balance
## 1 2020-01-04 -25
## 2 2020-01-05 -50
## 3 2020-01-06 30
## 4 2020-01-07 120
# all rows until first overdraft
df %>% filter(cumall(!(balance < 0)))
## date balance
## 1 2020-01-01 100
## 2 2020-01-02 50
## 3 2020-01-03 25
cummax(1:10)
## [1] 1 2 3 4 5 6 7 8 9 10
cummin(1:10)
## [1] 1 1 1 1 1 1 1 1 1 1
cummean(1:10)
## [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
cumsum(1:10)
## [1] 1 3 6 10 15 21 28 36 45 55
cumprod(1:10)
## [1] 1 2 6 24 120 720 5040 40320
## [9] 362880 3628800
x <- c(5, 1, 3, 2, 2, NA)
row_number(x)
## [1] 5 1 4 2 3 NA
min_rank(x)
## [1] 5 1 4 2 2 NA
dense_rank(x)
## [1] 4 1 3 2 2 NA
percent_rank(x)
## [1] 1.00 0.00 0.75 0.25 0.25 NA
cume_dist(x)
## [1] 1.0 0.2 0.8 0.6 0.6 NA
ntile(x, 2)
## [1] 2 1 2 1 1 NA
ntile(runif(100), 10)
## [1] 9 5 5 7 10 5 8 2 8 9 10 2 6 10 3 10 8 1 2 10 7 2 6
## [24] 1 6 4 9 5 6 9 7 9 6 4 9 8 3 5 10 1 7 6 10 7 6 9
## [47] 8 3 8 7 6 1 4 3 1 7 8 5 4 7 9 1 2 2 2 2 1 10 6
## [70] 6 3 10 3 5 8 3 7 2 8 2 5 1 4 5 4 5 4 3 9 4 9 10
## [93] 4 7 3 1 3 1 8 4
# row_number can be used with single table verbs without specifying x
# (for data frames and databases that support windowing)
mutate(mtcars, row_number() == 1L)
## mpg cyl disp hp drat wt qsec vs am gear carb row_number() == 1L
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 TRUE
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 FALSE
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 FALSE
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 FALSE
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 FALSE
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 FALSE
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 FALSE
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 FALSE
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 FALSE
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 FALSE
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 FALSE
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 FALSE
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 FALSE
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 FALSE
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 FALSE
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 FALSE
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 FALSE
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 FALSE
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 FALSE
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 FALSE
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 FALSE
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 FALSE
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 FALSE
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 FALSE
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 FALSE
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 FALSE
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 FALSE
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 FALSE
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 FALSE
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 FALSE
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 FALSE
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 FALSE
mtcars %>% filter(between(row_number(), 1, 10))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
100 %/%6
## [1] 16
100%%6
## [1] 4
near(2,sqrt(2)^2)
## [1] TRUE
map(iris,class)
## $Sepal.Length
## [1] "numeric"
##
## $Sepal.Width
## [1] "numeric"
##
## $Petal.Length
## [1] "numeric"
##
## $Petal.Width
## [1] "numeric"
##
## $Species
## [1] "factor"
iris1$Species <- as.character(iris1$Species)
iris1 %>% mutate(Species1 = case_when(
Species == "versicolor" ~ "versi",
Species == "virginica" ~ "virgi",
TRUE ~ Species)) %>% datatable()
na_if(1:5, 5:1) # 用NA替换任何等于y的值
## [1] 1 2 NA 4 5
x <- c(1, -1, 0, 10)
100 / x
## [1] 100 -100 Inf 10
100 / na_if(x, 0)
## [1] 100 -100 NA 10
y <- c("abc", "def", "", "ghi")
na_if(y, "")
## [1] "abc" "def" NA "ghi"
# na_if is particularly useful inside mutate, and is meant for use with vectors rather than entire data frames
starwars %>%
select(name, eye_color) %>%
mutate(eye_color = na_if(eye_color, "unknown")) %>% datatable()
# na_if can also be used with scoped variants of mutate like mutate_if to mutate multiple columns
starwars %>%
mutate_if(is.character, list(~na_if(., "unknown"))) %>% datatable()
summarise() applies summary functions to columns to create a new table.Summary functions take vectors as input and return single values as output
mtcars %>% glimpse()
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
mtcars %>% count(cyl,gear)
## # A tibble: 8 x 3
## cyl gear n
## <dbl> <dbl> <int>
## 1 4 3 1
## 2 4 4 8
## 3 4 5 2
## 4 6 3 2
## 5 6 4 4
## 6 6 5 1
## 7 8 3 12
## 8 8 5 2
mtcars %>%
group_by(cyl) %>%
summarise(num = n())
## # A tibble: 3 x 2
## cyl num
## <dbl> <int>
## 1 4 11
## 2 6 7
## 3 8 14
mtcars %>%
group_by(cyl) %>%
summarise(n_distinct(gear))
## # A tibble: 3 x 2
## cyl `n_distinct(gear)`
## <dbl> <int>
## 1 4 3
## 2 6 3
## 3 8 2
sum(!is.na(c(1,2,NA))) # of non-NA’s
## [1] 2
sum(is.na(c(1:10,NA)))
## [1] 1
mtcars %>% group_by(cyl) %>%
summarise(first(disp))
## # A tibble: 3 x 2
## cyl `first(disp)`
## <dbl> <dbl>
## 1 4 108
## 2 6 160
## 3 8 360
mtcars %>% group_by(cyl) %>%
summarise(last(disp))
## # A tibble: 3 x 2
## cyl `last(disp)`
## <dbl> <dbl>
## 1 4 121
## 2 6 145
## 3 8 301
quantile(1:10)
## 0% 25% 50% 75% 100%
## 1.00 3.25 5.50 7.75 10.00
Tidy data does not use rownames, which store a variable outside of the columns. To work with the rownames, first move them into a column.
a <- rownames_to_column(iris, var = "C")
a %>% datatable()
column_to_rownames(a, var = "C") %>% datatable()
Mutating Join to join one table to columns from another, matching values with the rows that they correspond to. Each join retains a different combination of values from the tables.x <- tibble::tibble(A = c('a','b','c'),
B = c('t','u','v'),
C = 1:3)
y <- tibble(A = c('a', 'b','d'),
B = c('t','u','w'),
D = 3:1)
x
## # A tibble: 3 x 3
## A B C
## <chr> <chr> <int>
## 1 a t 1
## 2 b u 2
## 3 c v 3
y
## # A tibble: 3 x 3
## A B D
## <chr> <chr> <int>
## 1 a t 3
## 2 b u 2
## 3 d w 1
# 左连接
left_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 3 x 4
## A B C D
## <chr> <chr> <int> <int>
## 1 a t 1 3
## 2 b u 2 2
## 3 c v 3 NA
left_join(x,y,by = c("A","B"))
## # A tibble: 3 x 4
## A B C D
## <chr> <chr> <int> <int>
## 1 a t 1 3
## 2 b u 2 2
## 3 c v 3 NA
# 右连接
right_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 3 x 4
## A B C D
## <chr> <chr> <int> <int>
## 1 a t 1 3
## 2 b u 2 2
## 3 d w NA 1
# 内连接
inner_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 2 x 4
## A B C D
## <chr> <chr> <int> <int>
## 1 a t 1 3
## 2 b u 2 2
# 全连接
full_join(x,y)
## Joining, by = c("A", "B")
## # A tibble: 4 x 4
## A B C D
## <chr> <chr> <int> <int>
## 1 a t 1 3
## 2 b u 2 2
## 3 c v 3 NA
## 4 d w NA 1
left_join(x, y, by = "A")
## # A tibble: 3 x 5
## A B.x C B.y D
## <chr> <chr> <int> <chr> <int>
## 1 a t 1 t 3
## 2 b u 2 u 2
## 3 c v 3 <NA> NA
right_join(x, y, by = "A")
## # A tibble: 3 x 5
## A B.x C B.y D
## <chr> <chr> <int> <chr> <int>
## 1 a t 1 t 3
## 2 b u 2 u 2
## 3 d <NA> NA w 1
right_join(x,y,by = "B")
## # A tibble: 3 x 5
## A.x B C A.y D
## <chr> <chr> <int> <chr> <int>
## 1 a t 1 a 3
## 2 b u 2 b 2
## 3 <NA> w NA d 1
left_join(x, y, by = c("C" = "D")) # to match on columns that have different names in each table.
## # A tibble: 3 x 5
## A.x B.x C A.y B.y
## <chr> <chr> <int> <chr> <chr>
## 1 a t 1 d w
## 2 b u 2 b u
## 3 c v 3 a t
# Use suffix to specify the suffix to give to unmatched columns that have the same name in both tables.
left_join(x, y, by = c("C" = "D"), suffix = c("1", "2"))
## # A tibble: 3 x 5
## A1 B1 C A2 B2
## <chr> <chr> <int> <chr> <chr>
## 1 a t 1 d w
## 2 b u 2 b u
## 3 c v 3 a t
x <- tibble(A = letters[1:3],
B = c('t','u','v'),
C = 1:3)
x
## # A tibble: 3 x 3
## A B C
## <chr> <chr> <int>
## 1 a t 1
## 2 b u 2
## 3 c v 3
y <- tibble(A = c('c','d'),
B = c('v','w'),
C = 3:4)
bind_rows(x,y)
## # A tibble: 5 x 3
## A B C
## <chr> <chr> <int>
## 1 a t 1
## 2 b u 2
## 3 c v 3
## 4 c v 3
## 5 d w 4
intersect(x,y)
## # A tibble: 1 x 3
## A B C
## <chr> <chr> <int>
## 1 c v 3
union(x,y)
## # A tibble: 4 x 3
## A B C
## <chr> <chr> <int>
## 1 a t 1
## 2 b u 2
## 3 c v 3
## 4 d w 4
setdiff(x,y)
## # A tibble: 2 x 3
## A B C
## <chr> <chr> <int>
## 1 a t 1
## 2 b u 2
setequal(x,y) # test whether two data sets contain the exact same rows (in any order)
## [1] FALSE