Merge two data frames by common columns or row names, or do other versions of database join operations. merge() is a generic function whose principal method is for data frames.By default the data frames are merged on the columns with names they both have, but separate specifications of the columns can be given by by.x and by.y. The rows in the two data frames that match on the specified columns are extracted, and joined together.
df1 = data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio",
3)))
df2 = data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio",
1)))
df1
CustomerId Product
1 1 Toaster
2 2 Toaster
3 3 Toaster
4 4 Radio
5 5 Radio
6 6 Radio
df2
CustomerId State
1 2 Alabama
2 4 Alabama
3 6 Ohio
Columns to merge on can be specified by name, number or by a logical vector. If all.x is true, all the non matching cases of x are appended to the result as well, with NA filled in the corresponding columns of y; analogously for all.y.
# Outer join
merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
CustomerId Product State
1 1 Toaster <NA>
2 2 Toaster Alabama
3 3 Toaster <NA>
4 4 Radio Alabama
5 5 Radio <NA>
6 6 Radio Ohio
# Left outer
merge(x = df1, y = df2, by = "CustomerId", all.x = TRUE)
CustomerId Product State
1 1 Toaster <NA>
2 2 Toaster Alabama
3 3 Toaster <NA>
4 4 Radio Alabama
5 5 Radio <NA>
6 6 Radio Ohio
# Right outer
merge(x = df1, y = df2, by = "CustomerId", all.y = TRUE)
CustomerId Product State
1 2 Toaster Alabama
2 4 Radio Alabama
3 6 Radio Ohio
# Cross join
merge(x = df1, y = df2, by = NULL)
CustomerId.x Product CustomerId.y State
1 1 Toaster 2 Alabama
2 2 Toaster 2 Alabama
3 3 Toaster 2 Alabama
4 4 Radio 2 Alabama
5 5 Radio 2 Alabama
6 6 Radio 2 Alabama
7 1 Toaster 4 Alabama
8 2 Toaster 4 Alabama
9 3 Toaster 4 Alabama
10 4 Radio 4 Alabama
11 5 Radio 4 Alabama
12 6 Radio 4 Alabama
13 1 Toaster 6 Ohio
14 2 Toaster 6 Ohio
15 3 Toaster 6 Ohio
16 4 Radio 6 Ohio
17 5 Radio 6 Ohio
18 6 Radio 6 Ohio
# default
df = merge(x = df1, y = df2)
df
CustomerId Product State
1 2 Toaster Alabama
2 4 Radio Alabama
3 6 Radio Ohio
# combine two data frame by colume (the shorter one is recycled)
cbind(df1, df2)
CustomerId Product CustomerId State
1 1 Toaster 2 Alabama
2 2 Toaster 4 Alabama
3 3 Toaster 6 Ohio
4 4 Radio 2 Alabama
5 5 Radio 4 Alabama
6 6 Radio 6 Ohio
# combine two vector by colume (the shorter one is recycled)
cbind(1:5, 6:10)
[,1] [,2]
[1,] 1 6
[2,] 2 7
[3,] 3 8
[4,] 4 9
[5,] 5 10
cbind(1:10, c("A", "B"))
[,1] [,2]
[1,] "1" "A"
[2,] "2" "B"
[3,] "3" "A"
[4,] "4" "B"
[5,] "5" "A"
[6,] "6" "B"
[7,] "7" "A"
[8,] "8" "B"
[9,] "9" "A"
[10,] "10" "B"
cbind(1:6, 7:12, "A", c("B", "C"))
[,1] [,2] [,3] [,4]
[1,] "1" "7" "A" "B"
[2,] "2" "8" "A" "C"
[3,] "3" "9" "A" "B"
[4,] "4" "10" "A" "C"
[5,] "5" "11" "A" "B"
[6,] "6" "12" "A" "C"
# combine two vector by row (the shorter one is recycled)
rbind(1:5, 6:10)
[,1] [,2] [,3] [,4] [,5]
[1,] 1 2 3 4 5
[2,] 6 7 8 9 10
rbind(1:10, c("A", "B"))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
[2,] "A" "B" "A" "B" "A" "B" "A" "B" "A" "B"
rbind(1:6, 7:12, "A", c("B", "C"))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "1" "2" "3" "4" "5" "6"
[2,] "7" "8" "9" "10" "11" "12"
[3,] "A" "A" "A" "A" "A" "A"
[4,] "B" "C" "B" "C" "B" "C"
# conditions
subset(df1, CustomerId > 3)
CustomerId Product
4 4 Radio
5 5 Radio
6 6 Radio
subset(df, CustomerId > 3)
CustomerId Product State
2 4 Radio Alabama
3 6 Radio Ohio
subset(df1, CustomerId > 3 & CustomerId < 6)
CustomerId Product
4 4 Radio
5 5 Radio
# select
subset(df, CustomerId > 3, select = c(Product, State))
Product State
2 Radio Alabama
3 Radio Ohio
subset(df, CustomerId > 3, select = -c(State))
CustomerId Product
2 4 Radio
3 6 Radio
# %in%
subset(df1, subset = CustomerId %in% c(1, 3, 5))
CustomerId Product
1 1 Toaster
3 3 Toaster
5 5 Radio
# with
df1[with(df1, CustomerId > 3), ]
CustomerId Product
4 4 Radio
5 5 Radio
6 6 Radio
df1[with(df1, CustomerId %in% c(1, 3, 5)), ]
CustomerId Product
1 1 Toaster
3 3 Toaster
5 5 Radio
# remove duplicated
df3 = rbind(df1, df1)
df3
CustomerId Product
1 1 Toaster
2 2 Toaster
3 3 Toaster
4 4 Radio
5 5 Radio
6 6 Radio
7 1 Toaster
8 2 Toaster
9 3 Toaster
10 4 Radio
11 5 Radio
12 6 Radio
df3[!duplicated(df3[, c("CustomerId")]), ]
CustomerId Product
1 1 Toaster
2 2 Toaster
3 3 Toaster
4 4 Radio
5 5 Radio
6 6 Radio
Reference one