Merge data and subset

merge() function

Merge two data frames by common columns or row names, or do other versions of database join operations. merge() is a generic function whose principal method is for data frames.By default the data frames are merged on the columns with names they both have, but separate specifications of the columns can be given by by.x and by.y. The rows in the two data frames that match on the specified columns are extracted, and joined together.

generate two data frame

df1 = data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio", 
    3)))
df2 = data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 
    1)))
df1

  CustomerId Product
1          1 Toaster
2          2 Toaster
3          3 Toaster
4          4   Radio
5          5   Radio
6          6   Radio

df2

  CustomerId   State
1          2 Alabama
2          4 Alabama
3          6    Ohio

merge two data frame

Columns to merge on can be specified by name, number or by a logical vector. If all.x is true, all the non matching cases of x are appended to the result as well, with NA filled in the corresponding columns of y; analogously for all.y.

# Outer join
merge(x = df1, y = df2, by = "CustomerId", all = TRUE)

  CustomerId Product   State
1          1 Toaster    <NA>
2          2 Toaster Alabama
3          3 Toaster    <NA>
4          4   Radio Alabama
5          5   Radio    <NA>
6          6   Radio    Ohio


# Left outer
merge(x = df1, y = df2, by = "CustomerId", all.x = TRUE)

  CustomerId Product   State
1          1 Toaster    <NA>
2          2 Toaster Alabama
3          3 Toaster    <NA>
4          4   Radio Alabama
5          5   Radio    <NA>
6          6   Radio    Ohio


# Right outer
merge(x = df1, y = df2, by = "CustomerId", all.y = TRUE)

  CustomerId Product   State
1          2 Toaster Alabama
2          4   Radio Alabama
3          6   Radio    Ohio


# Cross join
merge(x = df1, y = df2, by = NULL)

   CustomerId.x Product CustomerId.y   State
1             1 Toaster            2 Alabama
2             2 Toaster            2 Alabama
3             3 Toaster            2 Alabama
4             4   Radio            2 Alabama
5             5   Radio            2 Alabama
6             6   Radio            2 Alabama
7             1 Toaster            4 Alabama
8             2 Toaster            4 Alabama
9             3 Toaster            4 Alabama
10            4   Radio            4 Alabama
11            5   Radio            4 Alabama
12            6   Radio            4 Alabama
13            1 Toaster            6    Ohio
14            2 Toaster            6    Ohio
15            3 Toaster            6    Ohio
16            4   Radio            6    Ohio
17            5   Radio            6    Ohio
18            6   Radio            6    Ohio


# default
df = merge(x = df1, y = df2)
df

  CustomerId Product   State
1          2 Toaster Alabama
2          4   Radio Alabama
3          6   Radio    Ohio

cbind() and rbind(), combine by columns or rows

# combine two data frame by colume (the shorter one is recycled)
cbind(df1, df2)

  CustomerId Product CustomerId   State
1          1 Toaster          2 Alabama
2          2 Toaster          4 Alabama
3          3 Toaster          6    Ohio
4          4   Radio          2 Alabama
5          5   Radio          4 Alabama
6          6   Radio          6    Ohio


# combine two vector by colume (the shorter one is recycled)
cbind(1:5, 6:10)

     [,1] [,2]
[1,]    1    6
[2,]    2    7
[3,]    3    8
[4,]    4    9
[5,]    5   10

cbind(1:10, c("A", "B"))

      [,1] [,2]
 [1,] "1"  "A" 
 [2,] "2"  "B" 
 [3,] "3"  "A" 
 [4,] "4"  "B" 
 [5,] "5"  "A" 
 [6,] "6"  "B" 
 [7,] "7"  "A" 
 [8,] "8"  "B" 
 [9,] "9"  "A" 
[10,] "10" "B"

cbind(1:6, 7:12, "A", c("B", "C"))

     [,1] [,2] [,3] [,4]
[1,] "1"  "7"  "A"  "B" 
[2,] "2"  "8"  "A"  "C" 
[3,] "3"  "9"  "A"  "B" 
[4,] "4"  "10" "A"  "C" 
[5,] "5"  "11" "A"  "B" 
[6,] "6"  "12" "A"  "C"


# combine two vector by row (the shorter one is recycled)
rbind(1:5, 6:10)

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    6    7    8    9   10

rbind(1:10, c("A", "B"))

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" 
[2,] "A"  "B"  "A"  "B"  "A"  "B"  "A"  "B"  "A"  "B"

rbind(1:6, 7:12, "A", c("B", "C"))

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,] "1"  "2"  "3"  "4"  "5"  "6" 
[2,] "7"  "8"  "9"  "10" "11" "12"
[3,] "A"  "A"  "A"  "A"  "A"  "A" 
[4,] "B"  "C"  "B"  "C"  "B"  "C"

subset()

# conditions
subset(df1, CustomerId > 3)

  CustomerId Product
4          4   Radio
5          5   Radio
6          6   Radio

subset(df, CustomerId > 3)

  CustomerId Product   State
2          4   Radio Alabama
3          6   Radio    Ohio

subset(df1, CustomerId > 3 & CustomerId < 6)

  CustomerId Product
4          4   Radio
5          5   Radio


# select
subset(df, CustomerId > 3, select = c(Product, State))

  Product   State
2   Radio Alabama
3   Radio    Ohio

subset(df, CustomerId > 3, select = -c(State))

  CustomerId Product
2          4   Radio
3          6   Radio


# %in%
subset(df1, subset = CustomerId %in% c(1, 3, 5))

  CustomerId Product
1          1 Toaster
3          3 Toaster
5          5   Radio


# with
df1[with(df1, CustomerId > 3), ]

  CustomerId Product
4          4   Radio
5          5   Radio
6          6   Radio

df1[with(df1, CustomerId %in% c(1, 3, 5)), ]

  CustomerId Product
1          1 Toaster
3          3 Toaster
5          5   Radio


# remove duplicated
df3 = rbind(df1, df1)
df3

   CustomerId Product
1           1 Toaster
2           2 Toaster
3           3 Toaster
4           4   Radio
5           5   Radio
6           6   Radio
7           1 Toaster
8           2 Toaster
9           3 Toaster
10          4   Radio
11          5   Radio
12          6   Radio

df3[!duplicated(df3[, c("CustomerId")]), ]

  CustomerId Product
1          1 Toaster
2          2 Toaster
3          3 Toaster
4          4   Radio
5          5   Radio
6          6   Radio

Reference one