Merge tutorial

small data set

“?merge”

set.seed(123) # set random seed
x <- data.frame(id = c(1,1,2,2,3,3), 
                time = c("w1","w2","w1","w2","w1","w2"), 
                weight=c(sample(1:6)))
y <- data.frame(id = c(1,1,2,2,3,3), 
                time = c("w1","w2","w1","w2","w1","w2"), 
                height=rnorm(6))
y <- y[sample(nrow(y)),]

merge(x=x, y=y, by=c("id","time"))
##   id time weight   height
## 1  1   w1      2  0.07051
## 2  1   w2      4  0.12929
## 3  2   w1      6  1.71506
## 4  2   w2      3  0.46092
## 5  3   w1      5 -1.26506
## 6  3   w2      1 -0.68685
x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5)
y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5)
merge(x, y, by = c("k1","k2")) # NA's match
##   k1 k2 data.x data.y
## 1  4  4      4      4
## 2  5  5      5      5
## 3 NA NA      2      1
merge(x, y, by = "data") # NA's match, so 6 rows
##   data k1.x k2.x k1.y k2.y
## 1    1   NA    1   NA   NA
## 2    2   NA   NA    2   NA
## 3    3    3   NA   NA    3
## 4    4    4    4    4    4
## 5    5    5    5    5    5
merge(x, y, by = "k1") # NA's match, so 6 rows
##   k1 k2.x data.x k2.y data.y
## 1  4    4      4    4      4
## 2  5    5      5    5      5
## 3 NA    1      1   NA      1
## 4 NA    1      1    3      3
## 5 NA   NA      2   NA      1
## 6 NA   NA      2    3      3
merge(x, y, by = "k2") # NA's match, so 6 rows
##   k2 k1.x data.x k1.y data.y
## 1  4    4      4    4      4
## 2  5    5      5    5      5
## 3 NA   NA      2   NA      1
## 4 NA   NA      2    2      2
## 5 NA    3      3   NA      1
## 6 NA    3      3    2      2
merge(x, y, by = c("data", "k1","k2")) # NA's match
##   data k1 k2
## 1    4  4  4
## 2    5  5  5

big data set

use data.table package

install.packages("data.table")
## Error: trying to use CRAN without setting a mirror
library(data.table)
n=1e6
DT = data.table( a=sample(1:1000,n,replace=TRUE),
                 b=sample(1:1000,n,replace=TRUE),
                 c=rnorm(n),
                 d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
                 e=rnorm(n),
                 f=1:n)

DT2 = data.table(f=n:1,  g=rnorm(n))

merge(DT, DT2, by="f")
##              f   a   b       c    d       e        g
## 1e+00:       1 656 839 -0.9460  foo -0.4253  1.28413
## 2e+00:       2 709 968 -0.9979  baz -0.8791 -0.52712
## 3e+00:       3 545 336 -0.1102  baz  2.0788 -0.64525
## 4e+00:       4 595 979 -0.4677  baz -1.0269  0.63584
## 5e+00:       5 290 358  0.8173 quux -1.9013 -1.57476
##    ---                                              
## 1e+06:  999996 399 991  0.4975  foo  0.8475 -2.03451
## 1e+06:  999997 855 654 -0.4182  foo -0.9286  0.46599
## 1e+06:  999998 581  16  1.4037  qux -0.7189 -1.20245
## 1e+06:  999999 512  53  1.1885  bar -1.7627  0.07735
## 1e+06: 1000000 870 464 -1.0909  qux -1.6692 -1.11044