This example demonstrates merging two data.table objects, aggregating some of the data (which results in a long data.table), using expand.grid to create a “balanced” long data.table, and then “reshaping” the aggregated data.table to a wide format.
A <- structure(list(Ord_num_1 = c(100, 101), Ord_date_1 = c("1/1/2013", "1/15/2013"),
Part_num = c(1001, 1002), Cust_ID = c(1111, 1111)), .Names = c("Ord_num_1",
"Ord_date_1", "Part_num", "Cust_ID"), row.names = 1:2, class = "data.frame")
B <- structure(list(Ord_num_2 = c(200, 201, 202, 203), Ord_date_2 = c("2/1/2013",
"3/15/2013", "4/18/2013", "5/1/2013"), Part_num = c(2001, 2002, 2002, 2002),
Ord_num_1 = c(100, 100, 100, 101)), .Names = c("Ord_num_2", "Ord_date_2",
"Part_num", "Ord_num_1"), row.names = c(NA, 4L), class = "data.frame")
The data look like:
A
## Ord_num_1 Ord_date_1 Part_num Cust_ID
## 1 100 1/1/2013 1001 1111
## 2 101 1/15/2013 1002 1111
B
## Ord_num_2 Ord_date_2 Part_num Ord_num_1
## 1 200 2/1/2013 2001 100
## 2 201 3/15/2013 2002 100
## 3 202 4/18/2013 2002 100
## 4 203 5/1/2013 2002 101
Convert the data.frames to data.tables. Set a key on which to merge.
library(data.table)
DTA <- data.table(A, key = "Ord_num_1")
DTB <- data.table(B, key = "Ord_num_1")
DTM <- merge(DTA, DTB)
DTM
## Ord_num_1 Ord_date_1 Part_num.x Cust_ID Ord_num_2 Ord_date_2 Part_num.y
## 1: 100 1/1/2013 1001 1111 200 2/1/2013 2001
## 2: 100 1/1/2013 1001 1111 201 3/15/2013 2002
## 3: 100 1/1/2013 1001 1111 202 4/18/2013 2002
## 4: 101 1/15/2013 1002 1111 203 5/1/2013 2002
Aggregate the data.
DTML <- DTM[, length(Part_num.x), by = c("Ord_num_1", "Ord_date_1", "Cust_ID",
"Part_num.y")]
DTML
## Ord_num_1 Ord_date_1 Cust_ID Part_num.y V1
## 1: 100 1/1/2013 1111 2001 1
## 2: 100 1/1/2013 1111 2002 2
## 3: 101 1/15/2013 1111 2002 1
These data are “unbalanced”. Use expand.grid to make it balanced.
DTML <- DTML[expand.grid(lapply(DTML, unique)[setdiff(names(DTML), "V1")])]
DTML
## Ord_num_1 Ord_date_1 Cust_ID Part_num.y V1
## 1: 100 1/1/2013 1111 2001 1
## 2: 101 1/1/2013 1111 2001 NA
## 3: 100 1/15/2013 1111 2001 NA
## 4: 101 1/15/2013 1111 2001 NA
## 5: 100 1/1/2013 1111 2002 2
## 6: 101 1/1/2013 1111 2002 NA
## 7: 100 1/15/2013 1111 2002 NA
## 8: 101 1/15/2013 1111 2002 1
Now, convert it from long to wide, and optionally, remove rows where both columns “2001” and “2002” are NA.
DTMW <- DTML[, as.list(setattr(V1, "names", Part_num.y)), by = eval(setdiff(names(DTML),
c("V1", "Part_num.y")))]
DTMW[rowSums(is.na(DTMW[, c("2001", "2002"), with = FALSE])) < 2, ]
## Ord_num_1 Ord_date_1 Cust_ID 2001 2002
## 1: 100 1/1/2013 1111 1 2
## 2: 101 1/15/2013 1111 NA 1
NA