Our task is to read 10 delimited text files (written with `write.table`), stitch them together, and plot the results, with a variable that keeps track of the original data file. Below we compare a few approaches, roughly in chronological order.
# Run the examples in a temporary directory so the generated files stay out of
# the project folder.
tmp <- tempdir()
# `root.dir` is a knitr package option, so it has to be set through `opts_knit`;
# set through `opts_chunk` it does not change where chunks are evaluated.
knitr::opts_knit$set(root.dir = tmp)
# Write one example data file named "<id>.txt" in the working directory.
#
# The table has a random number of rows (between 10 and 100): `x` is the row
# index and `y` is one period of a sine wave shifted upwards by `id`.
#
# id: value used both as the vertical offset of `y` and as the file name stem.
# Called for its side effect (the file on disk).
dummy <- function(id) {
  N <- sample(10:100, 1)
  # Spell out `length.out` instead of relying on partial matching of `length`.
  y <- sin(seq(0, 2 * pi, length.out = N)) + id
  write.table(
    data.frame(x = seq_len(N), y = y),
    file = paste0(id, ".txt"),
    row.names = FALSE
  )
}
# Create the ten example files (1.txt ... 10.txt); only the side effect matters.
invisible(lapply(seq_len(10), dummy))
# Collect the names of every .txt file in the working directory and show them.
lf <- list.files(pattern = "\\.txt$")
print(lf)
## [1] "1.txt" "10.txt" "2.txt" "3.txt" "4.txt" "5.txt" "6.txt"
## [8] "7.txt" "8.txt" "9.txt"
(note the order 1, 10, 2, ...)
# Approach 1: preallocate a list, fill it in an explicit loop, then row-bind.
ld <- vector("list", length(lf))
for (i in seq_along(lf)) {
  # Each file's rows are tagged with the file's position in `lf`
  # (the loop index), not with its name.
  ld[[i]] <- cbind(
    read.table(lf[[i]], header = TRUE),
    .id = as.character(i)
  )
}
m1 <- do.call(rbind, ld)
str(m1)
## 'data.frame': 482 obs. of 3 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 1 1.2 1.39 1.57 1.72 ...
## $ .id: Factor w/ 10 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
Disadvantages:
# Approach 2: read and tag each file inside lapply(), then row-bind the pieces.
m2 <- do.call(
  rbind,
  lapply(lf, function(f) {
    # The identifier is the file name with its ".txt" part removed.
    cbind(read.table(f, header = TRUE), .id = gsub("\\.txt", "", f))
  })
)
str(m2)
## 'data.frame': 482 obs. of 3 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 1 1.2 1.39 1.57 1.72 ...
## $ .id: Factor w/ 10 levels "1","10","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
Disadvantages:
# Approach 3: melt the list of data frames; reshape2 records the list position
# of each piece in a column named `L1`.
m3 <- reshape2::melt(
  lapply(lf, read.table, header = TRUE),
  id.vars = c("x", "y")
)
# Keep a character copy of that index under the common `.id` name.
m3[[".id"]] <- as.character(m3[["L1"]])
str(m3)
## 'data.frame': 482 obs. of 4 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 1 1.2 1.39 1.57 1.72 ...
## $ L1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ .id: chr "1" "1" "1" "1" ...
Disadvantages:
- `id.vars` and `L1` are hard-coded
# Approach 4: name the elements of `lf` after the files (without ".txt") so that
# ldply() can use those names to fill the `.id` column.
names(lf) <- gsub("\\.txt", "", lf)
m4 <- plyr::ldply(lf, read.table, header = TRUE)
str(m4)
## 'data.frame': 482 obs. of 3 variables:
## $ .id: chr "1" "1" "1" "1" ...
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 1 1.2 1.39 1.57 1.72 ...
Disadvantages:
- `.id` variable not created
# Approach 5: map over the (named) file vector and row-bind the results; the
# element names are stored in the column given by `.id`.
m5 <- purrr::map_df(lf, read.table, header = TRUE, .id = ".id")
str(m5)
## 'data.frame': 482 obs. of 3 variables:
## $ .id: chr "1" "1" "1" "1" ...
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 1 1.2 1.39 1.57 1.72 ...
(combining the data.frames and melting to long format for plotting)
The five approaches yield essentially identical results, but note the differences in the ordering of the `.id` variable.