Create some random data:
# Build the demo data: 4000 time points whose 'value' trends upward with time.
time <- 1:4000
# Draw one independent noise term per row. rnorm(1000) would be silently
# recycled four times here, repeating the same noise pattern every 1000 rows.
d <- data.frame(time = time, value = 1 + time * .01 + rnorm(length(time)))
head(d)
## time value
## 1 1 1.2543709
## 2 2 0.2745092
## 3 3 2.3474733
## 4 4 -0.2412253
## 5 5 -1.2558403
## 6 6 0.9586704
Split the data into bins of size 100, meaning there will be 4000/100 = 40 bins overall:
bin.size <- 100
# This is the key part: "split" divides a data frame into groups defined by a
# grouping vector. ceiling(time / bin.size) maps times 1-100 to bin 1,
# times 101-200 to bin 2, and so on.
chunks <- split(d, ceiling(d$time / bin.size))
# chunks is now a list of 40 data frames
length(chunks)
## [1] 40
# And the first chunk looks like this:
head(chunks[[1]]) # The number of items in each chunk is 100
## time value
## 1 1 1.2543709
## 2 2 0.2745092
## 3 3 2.3474733
## 4 4 -0.2412253
## 5 5 -1.2558403
## 6 6 0.9586704
Pull a random sample for each chunk:
# For every chunk, draw a fixed-size random sample of its values.
samples <- lapply(names(chunks), function(chunk.num) {
  # Draw 30 values for this particular chunk, without replacement
  samp <- sample(chunks[[chunk.num]]$value, size = 30, replace = FALSE)
  # Return samples in a data frame with the chunk number for later reference.
  # The chunk is stored as a factor whose levels follow the order of 'chunks',
  # so chunks sort as 1, 2, 3, ... instead of alphabetically ("1", "10", "11").
  data.frame(value = samp, chunk = factor(chunk.num, levels = names(chunks)))
})
# Combine all samples into one data frame
samples <- do.call("rbind", samples)
# Now we should have 40 * 30 samples overall:
nrow(samples)
## [1] 1200
# And we can see those samples for each chunk:
head(samples)
## value chunk
## 1 0.2490261 1
## 2 1.7010287 1
## 3 0.4366815 1
## 4 1.6389478 1
## 5 -1.5585512 1
## 6 1.6756419 1
Plot the sampled data set:
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
# Scatter plot of the sampled values, with chunk on the x axis
ggplot(data = samples, mapping = aes(x = chunk, y = value)) +
  geom_point() +
  theme_bw()
And plot the original dataset to make sure it lines up with the sampled version:
# Scatter plot of the full dataset, value against time, for comparison
ggplot(data = d, mapping = aes(x = time, y = value)) +
  geom_point() +
  theme_bw()