setwd(p2(.loc,'Anal/Help/Jim-Durant')); getwd()
# at home, make sure to submit file thats in R/ directory OR set working
# directory to be /anal etc in rstudio options the setwd() command screws up
# the knitR menu !!!
library(plyr)
library(data.table)
library(microbenchmark)
# because data.table is too fast, but results are in nanoseconds
Extension of data.frame for fast indexing, fast ordered joins, fast assignment, fast group-ing and list columns.
Author M Dowle, T Short, S Lianoglou with contributions from A Srinivasan, R Saporta
Description
Enhanced data.frame. Fast indexing, fast ordered joins, fast assignment by reference, fast grouping and list columns in a short and flexible syntax.
i and j may be expressions of column names directly, for faster development.
** (1) Why Use data.table?**
options(width=80)
num.obs <- 5e+04 # 100,000 observations
num.id <- num.obs/10 #each child has 10 visits
set.seed(42)
df <- df.orig <- data.frame(
id = as.integer(rep(1:num.id, each = 10)),
sex = gl(2, 100, labels = c('boy','girl')),
visit = rep(1:10, num.id),
bmi=round(rnorm(num.obs, mean = 27, sd = 5),1),
tchol = as.integer(rnorm(num.obs, mean = 200, sd = 25)))
# gl is generate factor levels
head(arrange(df,id,visit), 12)
## id sex visit bmi tchol
## 1 1 boy 1 33.9 213
## 2 1 boy 2 24.2 167
## 3 1 boy 3 28.8 203
## 4 1 boy 4 30.2 201
## 5 1 boy 5 29.0 190
## 6 1 boy 6 26.5 199
## 7 1 boy 7 34.6 236
## 8 1 boy 8 26.5 222
## 9 1 boy 9 37.1 196
## 10 1 boy 10 26.7 164
## 11 2 boy 1 33.5 215
## 12 2 boy 2 38.4 189
sapply(df,class)
## id sex visit bmi tchol
## "integer" "factor" "integer" "numeric" "integer"
dim(df);
## [1] 50000 5
table(df$visit)
##
## 1 2 3 4 5 6 7 8 9 10
## 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
print(object.size(df), units = "MB")
## 1.1 Mb
dt <- dt.orig <- data.table(df)
class(dt)
## [1] "data.table" "data.frame"
kable(head(iris), format = "markdown")
## | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species |
## |-------------:|------------:|-------------:|------------:|:--------|
## | 5.1| 3.5| 1.4| 0.2|setosa |
## | 4.9| 3.0| 1.4| 0.2|setosa |
## | 4.7| 3.2| 1.3| 0.2|setosa |
## | 4.6| 3.1| 1.5| 0.2|setosa |
## | 5.0| 3.6| 1.4| 0.2|setosa |
## | 5.4| 3.9| 1.7| 0.4|setosa |
## Summarization using ddply vs data.table
(time.df <- (system.time(
df2 <- ddply(df, .(id), summarise,
mean.bmi = mean(bmi),
mean.tc = mean(tchol),
max.tc = max(tchol))
)
))
## user system elapsed
## 1.49 0.00 1.48
head(df2)
## id mean.bmi mean.tc max.tc
## 1 1 29.75 199.1 236
## 2 2 26.18 212.6 262
## 3 3 26.11 201.1 220
## 4 4 25.18 197.4 227
## 5 5 26.90 191.2 223
## 6 6 27.08 176.3 197
# Compare with data.table speed:
time.dt <- (microbenchmark(
dt2 <- dt[,list(mean.bmi = mean(bmi),
mean.tc = mean(tchol),
max.tc = max(tchol)),
keyby = 'id'],
times=10)
)$time
identical(df2,data.frame(dt2))
## [1] TRUE
dt2
## id mean.bmi mean.tc max.tc
## 1: 1 29.75 199.1 236
## 2: 2 26.18 212.6 262
## 3: 3 26.11 201.1 220
## 4: 4 25.18 197.4 227
## 5: 5 26.90 191.2 223
## ---
## 4996: 4996 28.75 185.6 227
## 4997: 4997 22.64 194.2 267
## 4998: 4998 26.03 195.8 233
## 4999: 4999 26.65 207.4 251
## 5000: 5000 30.40 199.0 228
dt[1:5, "bmi"]
## [1] "bmi"
dt[1:5, bmi] # output is vector
## [1] 33.9 24.2 28.8 30.2 29.0
dt[1:5, list(bmi)]
## bmi
## 1: 33.9
## 2: 24.2
## 3: 28.8
## 4: 30.2
## 5: 29.0
dt[1:5, "bmi", with = FALSE]
## bmi
## 1: 33.9
## 2: 24.2
## 3: 28.8
## 4: 30.2
## 5: 29.0
# use functions on subset of data.table
dt[id == 1, summary(bmi)]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.2 26.6 28.9 29.8 33.0 37.1
# note the lack of [data$variable ... OR summary(data$bmi[data$id==1])
# Suppose one wants to work with only some of the variables:
dt[id == 1, list(bmi, sex)]
## bmi sex
## 1: 33.9 boy
## 2: 24.2 boy
## 3: 28.8 boy
## 4: 30.2 boy
## 5: 29.0 boy
## 6: 26.5 boy
## 7: 34.6 boy
## 8: 26.5 boy
## 9: 37.1 boy
## 10: 26.7 boy
vars = quote(list(bmi, sex))
dt[, eval(vars)]
## bmi sex
## 1: 33.9 boy
## 2: 24.2 boy
## 3: 28.8 boy
## 4: 30.2 boy
## 5: 29.0 boy
## ---
## 49996: 33.2 girl
## 49997: 30.0 girl
## 49998: 27.8 girl
## 49999: 37.5 girl
## 50000: 35.0 girl
(des.ids = seq(501, num.id, 250)) #select every 250th ID
## [1] 501 751 1001 1251 1501 1751 2001 2251 2501 2751 3001 3251 3501 3751 4001
## [16] 4251 4501 4751
df.scan <- microbenchmark(
df2 <- df[df$id %in% des.ids & df$visit == 6 &
df$tchol >= 190 & df$tchol < 200,],
times=10)
df2
## id sex visit bmi tchol
## 7506 751 girl 6 22.4 192
## 45006 4501 boy 6 26.9 195
round( median(df.scan$time) / 1e9 , 3) #seconds
## [1] 0.009
# note the [id== rather than the [df$id== in data.table
dt.scan <- microbenchmark(
dt2 <- dt[id %in% des.ids & visit == 6 & tchol >= 190 & tchol < 200,]
)
dt2
## id sex visit bmi tchol
## 1: 751 girl 6 22.4 192
## 2: 4501 boy 6 26.9 195
round( median(dt.scan$time) / 1e9 , 3) #seconds
## [1] 0.005
x <- median(df.scan$time) / median(dt.scan$time) #seconds
#### Biggest speed increase results from using keys
sapply(dt,class)
## id sex visit bmi tchol
## "integer" "factor" "integer" "numeric" "integer"
setkey(dt,id,visit) # key is id, visit
dt[J(des.ids, 6),] # J is Join
## id visit sex bmi tchol
## 1: 501 6 boy 26.4 175
## 2: 751 6 girl 22.4 192
## 3: 1001 6 boy 18.9 179
## 4: 1251 6 girl 32.1 173
## 5: 1501 6 boy 42.0 134
## 6: 1751 6 girl 27.2 254
## 7: 2001 6 boy 26.6 203
## 8: 2251 6 girl 22.9 222
## 9: 2501 6 boy 26.2 187
## 10: 2751 6 girl 19.7 167
## 11: 3001 6 boy 31.8 188
## 12: 3251 6 girl 22.7 169
## 13: 3501 6 boy 21.1 182
## 14: 3751 6 girl 30.8 175
## 15: 4001 6 boy 22.3 165
## 16: 4251 6 girl 29.6 231
## 17: 4501 6 boy 26.9 195
## 18: 4751 6 girl 27.1 205
dt.join1 <- microbenchmark(
dt.join1 <- dt[J(des.ids, 6)])
round( median(dt.join1$time) / 1e9 , 3) # about 10 times faster
## [1] 0.001
dt[J(des.ids, 6)][tchol >= 190 & tchol < 200,]
## id visit sex bmi tchol
## 1: 751 6 girl 22.4 192
## 2: 4501 6 boy 26.9 195
# first select desired IDs at visit 6
# from these rows, then select desired tchol levels
x <- microbenchmark(dt[J(des.ids, 6)][tchol >= 190 & tchol < 200,])$time
# setkey(dt, id, sex, tchol) ### not sure about including
# dt[J(des.ids, 'girl', 190:210)] #this doesn't work in 1.8.11
df2 <- df[df.orig$visit==1, c('id','visit')]
df2$sbp <- runif(nrow(df2), 100, 140)
head(df2)
## id visit sbp
## 1 1 1 123.3
## 11 2 1 112.8
## 21 3 1 104.0
## 31 4 1 135.0
## 41 5 1 131.7
## 51 6 1 133.3
# inner merge / join
time.df.merge <- system.time(
df.merge <- merge(df, df2, by = c('id','visit'), all.x = FALSE, all.y = FALSE)
)
head(arrange(df.merge, id))
## id visit sex bmi tchol sbp
## 1 1 1 boy 33.9 213 123.3
## 2 2 1 boy 33.5 215 112.8
## 3 3 1 boy 25.5 214 104.0
## 4 4 1 boy 29.3 187 135.0
## 5 5 1 boy 28.0 177 131.7
## 6 6 1 boy 28.6 197 133.3
# a data.table merge - need to setkey
setkey(dt, id, visit)
dt2 <- data.table(df2, key = c('id', 'visit'))
time.dt.merge <- microbenchmark(
dt.merge <- merge(dt, dt2, all.x = FALSE, all.y = FALSE),
times=10)
x <- as.numeric(time.df.merge[3]) / median(time.dt.merge$time/1e9)
dt.merge2 <- dt[dt2, nomatch = 0] #inner / right join
# dt2[dt, nomatch = 0] # left join
dt.merge2
## id visit sex bmi tchol sbp
## 1: 1 1 boy 33.9 213 123.3
## 2: 2 1 boy 33.5 215 112.8
## 3: 3 1 boy 25.5 214 104.0
## 4: 4 1 boy 29.3 187 135.0
## 5: 5 1 boy 28.0 177 131.7
## ---
## 4996: 4996 1 girl 29.2 167 101.3
## 4997: 4997 1 girl 20.1 167 138.8
## 4998: 4998 1 girl 33.9 179 125.8
## 4999: 4999 1 girl 25.1 218 110.3
## 5000: 5000 1 girl 23.2 203 105.4
identical(dt.merge, dt.merge2)
## [1] TRUE
time.dt.merge2 <- microbenchmark(
dt.merge2 <- dt[dt2, nomatch = 0],
times=10)
x2 <- median(time.dt.merge$time) / median(time.dt.merge2$time)
.N is an integer specifying the number of rows in the group
.SD is a data.table containing the Subset of Data for each group
.SDcols specifies the columns to be included in .SD
Very useful with lapply
:= transform
dt[, list(mbmi = mean(bmi),
q75.tchol = quantile(tchol, 0.75)),
by = id]
## id mbmi q75.tchol
## 1: 1 29.75 210.5
## 2: 2 26.18 229.2
## 3: 3 26.11 214.0
## 4: 4 25.18 209.2
## 5: 5 26.90 212.0
## ---
## 4996: 4996 28.75 196.0
## 4997: 4997 22.64 211.0
## 4998: 4998 26.03 212.2
## 4999: 4999 26.65 220.2
## 5000: 5000 30.40 222.0
dt[, lapply(.SD, mean, na.rm = T),
by = 'sex', .SDcols = c('bmi')]
## sex bmi
## 1: boy 27.01
## 2: girl 26.97
# .SDcols select the specfic columns for aggregation
dt[,c(Number = .N,
lapply(.SD, mean, na.rm = T)),
by = sex, .SDcols = 'tchol']
## sex Number tchol
## 1: boy 25000 199.3
## 2: girl 25000 199.4
dt[,.SD[which.min(tchol)], by = visit] # nested query within group
## visit id sex bmi tchol
## 1: 1 2060 girl 21.4 112
## 2: 2 1185 boy 21.2 113
## 3: 3 201 boy 18.4 102
## 4: 4 4718 girl 21.8 102
## 5: 5 3005 boy 20.8 106
## 6: 6 4957 girl 26.1 106
## 7: 7 2073 girl 32.2 116
## 8: 8 198 girl 27.7 119
## 9: 9 1480 girl 33.4 117
## 10: 10 1596 girl 24.2 107
# define the by group in the summary operation
dt[, list(Number = .N, mean.tc = mean(tchol)),
keyby = cut(bmi, breaks = c(0,25,30,35,40, max(bmi))) ]
## cut Number mean.tc
## 1: (0,25] 17436 199.4
## 2: (25,30] 18977 199.5
## 3: (30,35] 10881 199.1
## 4: (35,40] 2478 199.3
## 5: (40,48.6] 228 200.3
dt[,high.tchol := tchol > mean(tchol),
by = 'id'][]
## id sex visit bmi tchol high.tchol
## 1: 1 boy 1 33.9 213 TRUE
## 2: 1 boy 2 24.2 167 FALSE
## 3: 1 boy 3 28.8 203 TRUE
## 4: 1 boy 4 30.2 201 TRUE
## 5: 1 boy 5 29.0 190 FALSE
## ---
## 49996: 5000 girl 6 33.2 160 FALSE
## 49997: 5000 girl 7 30.0 190 FALSE
## 49998: 5000 girl 8 27.8 210 TRUE
## 49999: 5000 girl 9 37.5 227 TRUE
## 50000: 5000 girl 10 35.0 174 FALSE
# adds variables to dt (rather than using <- operator)
# which visits have higher than 'normal' TC for that person?
# express BMI at each visit relative to mean BMI for that person
dt[, ':=' (high.tchol = tchol > mean(tchol),
relative.bmi = bmi / mean(bmi) ),
by = id][]
## id sex visit bmi tchol high.tchol relative.bmi
## 1: 1 boy 1 33.9 213 TRUE 1.1395
## 2: 1 boy 2 24.2 167 FALSE 0.8134
## 3: 1 boy 3 28.8 203 TRUE 0.9681
## 4: 1 boy 4 30.2 201 TRUE 1.0151
## 5: 1 boy 5 29.0 190 FALSE 0.9748
## ---
## 49996: 5000 girl 6 33.2 160 FALSE 1.0921
## 49997: 5000 girl 7 30.0 190 FALSE 0.9868
## 49998: 5000 girl 8 27.8 210 TRUE 0.9145
## 49999: 5000 girl 9 37.5 227 TRUE 1.2336
## 50000: 5000 girl 10 35.0 174 FALSE 1.1513
# ':=' is used to transform >1 column
dt[, c('relative.bmi','high.tchol') := NULL][] # drop variables
## id sex visit bmi tchol
## 1: 1 boy 1 33.9 213
## 2: 1 boy 2 24.2 167
## 3: 1 boy 3 28.8 203
## 4: 1 boy 4 30.2 201
## 5: 1 boy 5 29.0 190
## ---
## 49996: 5000 girl 6 33.2 160
## 49997: 5000 girl 7 30.0 190
## 49998: 5000 girl 8 27.8 210
## 49999: 5000 girl 9 37.5 227
## 50000: 5000 girl 10 35.0 174
# wide vs long format
dt[, quantile(tchol), keyby = 'sex']
## sex V1
## 1: boy 102
## 2: boy 183
## 3: boy 199
## 4: boy 216
## 5: boy 292
## 6: girl 102
## 7: girl 182
## 8: girl 199
## 9: girl 216
## 10: girl 295
dt[, as.list(quantile(tchol) ), keyby = 'sex']
## sex 0% 25% 50% 75% 100%
## 1: boy 102 183 199 216 292
## 2: girl 102 182 199 216 295
dt[sex %like% '^b' &
tchol %between% c(195,200) &
bmi %between% c(27.0, 27.2),]
## id sex visit bmi tchol
## 1: 41 boy 4 27.2 198
## 2: 85 boy 5 27.2 195
## 3: 161 boy 1 27.1 198
## 4: 169 boy 4 27.2 199
## 5: 189 boy 10 27.1 198
## 6: 222 boy 6 27.2 196
## 7: 381 boy 1 27.0 196
## 8: 645 boy 6 27.1 195
## 9: 703 boy 5 27.0 197
## 10: 804 boy 8 27.1 199
## 11: 1009 boy 1 27.1 195
## 12: 1088 boy 5 27.2 200
## 13: 1108 boy 6 27.2 197
## 14: 1148 boy 4 27.0 196
## 15: 1187 boy 1 27.2 196
## 16: 1241 boy 4 27.2 198
## 17: 1269 boy 2 27.0 196
## 18: 1390 boy 5 27.2 200
## 19: 1403 boy 3 27.1 199
## 20: 1409 boy 5 27.0 197
## 21: 1427 boy 4 27.2 200
## 22: 1484 boy 4 27.0 199
## 23: 1508 boy 5 27.2 200
## 24: 1585 boy 10 27.0 196
## 25: 1607 boy 8 27.1 198
## 26: 1626 boy 10 27.2 197
## 27: 1629 boy 1 27.0 198
## 28: 1670 boy 5 27.0 200
## 29: 1767 boy 1 27.2 199
## 30: 1865 boy 10 27.2 196
## 31: 1905 boy 10 27.2 198
## 32: 2009 boy 1 27.1 195
## 33: 2203 boy 5 27.1 197
## 34: 2270 boy 10 27.1 195
## 35: 2345 boy 9 27.2 199
## 36: 2424 boy 6 27.1 195
## 37: 2447 boy 3 27.1 195
## 38: 2544 boy 5 27.1 200
## 39: 2646 boy 9 27.0 198
## 40: 2667 boy 8 27.0 200
## 41: 2688 boy 3 27.0 196
## 42: 2803 boy 4 27.1 199
## 43: 3070 boy 3 27.2 196
## 44: 3144 boy 10 27.1 199
## 45: 3381 boy 7 27.0 195
## 46: 3526 boy 7 27.1 200
## 47: 3564 boy 3 27.1 197
## 48: 3609 boy 5 27.2 196
## 49: 3610 boy 9 27.1 199
## 50: 3626 boy 10 27.0 200
## 51: 3670 boy 10 27.1 195
## 52: 3727 boy 10 27.2 200
## 53: 3763 boy 3 27.1 196
## 54: 3786 boy 10 27.1 198
## 55: 3890 boy 3 27.2 198
## 56: 3950 boy 8 27.0 200
## 57: 4087 boy 9 27.1 200
## 58: 4182 boy 3 27.2 199
## 59: 4209 boy 3 27.1 198
## 60: 4362 boy 1 27.1 196
## 61: 4441 boy 10 27.1 196
## 62: 4443 boy 7 27.0 199
## 63: 4526 boy 4 27.1 195
## 64: 4581 boy 10 27.2 197
## 65: 4727 boy 4 27.0 195
## 66: 4769 boy 2 27.1 197
## 67: 4806 boy 5 27.2 197
## 68: 4841 boy 8 27.1 196
## 69: 4868 boy 4 27.0 197
## 70: 4869 boy 2 27.1 196
## id sex visit bmi tchol
example(data.table)
http://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.pdf
http://cran.r-project.org/web/packages/data.table/vignettes/datatable-faq.pdf
http://stackoverflow.com/questions/tagged/r+data.table
http://r.789695.n4.nabble.com/datatable-help-f2315188.html
Plyr specialised for data frames – https://github.com/hadley/dplyr Can be installed from github with – devtools::install_github(“dplyr”)
installed under ubuntu, not windows
dplyr is the next iteration of plyr, focussed on tools for working with data frames (hence the d in the name). It has three main goals:
(1) Identify the most important data manipulation tools needed for data analysis and make them easy to use from R.
(2) Provide blazing fast performance for in-memory data by writing key pieces in C++.
(3) Use the same interface to work with data no matter where it's stored, whether in a data frame, a data table or database.
time.df <- system.time(x1 <- ddply(df, .(id), summarise, mean.bmi = mean(bmi), mean.tc = mean(tchol),
max.tc = max(tchol)))
head(x1)
## id mean.bmi mean.tc max.tc
## 1 1 29.75 199.1 236
## 2 2 26.18 212.6 262
## 3 3 26.11 201.1 220
## 4 4 25.18 197.4 227
## 5 5 26.90 191.2 223
## 6 6 27.08 176.3 197
detach("package:plyr")
# devtools::install_github(dplyr) library(devtools)
library(dplyr)
## Error: there is no package called 'dplyr'
library(microbenchmark)
df_grp <- group_by(df, id)
## Error: could not find function "group_by"
time.df2 <- microbenchmark(x2 <- summarise(df_grp, mean.bmi = mean(bmi), mean.tc = mean(tchol),
max.tc = max(tchol)))$time
## Error: could not find function "summarise"
head(x2, 10)
## [1] 2.829
time.dt <- (microbenchmark(dt2 <- dt[, list(mean.bmi = mean(bmi), mean.tc = mean(tchol),
max.tc = max(tchol)), keyby = "id"], times = 10))$time
time.df
## user system elapsed
## 1.44 0.00 1.44
time.df[3]/median(time.df2/1e+09) # dplyr is 5000 times faster than plyr
## Error: object 'time.df2' not found
median(time.df2/1e+09)
## Error: object 'time.df2' not found
median(time.dt/1e+09)/median(time.df2/1e+09) # dply is 35 times faster than data.table
## Error: object 'time.df2' not found