data.table Package

setwd(p2(.loc,'Anal/Help/Jim-Durant')); getwd()


# at home, make sure to submit file thats in R/ directory OR set working
# directory to be /anal etc in rstudio options the setwd() command screws up
# the knitR menu !!!

library(plyr)
library(data.table)
library(microbenchmark)
# because data.table is too fast, but results are in nanoseconds

data.table Package

Extension of data.frame for fast indexing, fast ordered joins, fast assignment, fast group-ing and list columns.

Author M Dowle, T Short, S Lianoglou with contributions from A Srinivasan, R Saporta

Description Enhanced data.frame. Fast indexing, fast ordered joins, fast assignment by reference, fast grouping and list columns in a short and ﬂexible syntax.
i and j may be expressions of column names directly, for faster development.

(1) Why Use data.table?

** (1) Why Use data.table?**

options(width=80)
num.obs <- 5e+04  # 100,000 observations
num.id <- num.obs/10  #each child has 10 visits


set.seed(42)


df <- df.orig <- data.frame(  
                    id = as.integer(rep(1:num.id, each = 10)),  
                    sex = gl(2, 100, labels = c('boy','girl')),   
                    visit = rep(1:10, num.id),  
                    bmi=round(rnorm(num.obs, mean = 27, sd = 5),1),  
                    tchol = as.integer(rnorm(num.obs, mean = 200, sd = 25)))

# gl is generate factor levels 

head(arrange(df,id,visit), 12)

##    id sex visit  bmi tchol
## 1   1 boy     1 33.9   213
## 2   1 boy     2 24.2   167
## 3   1 boy     3 28.8   203
## 4   1 boy     4 30.2   201
## 5   1 boy     5 29.0   190
## 6   1 boy     6 26.5   199
## 7   1 boy     7 34.6   236
## 8   1 boy     8 26.5   222
## 9   1 boy     9 37.1   196
## 10  1 boy    10 26.7   164
## 11  2 boy     1 33.5   215
## 12  2 boy     2 38.4   189

sapply(df,class)

##        id       sex     visit       bmi     tchol 
## "integer"  "factor" "integer" "numeric" "integer"

dim(df);

## [1] 50000     5

table(df$visit)

## 
##    1    2    3    4    5    6    7    8    9   10 
## 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000

print(object.size(df), units = "MB")

## 1.1 Mb


dt <- dt.orig <- data.table(df)
class(dt)

## [1] "data.table" "data.frame"


kable(head(iris), format = "markdown")

## |  Sepal.Length|  Sepal.Width|  Petal.Length|  Petal.Width|Species  |
## |-------------:|------------:|-------------:|------------:|:--------|
## |           5.1|          3.5|           1.4|          0.2|setosa   |
## |           4.9|          3.0|           1.4|          0.2|setosa   |
## |           4.7|          3.2|           1.3|          0.2|setosa   |
## |           4.6|          3.1|           1.5|          0.2|setosa   |
## |           5.0|          3.6|           1.4|          0.2|setosa   |
## |           5.4|          3.9|           1.7|          0.4|setosa   |

(2) Aggregation timing


## Summarization using ddply vs data.table 

(time.df <- (system.time( 
  df2 <- ddply(df, .(id), summarise, 
               mean.bmi = mean(bmi), 
               mean.tc = mean(tchol),
               max.tc = max(tchol))
              )
))

##    user  system elapsed 
##    1.49    0.00    1.48

head(df2)

##   id mean.bmi mean.tc max.tc
## 1  1    29.75   199.1    236
## 2  2    26.18   212.6    262
## 3  3    26.11   201.1    220
## 4  4    25.18   197.4    227
## 5  5    26.90   191.2    223
## 6  6    27.08   176.3    197



# Compare with data.table speed: 

time.dt <- (microbenchmark(
  dt2 <- dt[,list(mean.bmi = mean(bmi), 
                  mean.tc = mean(tchol),
                  max.tc = max(tchol)), 
                keyby  =  'id'], 
  times=10) 
)$time

identical(df2,data.frame(dt2))

## [1] TRUE

dt2

##         id mean.bmi mean.tc max.tc
##    1:    1    29.75   199.1    236
##    2:    2    26.18   212.6    262
##    3:    3    26.11   201.1    220
##    4:    4    25.18   197.4    227
##    5:    5    26.90   191.2    223
##   ---                             
## 4996: 4996    28.75   185.6    227
## 4997: 4997    22.64   194.2    267
## 4998: 4998    26.03   195.8    233
## 4999: 4999    26.65   207.4    251
## 5000: 5000    30.40   199.0    228

ddply takes 143 times longer than data.table

(3) Data-table operations are of the form:

DT [ i, j, by, …]

Take DT, subset rows using i, then calculate j grouped by 'by' variable

Unlike a data.frame, both i and j are evaluated within the scope of the data.table

dt[1:5, "bmi"]

## [1] "bmi"

dt[1:5, bmi]  # output is vector

## [1] 33.9 24.2 28.8 30.2 29.0


dt[1:5, list(bmi)]

##     bmi
## 1: 33.9
## 2: 24.2
## 3: 28.8
## 4: 30.2
## 5: 29.0

dt[1:5, "bmi", with = FALSE]

##     bmi
## 1: 33.9
## 2: 24.2
## 3: 28.8
## 4: 30.2
## 5: 29.0

(4) Subsetting


# use functions on subset of data.table
dt[id == 1, summary(bmi)]

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    24.2    26.6    28.9    29.8    33.0    37.1

# note the lack of [data$variable ... OR summary(data$bmi[data$id==1])

# Suppose one wants to work with only some of the variables:
dt[id == 1, list(bmi, sex)]

##      bmi sex
##  1: 33.9 boy
##  2: 24.2 boy
##  3: 28.8 boy
##  4: 30.2 boy
##  5: 29.0 boy
##  6: 26.5 boy
##  7: 34.6 boy
##  8: 26.5 boy
##  9: 37.1 boy
## 10: 26.7 boy


vars = quote(list(bmi, sex))
dt[, eval(vars)]

##         bmi  sex
##     1: 33.9  boy
##     2: 24.2  boy
##     3: 28.8  boy
##     4: 30.2  boy
##     5: 29.0  boy
##    ---          
## 49996: 33.2 girl
## 49997: 30.0 girl
## 49998: 27.8 girl
## 49999: 37.5 girl
## 50000: 35.0 girl

Subsetting: scan (data.frame method) vs J (join) for data.tables Select specific IDs from visit 6 who have cholesterol between 190 and 200 mg/dL


(des.ids = seq(501, num.id, 250))   #select every 250th ID

##  [1]  501  751 1001 1251 1501 1751 2001 2251 2501 2751 3001 3251 3501 3751 4001
## [16] 4251 4501 4751



df.scan <- microbenchmark(
   df2 <- df[df$id %in% des.ids & df$visit == 6 & 
             df$tchol >= 190 & df$tchol < 200,],
   times=10) 
df2

##         id  sex visit  bmi tchol
## 7506   751 girl     6 22.4   192
## 45006 4501  boy     6 26.9   195

round( median(df.scan$time) / 1e9 , 3) #seconds

## [1] 0.009


# note the [id== rather than the [df$id==  in data.table
dt.scan <- microbenchmark(  
  dt2 <- dt[id %in% des.ids & visit == 6 & tchol >= 190 & tchol < 200,]  
  )
dt2

##      id  sex visit  bmi tchol
## 1:  751 girl     6 22.4   192
## 2: 4501  boy     6 26.9   195

round( median(dt.scan$time) / 1e9 , 3) #seconds

## [1] 0.005


x <- median(df.scan$time) / median(dt.scan$time) #seconds

data.frame subset takes 1.9 times longer than does data.table subset

(and data.table approach is easier to read)

#### Biggest speed increase results from using keys
sapply(dt,class)

##        id       sex     visit       bmi     tchol 
## "integer"  "factor" "integer" "numeric" "integer"


setkey(dt,id,visit)  # key is id, visit
dt[J(des.ids, 6),] # J is Join

##       id visit  sex  bmi tchol
##  1:  501     6  boy 26.4   175
##  2:  751     6 girl 22.4   192
##  3: 1001     6  boy 18.9   179
##  4: 1251     6 girl 32.1   173
##  5: 1501     6  boy 42.0   134
##  6: 1751     6 girl 27.2   254
##  7: 2001     6  boy 26.6   203
##  8: 2251     6 girl 22.9   222
##  9: 2501     6  boy 26.2   187
## 10: 2751     6 girl 19.7   167
## 11: 3001     6  boy 31.8   188
## 12: 3251     6 girl 22.7   169
## 13: 3501     6  boy 21.1   182
## 14: 3751     6 girl 30.8   175
## 15: 4001     6  boy 22.3   165
## 16: 4251     6 girl 29.6   231
## 17: 4501     6  boy 26.9   195
## 18: 4751     6 girl 27.1   205

dt.join1 <- microbenchmark(
  dt.join1 <- dt[J(des.ids, 6)])
round( median(dt.join1$time) / 1e9 , 3) # about 10 times faster

## [1] 0.001


dt[J(des.ids, 6)][tchol >= 190 & tchol < 200,]

##      id visit  sex  bmi tchol
## 1:  751     6 girl 22.4   192
## 2: 4501     6  boy 26.9   195

# first select desired IDs at visit 6
# from these rows, then select desired tchol levels

x <- microbenchmark(dt[J(des.ids, 6)][tchol >= 190 & tchol < 200,])$time

# setkey(dt, id, sex, tchol) ### not sure about including
# dt[J(des.ids, 'girl', 190:210)] #this doesn't work in 1.8.11

data.table subset w/o J() takes 3.9 times longer than data.table with join

(5) Merging: data.table merge is much faster than a merge of data frames

df2 <- df[df.orig$visit==1, c('id','visit')]
df2$sbp <- runif(nrow(df2), 100, 140)
head(df2)

##    id visit   sbp
## 1   1     1 123.3
## 11  2     1 112.8
## 21  3     1 104.0
## 31  4     1 135.0
## 41  5     1 131.7
## 51  6     1 133.3


# inner merge / join 
time.df.merge <- system.time(
  df.merge <- merge(df, df2, by = c('id','visit'), all.x = FALSE, all.y = FALSE)
  )
head(arrange(df.merge, id))

##   id visit sex  bmi tchol   sbp
## 1  1     1 boy 33.9   213 123.3
## 2  2     1 boy 33.5   215 112.8
## 3  3     1 boy 25.5   214 104.0
## 4  4     1 boy 29.3   187 135.0
## 5  5     1 boy 28.0   177 131.7
## 6  6     1 boy 28.6   197 133.3


# a data.table merge - need to setkey
setkey(dt, id, visit)
dt2 <- data.table(df2, key = c('id', 'visit'))

time.dt.merge <-  microbenchmark(
    dt.merge <- merge(dt, dt2, all.x = FALSE, all.y = FALSE), 
  times=10)


x <- as.numeric(time.df.merge[3]) / median(time.dt.merge$time/1e9)

A data.frame merge takes 29.1 times longer than does a data.table merge

Merging #2



dt.merge2 <- dt[dt2, nomatch = 0] #inner / right join
# dt2[dt, nomatch = 0] # left join

dt.merge2

##         id visit  sex  bmi tchol   sbp
##    1:    1     1  boy 33.9   213 123.3
##    2:    2     1  boy 33.5   215 112.8
##    3:    3     1  boy 25.5   214 104.0
##    4:    4     1  boy 29.3   187 135.0
##    5:    5     1  boy 28.0   177 131.7
##   ---                                 
## 4996: 4996     1 girl 29.2   167 101.3
## 4997: 4997     1 girl 20.1   167 138.8
## 4998: 4998     1 girl 33.9   179 125.8
## 4999: 4999     1 girl 25.1   218 110.3
## 5000: 5000     1 girl 23.2   203 105.4

identical(dt.merge, dt.merge2)

## [1] TRUE


time.dt.merge2 <- microbenchmark(
    dt.merge2 <- dt[dt2, nomatch = 0], 
  times=10)


x2 <- median(time.dt.merge$time) / median(time.dt.merge2$time)

A merge of data.tables takes 2.8 times longer than does the dt1[dt2] approach

(6) Special characters: .SD, .SDcols, .N :=

.N is an integer specifying the number of rows in the group

.SD is a data.table containing the Subset of Data for each group
.SDcols speciﬁes the columns to be included in .SD
Very useful with lapply

:= transform

Mathew Dowle: You DO NOT need a key - setkey(dt, id) - for the by= or keyby= operations, but many online examples online include it.

keyby= sorts the output.


dt[, list(mbmi = mean(bmi), 
          q75.tchol = quantile(tchol, 0.75)), 
   by = id]

##         id  mbmi q75.tchol
##    1:    1 29.75     210.5
##    2:    2 26.18     229.2
##    3:    3 26.11     214.0
##    4:    4 25.18     209.2
##    5:    5 26.90     212.0
##   ---                     
## 4996: 4996 28.75     196.0
## 4997: 4997 22.64     211.0
## 4998: 4998 26.03     212.2
## 4999: 4999 26.65     220.2
## 5000: 5000 30.40     222.0


dt[, lapply(.SD, mean, na.rm = T), 
   by = 'sex', .SDcols = c('bmi')]

##     sex   bmi
## 1:  boy 27.01
## 2: girl 26.97

# .SDcols select the specfic columns for aggregation

dt[,c(Number = .N, 
      lapply(.SD, mean, na.rm = T)),  
   by = sex, .SDcols = 'tchol']

##     sex Number tchol
## 1:  boy  25000 199.3
## 2: girl  25000 199.4


dt[,.SD[which.min(tchol)], by = visit] # nested query within group

##     visit   id  sex  bmi tchol
##  1:     1 2060 girl 21.4   112
##  2:     2 1185  boy 21.2   113
##  3:     3  201  boy 18.4   102
##  4:     4 4718 girl 21.8   102
##  5:     5 3005  boy 20.8   106
##  6:     6 4957 girl 26.1   106
##  7:     7 2073 girl 32.2   116
##  8:     8  198 girl 27.7   119
##  9:     9 1480 girl 33.4   117
## 10:    10 1596 girl 24.2   107


# define the by group in the summary operation
dt[, list(Number = .N, mean.tc = mean(tchol)), 
     keyby = cut(bmi, breaks = c(0,25,30,35,40, max(bmi))) ]

##          cut Number mean.tc
## 1:    (0,25]  17436   199.4
## 2:   (25,30]  18977   199.5
## 3:   (30,35]  10881   199.1
## 4:   (35,40]   2478   199.3
## 5: (40,48.6]    228   200.3

:= data.table transform

dt[,high.tchol := tchol > mean(tchol), 
   by = 'id'][]

##          id  sex visit  bmi tchol high.tchol
##     1:    1  boy     1 33.9   213       TRUE
##     2:    1  boy     2 24.2   167      FALSE
##     3:    1  boy     3 28.8   203       TRUE
##     4:    1  boy     4 30.2   201       TRUE
##     5:    1  boy     5 29.0   190      FALSE
##    ---                                      
## 49996: 5000 girl     6 33.2   160      FALSE
## 49997: 5000 girl     7 30.0   190      FALSE
## 49998: 5000 girl     8 27.8   210       TRUE
## 49999: 5000 girl     9 37.5   227       TRUE
## 50000: 5000 girl    10 35.0   174      FALSE

  # adds variables to dt (rather than using <- operator)

# which visits have higher than 'normal' TC for that person?
# express BMI at each visit relative to mean BMI for that person

dt[, ':=' (high.tchol = tchol > mean(tchol), 
           relative.bmi = bmi / mean(bmi) ), 
   by = id][]

##          id  sex visit  bmi tchol high.tchol relative.bmi
##     1:    1  boy     1 33.9   213       TRUE       1.1395
##     2:    1  boy     2 24.2   167      FALSE       0.8134
##     3:    1  boy     3 28.8   203       TRUE       0.9681
##     4:    1  boy     4 30.2   201       TRUE       1.0151
##     5:    1  boy     5 29.0   190      FALSE       0.9748
##    ---                                                   
## 49996: 5000 girl     6 33.2   160      FALSE       1.0921
## 49997: 5000 girl     7 30.0   190      FALSE       0.9868
## 49998: 5000 girl     8 27.8   210       TRUE       0.9145
## 49999: 5000 girl     9 37.5   227       TRUE       1.2336
## 50000: 5000 girl    10 35.0   174      FALSE       1.1513

# ':=' is used to transform >1 column

dt[, c('relative.bmi','high.tchol') := NULL][] # drop variables

##          id  sex visit  bmi tchol
##     1:    1  boy     1 33.9   213
##     2:    1  boy     2 24.2   167
##     3:    1  boy     3 28.8   203
##     4:    1  boy     4 30.2   201
##     5:    1  boy     5 29.0   190
##    ---                           
## 49996: 5000 girl     6 33.2   160
## 49997: 5000 girl     7 30.0   190
## 49998: 5000 girl     8 27.8   210
## 49999: 5000 girl     9 37.5   227
## 50000: 5000 girl    10 35.0   174

wide vs long output


# wide vs long format
dt[, quantile(tchol), keyby = 'sex']

##      sex  V1
##  1:  boy 102
##  2:  boy 183
##  3:  boy 199
##  4:  boy 216
##  5:  boy 292
##  6: girl 102
##  7: girl 182
##  8: girl 199
##  9: girl 216
## 10: girl 295

dt[, as.list(quantile(tchol) ), keyby = 'sex']

##     sex  0% 25% 50% 75% 100%
## 1:  boy 102 183 199 216  292
## 2: girl 102 182 199 216  295


dt[sex %like% '^b' & 
     tchol %between% c(195,200) & 
     bmi %between% c(27.0, 27.2),]

##       id sex visit  bmi tchol
##  1:   41 boy     4 27.2   198
##  2:   85 boy     5 27.2   195
##  3:  161 boy     1 27.1   198
##  4:  169 boy     4 27.2   199
##  5:  189 boy    10 27.1   198
##  6:  222 boy     6 27.2   196
##  7:  381 boy     1 27.0   196
##  8:  645 boy     6 27.1   195
##  9:  703 boy     5 27.0   197
## 10:  804 boy     8 27.1   199
## 11: 1009 boy     1 27.1   195
## 12: 1088 boy     5 27.2   200
## 13: 1108 boy     6 27.2   197
## 14: 1148 boy     4 27.0   196
## 15: 1187 boy     1 27.2   196
## 16: 1241 boy     4 27.2   198
## 17: 1269 boy     2 27.0   196
## 18: 1390 boy     5 27.2   200
## 19: 1403 boy     3 27.1   199
## 20: 1409 boy     5 27.0   197
## 21: 1427 boy     4 27.2   200
## 22: 1484 boy     4 27.0   199
## 23: 1508 boy     5 27.2   200
## 24: 1585 boy    10 27.0   196
## 25: 1607 boy     8 27.1   198
## 26: 1626 boy    10 27.2   197
## 27: 1629 boy     1 27.0   198
## 28: 1670 boy     5 27.0   200
## 29: 1767 boy     1 27.2   199
## 30: 1865 boy    10 27.2   196
## 31: 1905 boy    10 27.2   198
## 32: 2009 boy     1 27.1   195
## 33: 2203 boy     5 27.1   197
## 34: 2270 boy    10 27.1   195
## 35: 2345 boy     9 27.2   199
## 36: 2424 boy     6 27.1   195
## 37: 2447 boy     3 27.1   195
## 38: 2544 boy     5 27.1   200
## 39: 2646 boy     9 27.0   198
## 40: 2667 boy     8 27.0   200
## 41: 2688 boy     3 27.0   196
## 42: 2803 boy     4 27.1   199
## 43: 3070 boy     3 27.2   196
## 44: 3144 boy    10 27.1   199
## 45: 3381 boy     7 27.0   195
## 46: 3526 boy     7 27.1   200
## 47: 3564 boy     3 27.1   197
## 48: 3609 boy     5 27.2   196
## 49: 3610 boy     9 27.1   199
## 50: 3626 boy    10 27.0   200
## 51: 3670 boy    10 27.1   195
## 52: 3727 boy    10 27.2   200
## 53: 3763 boy     3 27.1   196
## 54: 3786 boy    10 27.1   198
## 55: 3890 boy     3 27.2   198
## 56: 3950 boy     8 27.0   200
## 57: 4087 boy     9 27.1   200
## 58: 4182 boy     3 27.2   199
## 59: 4209 boy     3 27.1   198
## 60: 4362 boy     1 27.1   196
## 61: 4441 boy    10 27.1   196
## 62: 4443 boy     7 27.0   199
## 63: 4526 boy     4 27.1   195
## 64: 4581 boy    10 27.2   197
## 65: 4727 boy     4 27.0   195
## 66: 4769 boy     2 27.1   197
## 67: 4806 boy     5 27.2   197
## 68: 4841 boy     8 27.1   196
## 69: 4868 boy     4 27.0   197
## 70: 4869 boy     2 27.1   196
##       id sex visit  bmi tchol

Additional information

example(data.table)

http://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.pdf
http://cran.r-project.org/web/packages/data.table/vignettes/datatable-faq.pdf

tag

http://stackoverflow.com/questions/tagged/r+data.table

subforum

http://r.789695.n4.nabble.com/datatable-help-f2315188.html

dplyr package -

Plyr specialised for data frames – https://github.com/hadley/dplyr Can be installed from github with – devtools::install_github(“dplyr”)

installed under ubuntu, not windows

dplyr is the next iteration of plyr, focussed on tools for working with data frames (hence the d in the name). It has three main goals:

(1) Identify the most important data manipulation tools needed for data analysis and make them easy to use from R.

(2) Provide blazing fast performance for in-memory data by writing key pieces in C++.

(3) Use the same interface to work with data no matter where it's stored, whether in a data frame, a data table or database.


time.df <- system.time(x1 <- ddply(df, .(id), summarise, mean.bmi = mean(bmi), mean.tc = mean(tchol), 
    max.tc = max(tchol)))
head(x1)

##   id mean.bmi mean.tc max.tc
## 1  1    29.75   199.1    236
## 2  2    26.18   212.6    262
## 3  3    26.11   201.1    220
## 4  4    25.18   197.4    227
## 5  5    26.90   191.2    223
## 6  6    27.08   176.3    197

detach("package:plyr")

# devtools::install_github(dplyr) library(devtools)
library(dplyr)

## Error: there is no package called 'dplyr'

library(microbenchmark)

df_grp <- group_by(df, id)

## Error: could not find function "group_by"

time.df2 <- microbenchmark(x2 <- summarise(df_grp, mean.bmi = mean(bmi), mean.tc = mean(tchol), 
    max.tc = max(tchol)))$time

## Error: could not find function "summarise"

head(x2, 10)

## [1] 2.829


time.dt <- (microbenchmark(dt2 <- dt[, list(mean.bmi = mean(bmi), mean.tc = mean(tchol), 
    max.tc = max(tchol)), keyby = "id"], times = 10))$time

time.df

##    user  system elapsed 
##    1.44    0.00    1.44


time.df[3]/median(time.df2/1e+09)  # dplyr is 5000 times faster than plyr

## Error: object 'time.df2' not found

median(time.df2/1e+09)

## Error: object 'time.df2' not found


median(time.dt/1e+09)/median(time.df2/1e+09)  # dply is 35 times faster than data.table

## Error: object 'time.df2' not found