Load the library
library(data.table)
We’ll use the iris dataset because it is very well known and will hopefully let you focus more on the syntax and what we’re doing rather than on the data itself.
DT <- as.data.table(iris)
The default names are kind of annoying so we’ll convert them to snake_case.
iris2snake_case <- function(name) {
name <- tolower(name)
gsub("\\.", "_", name)
}
setnames(DT, iris2snake_case(names(DT)))
As a reminder of the syntax, we take our DT, filter in i, do calculations in j and group in by.
For example, we can sample 75 rows from the iris dataset, and take the mean of the sepal_length column grouped by species like this:
set.seed(99)
DT[sample(.N, 75),
.(mean_sepal_length = mean(sepal_length)),
by = species]
## species mean_sepal_length
## 1: setosa 4.975000
## 2: virginica 6.681818
## 3: versicolor 6.017241
.SD refers to Subset of Data, you can think of it as your original data.table filtered by i and grouped by by to do computations in j.
With this in mind, if we want to apply a function to every column we simply apply it to every element of .SD.
For example, to get the number of unique observations per column.
DT[, lapply(.SD, uniqueN)]
## sepal_length sepal_width petal_length petal_width species
## 1: 35 23 43 22 3
Mean of all columns by species.
DT[, lapply(.SD, mean), by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 5.006 3.428 1.462 0.246
## 2: versicolor 5.936 2.770 4.260 1.326
## 3: virginica 6.588 2.974 5.552 2.026
Column max of a sample of size 10 by species.
DT[sample(.N, 10), lapply(.SD, max), by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: virginica 6.4 3.3 6.0 2.5
## 2: versicolor 5.8 2.7 4.1 1.2
## 3: setosa 5.7 4.4 1.6 0.4
Get the first two observations by species.
DT[, .SD[1:2], by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 5.1 3.5 1.4 0.2
## 2: setosa 4.9 3.0 1.4 0.2
## 3: versicolor 7.0 3.2 4.7 1.4
## 4: versicolor 6.4 3.2 4.5 1.5
## 5: virginica 6.3 3.3 6.0 2.5
## 6: virginica 5.8 2.7 5.1 1.9
Or the safer option in case there are less than two.
DT[1:51, head(.SD, 2), by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 5.1 3.5 1.4 0.2
## 2: setosa 4.9 3.0 1.4 0.2
## 3: versicolor 7.0 3.2 4.7 1.4
Get the last two observations by species.
DT[, tail(.SD, 2), by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 5.3 3.7 1.5 0.2
## 2: setosa 5.0 3.3 1.4 0.2
## 3: versicolor 5.1 2.5 3.0 1.1
## 4: versicolor 5.7 2.8 4.1 1.3
## 5: virginica 6.2 3.4 5.4 2.3
## 6: virginica 5.9 3.0 5.1 1.8
These arent sorted, how about the top 2 sorted by sepal length in descending order?
DT[order(-sepal_length),
head(.SD, 2),
by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: virginica 7.9 3.8 6.4 2.0
## 2: virginica 7.7 3.8 6.7 2.2
## 3: versicolor 7.0 3.2 4.7 1.4
## 4: versicolor 6.9 3.1 4.9 1.5
## 5: setosa 5.8 4.0 1.2 0.2
## 6: setosa 5.7 4.4 1.5 0.4
Or by the diference between sepal length and width?
DT[order(sepal_length - sepal_width),
head(.SD, 2),
by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 4.6 3.6 1.0 0.2
## 2: setosa 5.2 4.1 1.5 0.1
## 3: versicolor 5.4 3.0 4.5 1.5
## 4: versicolor 5.2 2.7 3.9 1.4
## 5: virginica 4.9 2.5 4.5 1.7
## 6: virginica 5.6 2.8 4.9 2.0
Filter observations above the mean of sepal_length by species.
DT[,
.SD[sepal_length > mean(sepal_length)],
by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 5.1 3.5 1.4 0.2
## 2: setosa 5.4 3.9 1.7 0.4
## 3: setosa 5.4 3.7 1.5 0.2
## 4: setosa 5.8 4.0 1.2 0.2
## 5: setosa 5.7 4.4 1.5 0.4
## ---
## 64: virginica 6.7 3.1 5.6 2.4
## 65: virginica 6.9 3.1 5.1 2.3
## 66: virginica 6.8 3.2 5.9 2.3
## 67: virginica 6.7 3.3 5.7 2.5
## 68: virginica 6.7 3.0 5.2 2.3
Filter rows with group size greater than 10.
DT[,
.SD[.N > 10],
keyby = .(species, petal_width)]
## species petal_width sepal_length sepal_width petal_length
## 1: setosa 0.2 5.1 3.5 1.4
## 2: setosa 0.2 4.9 3.0 1.4
## 3: setosa 0.2 4.7 3.2 1.3
## 4: setosa 0.2 4.6 3.1 1.5
## 5: setosa 0.2 5.0 3.6 1.4
## ---
## 49: virginica 1.8 6.2 2.8 4.8
## 50: virginica 1.8 6.1 3.0 4.9
## 51: virginica 1.8 6.4 3.1 5.5
## 52: virginica 1.8 6.0 3.0 4.8
## 53: virginica 1.8 5.9 3.0 5.1
Get the row with the max petal_length by species.
DT[, .SD[which.max(petal_length)], by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 4.8 3.4 1.9 0.2
## 2: versicolor 6.0 2.7 5.1 1.6
## 3: virginica 7.7 2.6 6.9 2.3
In case there’s more than one.
DT[, .SD[petal_length == max(petal_length)], by = species]
## species sepal_length sepal_width petal_length petal_width
## 1: setosa 4.8 3.4 1.9 0.2
## 2: setosa 5.1 3.8 1.9 0.4
## 3: versicolor 6.0 2.7 5.1 1.6
## 4: virginica 7.7 2.6 6.9 2.3
Note: if speed is a concern use .I instead, which returns row indices for the condition you specify. For example:
DT_filter <- DT[, .I[petal_length == max(petal_length)], by = species]
print(DT_filter)
## species V1
## 1: setosa 25
## 2: setosa 45
## 3: versicolor 84
## 4: virginica 119
DT[DT_filter$V1]
## sepal_length sepal_width petal_length petal_width species
## 1: 4.8 3.4 1.9 0.2 setosa
## 2: 5.1 3.8 1.9 0.4 setosa
## 3: 6.0 2.7 5.1 1.6 versicolor
## 4: 7.7 2.6 6.9 2.3 virginica
Which is usually compressed to one line like this.
DT[DT[, .I[petal_length == max(petal_length)], by = species]$V1]
## sepal_length sepal_width petal_length petal_width species
## 1: 4.8 3.4 1.9 0.2 setosa
## 2: 5.1 3.8 1.9 0.4 setosa
## 3: 6.0 2.7 5.1 1.6 versicolor
## 4: 7.7 2.6 6.9 2.3 virginica
We can specify the columns we want .SD to include through .SDcols, then we can do computations only on those columns.
.SDcols is very flexible, we can use column indices.
col_idx <- grep("^sepal", names(DT))
print(col_idx)
## [1] 1 2
DT[, lapply(.SD, mean), .SDcols = col_idx]
## sepal_length sepal_width
## 1: 5.843333 3.057333
Column names.
col_names <- grep("^sepal", names(DT), value = TRUE)
print(col_names)
## [1] "sepal_length" "sepal_width"
DT[, lapply(.SD, mean), .SDcols = col_names]
## sepal_length sepal_width
## 1: 5.843333 3.057333
Or a boolean vector.
col_bool <- grepl("^sepal", names(DT))
print(col_bool)
## [1] TRUE TRUE FALSE FALSE FALSE
DT[, lapply(.SD, mean), .SDcols = col_bool]
## sepal_length sepal_width
## 1: 5.843333 3.057333
We can also remove columns from .SD by index.
col_idx <- grep("^(petal|species)", names(DT))
print(col_idx)
## [1] 3 4 5
DT[, lapply(.SD, mean), .SDcols = -col_idx]
## sepal_length sepal_width
## 1: 5.843333 3.057333
Or by name.
col_names <- grep("^(petal|species)", names(DT), value = TRUE)
print(col_names)
## [1] "petal_length" "petal_width" "species"
DT[, lapply(.SD, mean), .SDcols = -col_names]
## sepal_length sepal_width
## 1: 5.843333 3.057333
Or with a boolean vector.
col_bool <- grepl("^(petal|species)", names(DT))
print(col_bool)
## [1] FALSE FALSE TRUE TRUE TRUE
DT[, lapply(.SD, mean), .SDcols = !col_bool]
## sepal_length sepal_width
## 1: 5.843333 3.057333
.SDcols also allows ranges in column names
DT[, lapply(.SD, mean), .SDcols = sepal_length:sepal_width]
## sepal_length sepal_width
## 1: 5.843333 3.057333
And removing column ranges.
DT[, lapply(.SD, mean), .SDcols = -(petal_length:species)]
## sepal_length sepal_width
## 1: 5.843333 3.057333
Suppose you have a data.table where all the string columns are factors.
DF <- data.frame(x = sample(letters, 50, TRUE),
y = sample(letters, 50, TRUE),
z = sample(letters, 50, TRUE),
val = rnorm(50))
setDT(DF)
sapply(DF, class)
## x y z val
## "factor" "factor" "factor" "numeric"
To turn all these factor columns into strings, we can apply the as.character function to every element of .SD by specifying the factor columns in .SDcols.
factor_cols <- names(DF)[sapply(DF, is.factor)]
print(factor_cols)
## [1] "x" "y" "z"
DF[, (factor_cols) := lapply(.SD, as.character), .SDcols = factor_cols]
sapply(DF, class)
## x y z val
## "character" "character" "character" "numeric"
Get the 3rd quartile of the petal columns by species.
DT[,
lapply(.SD, quantile, 0.75),
by = species,
.SDcols = grep("^petal", names(DT))]
## species petal_length petal_width
## 1: setosa 1.575 0.3
## 2: versicolor 4.600 1.5
## 3: virginica 5.875 2.3
As you may have noticed, if we evaluate a named list in j, the elements of that list are the column values and the names of the list are the column names. So you can pretty much plug anything you like in j and if it returns a named list with equal sized elements you will get a data.table from it. For example, we can define a function that takes a data.table and a function, and returns the result of the function applied to every column as well as the name of the operation.
apply_func_and_get_names <- function(DT, func, ...) {
res <- lapply(DT, func, ...)
c(list(op = names(res[[1]])), res)
}
Now we can use it to get the 1st, 2nd and 3rd quartiles of all columns by species.
DT[, apply_func_and_get_names(.SD, quantile, c(0.25, 0.5, 0.75)), by = species]
## species op sepal_length sepal_width petal_length petal_width
## 1: setosa 25% 4.800 3.200 1.400 0.2
## 2: setosa 50% 5.000 3.400 1.500 0.2
## 3: setosa 75% 5.200 3.675 1.575 0.3
## 4: versicolor 25% 5.600 2.525 4.000 1.2
## 5: versicolor 50% 5.900 2.800 4.350 1.3
## 6: versicolor 75% 6.300 3.000 4.600 1.5
## 7: virginica 25% 6.225 2.800 5.100 1.8
## 8: virginica 50% 6.500 3.000 5.550 2.0
## 9: virginica 75% 6.900 3.175 5.875 2.3
Summary by species.
DT[, apply_func_and_get_names(.SD, summary), by = species]
## species op sepal_length sepal_width petal_length petal_width
## 1: setosa Min. 4.300 2.300 1.000 0.100
## 2: setosa 1st Qu. 4.800 3.200 1.400 0.200
## 3: setosa Median 5.000 3.400 1.500 0.200
## 4: setosa Mean 5.006 3.428 1.462 0.246
## 5: setosa 3rd Qu. 5.200 3.675 1.575 0.300
## ---
## 14: virginica 1st Qu. 6.225 2.800 5.100 1.800
## 15: virginica Median 6.500 3.000 5.550 2.000
## 16: virginica Mean 6.588 2.974 5.552 2.026
## 17: virginica 3rd Qu. 6.900 3.175 5.875 2.300
## 18: virginica Max. 7.900 3.800 6.900 2.500
We can also define a function that applies a list of functions to all columns and returns the name of the function as well.
apply_list_of_funs <- function(x, funs)
sapply(funs, function(f) f(x))
custom_summaries <- function(DT, funs) {
res <- lapply(DT, apply_list_of_funs, funs)
c(list(summary = names(funs)), res)
}
And use it to get some custom aggregations by species.
funs <- list(mean = mean,
sd = sd,
n_unique = uniqueN,
median = median,
iqr = IQR)
DT[, custom_summaries(.SD, funs), by = species]
## species summary sepal_length sepal_width petal_length petal_width
## 1: setosa mean 5.0060000 3.4280000 1.4620000 0.2460000
## 2: setosa sd 0.3524897 0.3790644 0.1736640 0.1053856
## 3: setosa n_unique 15.0000000 16.0000000 9.0000000 6.0000000
## 4: setosa median 5.0000000 3.4000000 1.5000000 0.2000000
## 5: setosa iqr 0.4000000 0.4750000 0.1750000 0.1000000
## ---
## 11: virginica mean 6.5880000 2.9740000 5.5520000 2.0260000
## 12: virginica sd 0.6358796 0.3224966 0.5518947 0.2746501
## 13: virginica n_unique 21.0000000 13.0000000 20.0000000 12.0000000
## 14: virginica median 6.5000000 3.0000000 5.5500000 2.0000000
## 15: virginica iqr 0.6750000 0.3750000 0.7750000 0.5000000
.SD doesn’t include what we specify in by, however the groups are accesible as a named list in .BY. This allows us to use the group names and .SD to do fun stuff in j.
Write a csv for each species.
DT[, fwrite(.SD, glue::glue("{.BY$species}.csv")), by = species]
Save a scatter plot of the sepals by species.
library(ggplot2)
DT[,
{
ggplot(.SD, aes(sepal_length, sepal_width)) +
geom_point();
ggsave(glue::glue("{.BY$species}.png"))
},
by = species]