X <- as.factor(c(1, 1, 2, 2))  # design shared by every regression: 4 obs, 2 groups
n <- 1000                      # number of regressions
a <- rlnorm(n)                 # true group-2 effect for each regression
# slightly more concise version of the simulation: each row of Y is one
# regression's response, with the group-2 cells shifted by that row's a
Y <- outer(a, X == 2, "*") + rnorm(n * 4)
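A quick sanity check on the simulated structure (each row of Y is one regression's four observations):
dim(Y)
## [1] 1000    4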
library(dplyr)
library(reshape2)
# first reshape your data into one-row-per-observation
reshaped <- Y %>%
  melt(value.name = "y") %>%  # Var1 = regression id, Var2 = observation index
  mutate(x = X[Var2])         # attach the design factor to each observation
head(reshaped)
## Var1 Var2 y x
## 1 1 1 -0.9701810 1
## 2 2 1 -2.2132788 1
## 3 3 1 0.8182787 1
## 4 4 1 -0.6899683 1
## 5 5 1 0.8602347 1
## 6 6 1 -0.1008782 1
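As an aside, if you'd rather not depend on reshape2, the same long format can be built in base R; this is just a sketch (the reshaped_base name is mine), reproducing melt's column-major layout:
# base-R sketch of the same reshape (column-major, matching melt on a matrix)
reshaped_base <- data.frame(
  Var1 = rep(seq_len(nrow(Y)), times = ncol(Y)),  # regression id (row of Y)
  Var2 = rep(seq_len(ncol(Y)), each = nrow(Y)),   # observation index (column of Y)
  y = as.vector(Y)
)
reshaped_base$x <- X[reshaped_base$Var2]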
# then fit one glm per regression id:
glms <- reshaped %>%
  group_by(Var1) %>%
  do(mod = glm(y ~ x, data = .))
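The result is a rowwise data frame with a list-column, mod, holding one fitted glm per Var1, so you can inspect any single fit the usual way:
# e.g. look at the first fitted model (mod is a list-column of glm objects)
summary(glms$mod[[1]])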
# and tidy them into coefficients with:
library(broom)
coefs <- glms %>%
  tidy(mod)
coefs
## Source: local data frame [2,000 x 6]
## Groups: Var1 [1,000]
##
## Var1 term estimate std.error statistic p.value
## (int) (chr) (dbl) (dbl) (dbl) (dbl)
## 1 1 (Intercept) -0.11815370 0.6882764 -0.1716661 0.8794983
## 2 1 x2 0.12695478 0.9733697 0.1304281 0.9081631
## 3 2 (Intercept) -0.93288088 1.1017830 -0.8467011 0.4863195
## 4 2 x2 4.10329225 1.5581564 2.6334276 0.1190007
## 5 3 (Intercept) -0.91932017 1.2485139 -0.7363316 0.5381830
## 6 3 x2 0.85663907 1.7656652 0.4851651 0.6755010
## 7 4 (Intercept) 0.28702313 0.8949300 0.3207213 0.7788319
## 8 4 x2 1.58216906 1.2656221 1.2501117 0.3377006
## 9 5 (Intercept) -0.06855478 0.6635304 -0.1033182 0.9271372
## 10 5 x2 0.43056647 0.9383737 0.4588433 0.6913861
## .. ... ... ... ... ... ...
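With 1,000 models you will probably also want a multiple-testing correction; a minimal sketch using stats::p.adjust (the p.adjusted column name is my own choice):
coefs %>%
  ungroup() %>%  # coefs is grouped by Var1; adjust across all models at once
  filter(term == "x2") %>%
  mutate(p.adjusted = p.adjust(p.value, method = "fdr"))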
The above is the unparallelized version. To parallelize it, use multidplyr. Note that the multidplyr package is still in development, as are broom's methods for working with it, so this takes a little more work. I'm also not sure it works on a GPU specifically, though it does work across multiple cores.
# first install devtools, then
# devtools::install_github("hadley/multidplyr")
library(multidplyr)
glms <- reshaped %>%
  partition(Var1) %>%                 # like group_by, but across the cluster
  do(mod = glm(y ~ x, data = .)) %>%
  collect() %>%                       # bring the results back into memory
  rowwise()                           # prep for broom's rowwise tidying method
## Initialising 3 core cluster.
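partition() spins up a default cluster for you (hence the message above). If you want to choose the number of cores yourself, the development version of multidplyr exposed create_cluster(); roughly like the following, though the API may have changed since:
# hedged sketch: explicit cluster size (development multidplyr API)
cluster <- create_cluster(4)
glms <- reshaped %>%
  partition(Var1, cluster = cluster) %>%
  do(mod = glm(y ~ x, data = .)) %>%
  collect() %>%
  rowwise()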
# then you can extract the coefficients the same way:
glms %>%
  tidy(mod)
## Source: local data frame [2,000 x 6]
## Groups: Var1 [1,000]
##
## Var1 term estimate std.error statistic p.value
## (int) (chr) (dbl) (dbl) (dbl) (dbl)
## 1 1 (Intercept) -0.11815370 0.6882764 -0.1716661 0.8794983
## 2 1 x2 0.12695478 0.9733697 0.1304281 0.9081631
## 3 2 (Intercept) -0.93288088 1.1017830 -0.8467011 0.4863195
## 4 2 x2 4.10329225 1.5581564 2.6334276 0.1190007
## 5 3 (Intercept) -0.91932017 1.2485139 -0.7363316 0.5381830
## 6 3 x2 0.85663907 1.7656652 0.4851651 0.6755010
## 7 4 (Intercept) 0.28702313 0.8949300 0.3207213 0.7788319
## 8 4 x2 1.58216906 1.2656221 1.2501117 0.3377006
## 9 5 (Intercept) -0.06855478 0.6635304 -0.1033182 0.9271372
## 10 5 x2 0.43056647 0.9383737 0.4588433 0.6913861
## .. ... ... ... ... ... ...
Having the data in this tidy format is useful for downstream analysis. For example, a p-value histogram:
library(ggplot2)
ggplot(coefs, aes(p.value)) +
  geom_histogram() +
  facet_wrap(~term)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
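That message is just ggplot2's default-binning note; passing an explicit binwidth silences it, e.g.:
ggplot(coefs, aes(p.value)) +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(~term)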
Or a volcano plot:
coefs %>%
  filter(term == "x2") %>%
  ggplot(aes(estimate, p.value)) +
  geom_point() +
  scale_y_log10()
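A common variant (not what's plotted above) puts more significant points at the top by transforming the p-value rather than log-scaling the axis:
coefs %>%
  filter(term == "x2") %>%
  ggplot(aes(estimate, -log10(p.value))) +
  geom_point()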
See my broom paper for more examples.