# simulate the example data: two conditions, with per-row effect sizes a
X <- as.factor(c(1, 1, 2, 2))
n <- 1000
a <- rlnorm(n)

# slightly more concise version of the simulation: effect `a` applies only
# in the columns where X == 2, plus Gaussian noise
Y <- outer(a, X == 2, "*") + rnorm(n * 4)
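
As a quick sanity check (my addition, not part of the original code), Y should have one row per regression and one column per sample:

dim(Y)   # expect 1000 rows (regressions) by 4 columns (samples)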

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)

# first reshape your data into one-row-per-observation (long) format
reshaped <- Y %>%
  melt(value.name = "y") %>%
  mutate(x = X[Var2])   # Var2 indexes the column, i.e. the sample

head(reshaped)
##   Var1 Var2          y x
## 1    1    1 -0.9701810 1
## 2    2    1 -2.2132788 1
## 3    3    1  0.8182787 1
## 4    4    1 -0.6899683 1
## 5    5    1  0.8602347 1
## 6    6    1 -0.1008782 1
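
As a check that the indexing in mutate() lined up correctly (again my addition), samples 1-2 should map to condition 1 and samples 3-4 to condition 2:

table(reshaped$Var2, reshaped$x)
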
# then fit one glm per row of the original matrix:
glms <- reshaped %>%
  group_by(Var1) %>%
  do(mod = glm(y ~ x, data = .))
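
The result has a list-column of fitted models, so you can also pull out and inspect any single fit (a quick illustration I'm adding here):

# `mod` is a list-column; each element is an ordinary glm object
summary(glms$mod[[1]])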

# and tidy them into coefficients with:
library(broom)
coefs <- glms %>%
  tidy(mod)

coefs
## Source: local data frame [2,000 x 6]
## Groups: Var1 [1,000]
## 
##     Var1        term    estimate std.error  statistic   p.value
##    (int)       (chr)       (dbl)     (dbl)      (dbl)     (dbl)
## 1      1 (Intercept) -0.11815370 0.6882764 -0.1716661 0.8794983
## 2      1          x2  0.12695478 0.9733697  0.1304281 0.9081631
## 3      2 (Intercept) -0.93288088 1.1017830 -0.8467011 0.4863195
## 4      2          x2  4.10329225 1.5581564  2.6334276 0.1190007
## 5      3 (Intercept) -0.91932017 1.2485139 -0.7363316 0.5381830
## 6      3          x2  0.85663907 1.7656652  0.4851651 0.6755010
## 7      4 (Intercept)  0.28702313 0.8949300  0.3207213 0.7788319
## 8      4          x2  1.58216906 1.2656221  1.2501117 0.3377006
## 9      5 (Intercept) -0.06855478 0.6635304 -0.1033182 0.9271372
## 10     5          x2  0.43056647 0.9383737  0.4588433 0.6913861
## ..   ...         ...         ...       ...        ...       ...
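
Since each row of coefs is a separate hypothesis test, a natural follow-up (my addition, using base R's p.adjust rather than anything broom-specific) is to correct the x2 p-values for multiple testing:

coefs %>%
  ungroup() %>%                # drop the per-Var1 grouping before adjusting
  filter(term == "x2") %>%
  mutate(p.adjusted = p.adjust(p.value, method = "fdr")) %>%
  arrange(p.adjusted)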

The above is the unparallelized version. To parallelize it, use multidplyr. Note that the multidplyr package is still in development, as are broom's methods for working with it, so this takes a little more work. Nor am I sure it works on a GPU specifically, though it does work across multiple cores.

# first install devtools, then
# devtools::install_github("hadley/multidplyr")

library(multidplyr)

glms <- reshaped %>%
  partition(Var1) %>%                  # like group_by, but spread across the cluster
  do(mod = glm(y ~ x, data = .)) %>%
  collect() %>%                        # bring the results back into memory
  rowwise()                            # prep for broom's rowwise tidying method
## Initialising 3 core cluster.
# then you can get the coefficients the same way:
glms %>%
  tidy(mod)
## Source: local data frame [2,000 x 6]
## Groups: Var1 [1,000]
## 
##     Var1        term    estimate std.error  statistic   p.value
##    (int)       (chr)       (dbl)     (dbl)      (dbl)     (dbl)
## 1      1 (Intercept) -0.11815370 0.6882764 -0.1716661 0.8794983
## 2      1          x2  0.12695478 0.9733697  0.1304281 0.9081631
## 3      2 (Intercept) -0.93288088 1.1017830 -0.8467011 0.4863195
## 4      2          x2  4.10329225 1.5581564  2.6334276 0.1190007
## 5      3 (Intercept) -0.91932017 1.2485139 -0.7363316 0.5381830
## 6      3          x2  0.85663907 1.7656652  0.4851651 0.6755010
## 7      4 (Intercept)  0.28702313 0.8949300  0.3207213 0.7788319
## 8      4          x2  1.58216906 1.2656221  1.2501117 0.3377006
## 9      5 (Intercept) -0.06855478 0.6635304 -0.1033182 0.9271372
## 10     5          x2  0.43056647 0.9383737  0.4588433 0.6913861
## ..   ...         ...         ...       ...        ...       ...
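
Whether partitioning actually helps depends on your machine and on the per-group cost; with fits as cheap as these, cluster overhead can dominate. A simple check (my addition) is to time both pipelines:

# compare wall-clock time of the serial and partitioned versions
system.time(reshaped %>% group_by(Var1) %>% do(mod = glm(y ~ x, data = .)))
system.time(reshaped %>% partition(Var1) %>% do(mod = glm(y ~ x, data = .)) %>% collect())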

Having the data in this format is useful. For example, you can plot a histogram of the p-values across all 1,000 regressions:

library(ggplot2)

ggplot(coefs, aes(p.value)) +
  geom_histogram() +
  facet_wrap(~term)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Or a volcano plot of the x2 effects:

coefs %>%
  filter(term == "x2") %>%
  ggplot(aes(estimate, p.value)) +
  geom_point() +
  scale_y_log10()
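
To put a number on what the volcano plot shows (an illustrative addition on my part), you can count how many x2 effects clear a nominal threshold:

coefs %>%
  ungroup() %>%
  filter(term == "x2") %>%
  summarize(n_below_05 = sum(p.value < .05))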

See my broom paper for more examples.