Celery + RabbitMQ is one of the proposed solutions to parallelize intelligence web features. We tested the solution on daily estimation machines with connections to production databases to verify the performance of the solution for country breakdown.
Briefly, this test consists of three parts:
We also union all 155+ queries of country breakdown into one big SQL.
BRIEFLY, the performance of Celery + RabbitMQ is BAD and we CANNOT apply it to parallelize web features since:
| web backend | rabbitmq | celery worker | database |
|---|---|---|---|
| create group tasks (1) ==> | | | |
| | tasks are sent to queue (2) ==> | | |
| | | get task_1 (3) ==> | |
| | | send SQL_1 of task_1 to db (4) ==> | |
| | | | execute SQL_1 (5) |
| | | <== get result of task_1 (6) | |
| | <== result of task_1 (7) | get task_2 ==> | |
| <== result of task_1 (8) | | send SQL_2 of task_2 to db ==> | |
| | | | execute SQL_2 |
This Analysis consists of two parts:
# Load the plotting library and the merged benchmark log.
library(ggplot2)
df <- read.csv("/Users/robin/Works/logs/rabbitmq/production/merge2.csv")

# Convert the run-configuration columns to factors so that they are
# treated as categories (not numbers) when subsetting and plotting.
df[c("worker", "type", "threads", "machine")] <-
  lapply(df[c("worker", "type", "threads", "machine")], factor)
Let’s first get basic numbers on the performance of the current implementation.
# Rows whose `type` is "baseline", followed by a per-column summary.
dfb <- with(df, df[type == "baseline", ])
summary(dfb)
## X device app_id start_day
## Min. : 0 ios:964 Min. :4.44e+08 2014-05-01:483
## 1st Qu.:241 1st Qu.:5.29e+08 2014-06-01:481
## Median :482 Median :5.54e+08
## Mean :482 Mean :5.66e+08
## 3rd Qu.:722 3rd Qu.:6.08e+08
## Max. :963 Max. :6.72e+08
##
## end_day categories feeds top_n
## 2014-05-01:161 [-1] :322 (1000, 1001, 1002):964 Min. :1000
## 2014-05-07:162 [36] :319 1st Qu.:1000
## 2014-05-31:160 [6014]:323 Median :1000
## 2014-06-01:161 Mean :1000
## 2014-06-07:159 3rd Qu.:1000
## 2014-06-30:161 Max. :1000
##
## begin time threads worker
## 2014-08-07 00:37:35.550283: 3 Min. :1.07 1 :162 0 :964
## 2014-08-07 00:38:00.618467: 3 1st Qu.:1.17 2 :162 16: 0
## 2014-08-07 00:38:00.622933: 2 Median :1.29 4 :160 32: 0
## 2014-08-07 00:28:12.117471: 1 Mean :1.46 8 :160 40: 0
## 2014-08-07 00:28:13.375356: 1 3rd Qu.:1.59 16:160
## 2014-08-07 00:28:14.473507: 1 Max. :3.87 32:160
## (Other) :953
## type machine
## baseline :964 1:964
## rabbmitmq+celery: 0 2: 0
## union : 0
##
##
##
##
# Distribution of `time` for the baseline rows, coloured by thread count:
# first as overlaid semi-transparent densities, then as one violin per
# thread count.
ggplot(data = dfb, mapping = aes(x = time, fill = threads)) +
  geom_density(alpha = 0.5)
ggplot(data = dfb, mapping = aes(x = threads, y = time, fill = threads)) +
  geom_violin()
From the above two images, we know:
Let’s use the best celery + rabbitmq configuration, two machines and 40 workers, to get its performance numbers.
# Celery rows recorded with machine == 2 and worker == 40, followed by a
# per-column summary. The `type` level is spelled exactly as in the data.
dfcm <- with(
  df,
  df[type == "rabbmitmq+celery" & machine == 2 & worker == 40, ]
)
summary(dfcm)
## X device app_id start_day
## Min. :4984 ios:804 Min. :4.44e+08 2014-05-01:401
## 1st Qu.:5185 1st Qu.:5.29e+08 2014-06-01:403
## Median :5386 Median :5.54e+08
## Mean :5386 Mean :5.66e+08
## 3rd Qu.:5586 3rd Qu.:6.08e+08
## Max. :5787 Max. :6.72e+08
##
## end_day categories feeds top_n
## 2014-05-01:133 [-1] :269 (1000, 1001, 1002):804 Min. :1000
## 2014-05-07:135 [36] :267 1st Qu.:1000
## 2014-05-31:133 [6014]:268 Median :1000
## 2014-06-01:135 Mean :1000
## 2014-06-07:133 3rd Qu.:1000
## 2014-06-30:135 Max. :1000
##
## begin time threads worker
## 2014-08-10 22:15:37.044490: 1 Min. : 1.11 1 :162 0 : 0
## 2014-08-10 22:15:38.888636: 1 1st Qu.: 1.56 2 :162 16: 0
## 2014-08-10 22:15:40.153113: 1 Median : 2.42 4 :160 32: 0
## 2014-08-10 22:15:41.407925: 1 Mean : 4.66 8 :160 40:804
## 2014-08-10 22:15:42.656058: 1 3rd Qu.: 4.70 16:160
## 2014-08-10 22:15:43.963749: 1 Max. :47.37 32: 0
## (Other) :798
## type machine
## baseline : 0 1: 0
## rabbmitmq+celery:804 2:804
## union : 0
##
##
##
##
Overall
# Overall distribution of `time` in dfcm, coloured by thread count:
# overlaid densities, then one violin per thread count.
ggplot(data = dfcm, mapping = aes(x = time, fill = threads)) +
  geom_density(alpha = 0.5)
ggplot(data = dfcm, mapping = aes(x = threads, y = time, fill = threads)) +
  geom_violin()

# Quartiles of `time` for the single-thread rows.
quantile(dfcm$time[dfcm$threads == 1])
## 0% 25% 50% 75% 100%
## 1.109 1.195 1.238 1.276 1.915
Let’s see the details.
# Violin plots of `time` per thread count, progressively including higher
# thread counts: first without 4/8/16, then without 8/16, then without 16,
# and finally all rows of dfcm.
ggplot(
  with(dfcm, dfcm[threads != 4 & threads != 8 & threads != 16, ]),
  aes(x = threads, y = time, fill = threads)
) +
  geom_violin()
ggplot(
  with(dfcm, dfcm[threads != 8 & threads != 16, ]),
  aes(x = threads, y = time, fill = threads)
) +
  geom_violin()
ggplot(
  with(dfcm, dfcm[threads != 16, ]),
  aes(x = threads, y = time, fill = threads)
) +
  geom_violin()
ggplot(dfcm, aes(x = threads, y = time, fill = threads)) +
  geom_violin()

# Print the quartiles of `time` for each thread count.
for (n_threads in c(1, 2, 4, 8, 16)) {
  print(paste("Quantile of # Threads:", n_threads))
  print(quantile(dfcm$time[dfcm$threads == n_threads]))
}
## [1] "Quantile of # Threads: 1"
## 0% 25% 50% 75% 100%
## 1.109 1.195 1.238 1.276 1.915
## [1] "Quantile of # Threads: 2"
## 0% 25% 50% 75% 100%
## 1.330 1.559 1.632 1.685 1.960
## [1] "Quantile of # Threads: 4"
## 0% 25% 50% 75% 100%
## 1.758 2.305 2.432 2.611 20.785
## [1] "Quantile of # Threads: 8"
## 0% 25% 50% 75% 100%
## 1.769 4.089 4.362 4.589 5.891
## [1] "Quantile of # Threads: 16"
## 0% 25% 50% 75% 100%
## 4.064 7.239 8.519 14.924 47.370
Time delay
# Compare celery + rabbitmq against the baseline at equal thread counts.
#
# df_celery keeps the machine == 2 rows whose worker is neither 16 nor 32.
# NOTE(review): there is no explicit `type` filter here; the summaries in
# this report show baseline and union rows only under machine 1, so this
# should contain celery rows only -- confirm if the data changes.
df_celery <- df[df$machine == 2 & df$worker != 16 & df$worker != 32, ]
# Baseline rows, dropping the 32-thread level.
df_baseline <- df[df$type == "baseline" & df$threads != 32, ]
df_delay <- rbind(df_celery, df_baseline)

# Scatter plots of `time` per thread count, coloured by type.
# `position` is a layer argument, not an aesthetic: it has to be passed to
# geom_point() itself. Inside aes() it is not applied and the points are
# drawn without jitter.
ggplot(df_delay[df_delay$threads == 1, ], aes(threads, time, fill = type)) +
  geom_point(aes(color = type, alpha = type), position = "jitter")
ggplot(df_delay, aes(threads, time, fill = type)) +
  geom_point(aes(color = type, alpha = type), position = "jitter")
ggplot(df_delay, aes(threads, time, fill = type)) +
  geom_violin(aes(color = type))

# Mean `time` of celery minus mean `time` of baseline, per thread count.
for (n in c(1, 2, 4, 8, 16)) {
  at_n <- df_delay$threads == n
  celery_time <- df_delay$time[at_n & df_delay$type == "rabbmitmq+celery"]
  baseline_time <- df_delay$time[at_n & df_delay$type == "baseline"]
  print(mean(celery_time) - mean(baseline_time))
}
## [1] -0.02525
## [1] 0.3342
## [1] 2.149
## [1] 2.922
## [1] 11.24
# Rows whose `type` is "union", followed by a per-column summary.
dfu <- with(df, df[type == "union", ])
summary(dfu)
## X device app_id start_day
## Min. : 0 ios:2416 Min. :4.44e+08 2014-05-01:1210
## 1st Qu.: 604 1st Qu.:5.29e+08 2014-06-01:1206
## Median :1208 Median :5.54e+08
## Mean :1208 Mean :5.66e+08
## 3rd Qu.:1811 3rd Qu.:6.08e+08
## Max. :2415 Max. :6.72e+08
##
## end_day categories feeds top_n
## 2014-05-01:402 [-1] :802 (1000, 1001, 1002):2416 Min. :1000
## 2014-05-07:404 [36] :806 1st Qu.:1000
## 2014-05-31:404 [6014]:808 Median :1000
## 2014-06-01:402 Mean :1000
## 2014-06-07:403 3rd Qu.:1000
## 2014-06-30:401 Max. :1000
##
## begin time threads worker
## 2014-08-11 00:15:55.503037: 2 Min. :0.810 1 :486 0 :2416
## 2014-08-11 00:01:07.065419: 1 1st Qu.:0.948 2 :486 16: 0
## 2014-08-11 00:01:08.371085: 1 Median :1.014 4 :484 32: 0
## 2014-08-11 00:01:09.365466: 1 Mean :1.052 8 :480 40: 0
## 2014-08-11 00:01:10.495604: 1 3rd Qu.:1.087 16:480
## 2014-08-11 00:01:11.621157: 1 Max. :3.022 32: 0
## (Other) :2409
## type machine
## baseline : 0 1:2416
## rabbmitmq+celery: 0 2: 0
## union :2416
##
##
##
##
# Distribution of `time` for the union rows, coloured by thread count:
# overlaid (opaque) densities, then one violin per thread count.
ggplot(data = dfu, mapping = aes(x = time, fill = threads)) +
  geom_density()
ggplot(data = dfu, mapping = aes(x = threads, y = time, fill = threads)) +
  geom_violin()
# All celery rows (every worker/machine combination), followed by a
# per-column summary. The `type` level is spelled exactly as in the data.
dfc <- with(df, df[type == "rabbmitmq+celery", ])
summary(dfc)
## X device app_id start_day
## Min. : 964 ios:4824 Min. :4.44e+08 2014-05-01:2403
## 1st Qu.:2170 1st Qu.:5.29e+08 2014-06-01:2421
## Median :3376 Median :5.54e+08
## Mean :3376 Mean :5.66e+08
## 3rd Qu.:4581 3rd Qu.:6.08e+08
## Max. :5787 Max. :6.72e+08
##
## end_day categories feeds top_n
## 2014-05-01:799 [-1] :1604 (1000, 1001, 1002):4824 Min. :1000
## 2014-05-07:804 [36] :1612 1st Qu.:1000
## 2014-05-31:800 [6014]:1608 Median :1000
## 2014-06-01:809 Mean :1000
## 2014-06-07:805 3rd Qu.:1000
## 2014-06-30:807 Max. :1000
##
## begin time threads worker
## 2014-08-08 00:25:43.503002: 2 Min. : 1.06 1 :972 0 : 0
## 2014-08-08 00:25:43.510293: 2 1st Qu.: 1.70 2 :972 16:1608
## 2014-08-08 01:58:29.410540: 2 Median : 3.93 4 :960 32:1608
## 2014-08-08 01:58:29.440075: 2 Mean : 10.66 8 :960 40:1608
## 2014-08-08 01:58:29.453845: 2 3rd Qu.: 8.77 16:960
## 2014-08-08 02:37:51.661312: 2 Max. :122.97 32: 0
## (Other) :4812
## type machine
## baseline : 0 1:2412
## rabbmitmq+celery:4824 2:2412
## union : 0
##
##
##
##
# Box plots of `time` per thread count for the celery rows, faceted by
# configuration. A geom_violin() variant of each plot is kept commented
# out directly above it.

# machine == 1 only, one facet row per worker count.
# ggplot(dfc[dfc$machine==1,], aes(threads, time, fill=threads)) + geom_violin() + facet_grid(worker ~ .)
ggplot(
  with(dfc, dfc[machine == 1, ]),
  aes(x = threads, y = time, fill = threads)
) +
  geom_boxplot() +
  facet_grid(worker ~ .)

# worker == 40 only, one facet row per machine value.
# ggplot(dfc[dfc$worker==40,], aes(threads, time, fill=threads)) + geom_violin() + facet_grid(machine ~ .)
ggplot(
  with(dfc, dfc[worker == 40, ]),
  aes(x = threads, y = time, fill = threads)
) +
  geom_boxplot() +
  facet_grid(machine ~ .)

# All celery rows: machine values as facet rows, worker counts as columns.
# ggplot(dfc, aes(threads, time, fill=threads)) + geom_violin() + facet_grid(machine ~ worker)
ggplot(dfc, aes(x = threads, y = time, fill = threads)) +
  geom_boxplot() +
  facet_grid(machine ~ worker)