Experiment setup

Latency

Latency was measured at the load balancer (NGINX).

library(dplyr)    # arrange(), filter() and the %>% pipe
library(stringr)  # str_count()

read.accesslog <- function(f) {
  # https://lincolnloop.com/blog/tracking-application-response-time-nginx/
  al <- read.csv(f, sep=";", colClasses=c("upstream_response_time"="character"))
  # request processing time in seconds with a milliseconds resolution;
  # time elapsed between the first bytes were read from the client and
  # the log write after the last bytes were sent to the client
  # http://nginx.org/en/docs/http/ngx_http_log_module.html.
  al$request_time <- al$request_time * 1000 # Making it milliseconds.
  # Elapsed time since the start of the experiment (more useful than the raw timestamp).
  al <- al %>% arrange(timestamp)
  al$exp_dur_ms <- c(0, al$timestamp[2:NROW(al)] - al$timestamp[1]) * 1000
  # upstream_response_time holds one comma-separated value (in seconds) per upstream hop:
  # hop1 is the first (shedding) hop and hop2 the last (processing) hop, converted to ms.
  al$hop1 <- sub(',.*$', '', al$upstream_response_time)
  al$hop1 <- as.numeric(al$hop1)*1000
  al$hop2 <- sub('^.*,', '', al$upstream_response_time)
  al$hop2 <- as.numeric(al$hop2)*1000
  al$num_hops <- str_count(al$upstream_response_time, ',')+1
  # Removing the first and last 10 seconds from the experiment results.
  return(filter(al, exp_dur_ms > 10000 & exp_dur_ms < (EXPERIMENT_DURATION-10000)))
}
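
For reference, this is how the comma-separated upstream_response_time of a request that went through two upstreams is broken down. The values below are made up for illustration; the real ones come from the access log.

x <- "0.004, 0.007"                     # one value per upstream hop, in seconds
as.numeric(sub(',.*$', '', x)) * 1000   # hop1: 4 ms (shedding instance)
as.numeric(sub('^.*,', '', x)) * 1000   # hop2: 7 ms (processing instance)
str_count(x, ',') + 1                   # num_hops: 2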

Overall Summary

print(paste("Number of requests considered (GCI On): ", NROW(al.gci)))
## [1] "Number of requests considered (GCI On):  899980"
summary(al.gci$request_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   3.175   4.000 217.000
print(paste("Number of requests considered (GCI Off): ", NROW(al.nogci)))
## [1] "Number of requests considered (GCI Off):  900566"
summary(al.nogci$request_time)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.000    2.000    3.000    3.613    4.000 1327.000
hist(log(al.gci$request_time), breaks=50, main="Overall Latency (GCI ON)", ylab = "Frequency", xlab="Latency in log scale (ms)", xlim=c(0,7))

hist(log(al.nogci$request_time), breaks=50, main="Overall Latency (GCI OFF)", ylab = "Frequency", xlab="Latency in log scale (ms)", xlim=c(0,7))

quantile(al.gci$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
##     90%     99%   99.9%  99.99% 99.999% 
##       6      16      28      42      60
quantile(al.nogci$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
##   7.0000  20.0000  33.0000 221.7740 927.9887
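
The same tail quantiles can be laid out side by side for easier comparison (same data as above, only reshaped):

p <- c(0.9, 0.99, 0.999, 0.9999, 0.99999)
data.frame(quantile = p,
           gci_on   = quantile(al.gci$request_time, p),
           gci_off  = quantile(al.nogci$request_time, p))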

Null-Hypothesis check (around the median)

# Non-parametric one-sample test which estimates the pseudo-median
# (Wilcoxon signed-rank test: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test)
wilcox.test(sample(al.gci$request_time, 50000), conf.int = TRUE, conf.level = 0.95)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sample(al.gci$request_time, 50000)
## V = 1229300000, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  2.500067 2.500090
## sample estimates:
## (pseudo)median 
##       2.500002
wilcox.test(sample(al.nogci$request_time, 50000), conf.int = TRUE, conf.level = 0.95)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  sample(al.nogci$request_time, 50000)
## V = 1232200000, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  2.999922 2.999999
## sample estimates:
## (pseudo)median 
##        2.99997
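
The tests above are run on each configuration separately. For a direct comparison between the two configurations, a two-sample Wilcoxon rank-sum (Mann-Whitney) test could be applied to the same samples; a sketch:

wilcox.test(sample(al.gci$request_time, 50000),
            sample(al.nogci$request_time, 50000),
            conf.int = TRUE, conf.level = 0.95)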

Failed Requests

failed.requests <- filter(al.gci, status != 200)
print(paste("Number of failed requests: ", NROW(failed.requests)))
## [1] "Number of failed requests:  0"
summary(failed.requests$request_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 

Requests that have not been shed

direct.requests <- filter(al.gci, num_hops==1)
print(paste("Number of requests that have not been shed: ", NROW(direct.requests), "(", (NROW(direct.requests)/NROW(al.gci))*100,"%)"))
## [1] "Number of requests that have not been shed:  892392 ( 99.1568701526701 %)"
summary(direct.requests$request_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   3.107   4.000 217.000
hist(direct.requests$request_time, breaks=50, main="Latency of requests that have not been shed", xlab="Latency (ms)")

quantile(direct.requests$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
##     90%     99%   99.9%  99.99% 99.999% 
##       6      15      26      38      55

Shed requests

These requests were shed by the first upstream service and, via NGINX's proxy_next_upstream retry (see nginx.conf in the appendix), processed by the second; they appear in the access log with two comma-separated upstream_response_time values.

Overall Stats

resent.requests <- filter(al.gci, num_hops > 1)
print(paste("Number of shed requests: ", NROW(resent.requests), "(", (NROW(resent.requests)/NROW(al.gci))*100,"%)"))
## [1] "Number of shed requests:  7588 ( 0.843129847329941 %)"
summary(resent.requests$request_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   10.00   11.16   14.00   67.00
hist(resent.requests$request_time, breaks=50, main="Latency of requests that have been shed", xlab="Latency (ms)")

quantile(resent.requests$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 20.00000 38.00000 57.41300 63.20650 66.62065

First node stats

Statistics about the time for a request to reach the first instance (upstream server), pass through its queue, and be shed.

summary(resent.requests$hop1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    3.00    4.38    6.00   38.00
hist(resent.requests$hop1, main="Latency of the first upstream service (shed)", xlab="Latency (ms)")

quantile(resent.requests$hop1, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 10.00000 20.00000 36.00000 37.24130 37.92413

Processing instance stats

After being shed by the first instance, the request is redirected. The following statistics cover only the latency at the second upstream server.

summary(resent.requests$hop2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   5.000   6.782   9.000  43.000
hist(resent.requests$hop2, main="Latency of the second upstream service (shed)", xlab="Latency (ms)")

quantile(resent.requests$hop2, c(0.9,0.99,0.999,0.9999, 0.99999))
##     90%     99%   99.9%  99.99% 99.999% 
##      14      30      40      43      43
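
Since request_time covers the whole request at the load balancer while hop1 and hop2 cover only the upstream services, their difference gives a rough estimate of the time not attributed to either upstream (forwarding and queueing at the load balancer plus client transfer); a sketch:

summary(resent.requests$request_time - (resent.requests$hop1 + resent.requests$hop2))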

Garbage Collector

# Parsing some of gctrace fields and doing some cleanup.
read.gctrace <- function(f) {
  df <- read.csv(f, sep = " ", colClasses=c("wallclock"="character", "mem"="character"))
  # Sum the per-phase wall-clock durations into a single walltime (ms).
  df$walltime <- sapply(strsplit(df$wallclock, split="+", fixed=T), function (x) sum(as.numeric(unlist(x))))
  df$time <- as.numeric(gsub("@|s","", df$time))*1000 # Making it elapsed time in ms.
  df$isforced <- ifelse(is.na(df$isforced), 0, df$isforced)
  # Memory cleaned: first minus last value of the heap-size field.
  df$cleaned <- sapply(strsplit(df$mem, split="->", fixed=T), function (x) as.numeric(x[1])-as.numeric(x[3]))
  # Removing the first and last 10 seconds from the experiment results.
  df <- filter(df, time > 10000  & time < (EXPERIMENT_DURATION-10000))
  return(na.omit(df))
}
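
As an illustration of what the parser extracts from a single trace entry (field layout assumed to follow Go's GODEBUG=gctrace=1 output; the values are made up):

wallclock <- "0.018+1.2+0.13"  # per-phase wall-clock durations (ms)
sum(as.numeric(strsplit(wallclock, split="+", fixed=T)[[1]]))  # walltime: ~1.35 ms
mem <- "362->362->0"           # heap at GC start -> heap at GC end -> live heap (MB)
parts <- as.numeric(strsplit(mem, split="->", fixed=T)[[1]])
parts[1] - parts[3]            # cleaned: 362 MB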

Overall Stats

print(paste("Total number of collections considered (GCI On):", NROW(gctrace.gci)))
## [1] "Total number of collections considered (GCI On): 889"
summary(gctrace.gci$walltime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.411   5.126   6.509   6.835   8.073  25.020
print(paste("Total number of collections considered (GCI Off):", NROW(gctrace.nogci)))
## [1] "Total number of collections considered (GCI Off): 193"
summary(gctrace.nogci$walltime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.177   5.251   7.490   8.103  10.100  31.240
hist(gctrace.gci$walltime, breaks=50, main="Garbage Collection Duration (GCI On)", xlab="GC Duration - Wall time (ms)", ylab="Frequency")

hist(gctrace.nogci$walltime, breaks=50, main="Garbage Collection Duration (GCI Off)", xlab="GC Duration - Wall time (ms)", ylab="Frequency")

quantile(gctrace.gci$walltime, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
##  9.75560 15.02000 24.11835 24.93434 25.01593
quantile(gctrace.nogci$walltime, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 12.86800 19.48832 29.88406 31.10801 31.23040

Amount of memory cleaned by the GC (MB)

summary(gctrace.gci$cleaned)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   362.0   362.0   350.3   362.0   543.0
summary(gctrace.nogci$cleaned)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2    1671    1671    1566    1672    1675
hist(gctrace.gci$cleaned, breaks=50, main="Amount of garbage cleaned at each collection (GCI On)", xlab="Amount of garbage cleaned (MB)")

hist(gctrace.nogci$cleaned, breaks=50, main="Amount of garbage cleaned at each collection (GCI Off)", xlab="Amount of garbage cleaned (MB)")

quantile(gctrace.gci$cleaned, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 362.0000 362.0000 383.1600 527.0160 541.4016
quantile(gctrace.nogci$cleaned, c(0.9,0.99,0.999,0.9999, 0.99999))
##     90%     99%   99.9%  99.99% 99.999% 
##    1673    1675    1675    1675    1675
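
As a quick sanity check, the total amount of memory reclaimed over the run should be roughly comparable in both configurations; with GCI on it is simply spread over many more, smaller collections:

sum(gctrace.gci$cleaned)    # ~889 collections of ~350 MB on average
sum(gctrace.nogci$cleaned)  # ~193 collections of ~1566 MB on average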

CPU Usage

read.cpu <- function(f) {
  df <- read.csv(f)
  # Removing the first and last 10 samples from the experiment results.
  return(na.omit(df[11:(NROW(df)-10),]))
}
summary(cpu.gci$cpu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    27.0   107.0   113.0   114.2   122.0   142.0
summary(cpu.nogci$cpu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    35.0   113.0   120.4   119.8   127.0   174.3
hist((cpu.gci$cpu/200)*100, breaks=50, main="CPU Usage (GCI On)", xlab="CPU Usage (%)", xlim = c(0, 100))

hist((cpu.nogci$cpu/200)*100, breaks=50, main="CPU Usage (GCI Off)", xlab="CPU Usage (%)", xlim = c(0, 100))

quantile(cpu.gci$cpu, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 129.0000 139.6880 141.2860 141.9286 141.9929
quantile(cpu.nogci$cpu, c(0.9,0.99,0.999,0.9999, 0.99999))
##      90%      99%    99.9%   99.99%  99.999% 
## 134.0000 142.0000 156.2726 172.5333 174.1593
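
The raw cpu column appears to be on a 0-200% scale (utilisation summed over two cores, hence the /200 in the histograms above); the same tail quantiles on the normalised 0-100% scale:

p <- c(0.9, 0.99, 0.999, 0.9999, 0.99999)
quantile((cpu.gci$cpu/200)*100, p)
quantile((cpu.nogci$cpu/200)*100, p)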

Null-Hypothesis check (around the median)

# Non-parametric one-sample test which estimates the pseudo-median
# (Wilcoxon signed-rank test: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test)
wilcox.test(cpu.gci$cpu, conf.int = TRUE, conf.level = 0.95)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  cpu.gci$cpu
## V = 255970, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  113.4999 115.0000
## sample estimates:
## (pseudo)median 
##         114.45
wilcox.test(cpu.nogci$cpu, conf.int = TRUE, conf.level = 0.95)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  cpu.nogci$cpu
## V = 255260, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  119.5 121.0
## sample estimates:
## (pseudo)median 
##            120

Appendix

Commands used to execute the experiment

Note: Using fake IP addresses.

$ INSTANCES="1.1.1.1 1.1.1.2" ./update-msgpush.sh
$ ROUND_START=1 ROUND_END=5 EXPERIMENT_DURATION=80s SUFFIX=_3K_300KB_5K THROUGHPUT=3000 MSG_SIZE=361200 WINDOW_SIZE=5000 USE_GCI=true INSTANCES="1.1.1.1 1.1.1.2" LB=1.1.1.5 ./run-msgpush.sh
$ ROUND_START=1 ROUND_END=5 EXPERIMENT_DURATION=80s SUFFIX=_3K_300KB_5K THROUGHPUT=3000 MSG_SIZE=361200 WINDOW_SIZE=5000 USE_GCI=false INSTANCES="1.1.1.1 1.1.1.2" LB=1.1.1.5 ./run-msgpush.sh

nginx.conf

Note: Using fake IP addresses.

user www-data;
worker_processes auto;
pid /run/nginx.pid;

events {
        worker_connections 768;
        # multi_accept on;
}

http {
        upstream backends {
            server 1.1.1.1:3000 max_fails=0 fail_timeout=15s;
            server 1.1.1.2:3000 max_fails=0 fail_timeout=15s;
            #server 1.1.1.3:3000 max_fails=0 fail_timeout=15s;
            #server 1.1.1.4:3000 max_fails=0 fail_timeout=15s;
        }

        server {
            listen 80;

            location / {
                proxy_pass http://backends;
                proxy_next_upstream  timeout error http_503;
                proxy_next_upstream_tries 0;
                proxy_read_timeout 5m;
            }
        }

        ##
        # Logging Settings
        ##
        log_format exp '$msec;$status;$request_time;$upstream_response_time';
        access_log /var/log/nginx/access.log exp;
        error_log /var/log/nginx/error.log;

        ##
        # Virtual Host Configs
        ##

        #include /etc/nginx/conf.d/*.conf;
        #include /etc/nginx/sites-enabled/*;
}