Latency was measured at the load balancer (NGINX).
library(dplyr)
library(stringr)

read.accesslog <- function(f) {
  # https://lincolnloop.com/blog/tracking-application-response-time-nginx/
  al <- read.csv(f, sep=";", colClasses=c("upstream_response_time"="character"))
  # request_time is the request processing time in seconds with millisecond
  # resolution: the time elapsed between the first bytes read from the client
  # and the log write after the last bytes were sent to the client.
  # http://nginx.org/en/docs/http/ngx_http_log_module.html
  al$request_time <- al$request_time * 1000 # Converting to milliseconds.
  # Calculating elapsed time since the start of the experiment; it is more
  # useful than the raw timestamp.
  al <- al %>% arrange(timestamp)
  al$exp_dur_ms <- c(0, al$timestamp[2:NROW(al)] - al$timestamp[1]) * 1000
  # upstream_response_time has one comma-separated entry per upstream hop.
  al$hop1 <- as.numeric(sub(',.*$', '', al$upstream_response_time)) * 1000
  al$hop2 <- as.numeric(sub('^.*,', '', al$upstream_response_time)) * 1000
  al$num_hops <- str_count(al$upstream_response_time, ',') + 1
  # Removing the first and last 10 seconds from the experiment results
  # (EXPERIMENT_DURATION is a global, in milliseconds).
  return(filter(al, exp_dur_ms > 10000 & exp_dur_ms < (EXPERIMENT_DURATION - 10000)))
}
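For reference, a minimal loading sketch is shown below. The file names and the EXPERIMENT_DURATION value are assumptions (the code above only implies a global in milliseconds); the report itself only shows the resulting al.gci and al.nogci data frames.
# Hypothetical usage; file names are assumptions, not the actual experiment artifacts.
EXPERIMENT_DURATION <- 80 * 1000  # assumed: 80s rounds (see run-msgpush.sh), in ms
al.gci   <- read.accesslog("access_gci_on.log")
al.nogci <- read.accesslog("access_gci_off.log")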
print(paste("Number of requests considered (GCI On): ", NROW(al.gci)))
## [1] "Number of requests considered (GCI On): 899980"
summary(al.gci$request_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 3.175 4.000 217.000
print(paste("Number of requests considered (GCI Off): ", NROW(al.nogci)))
## [1] "Number of requests considered (GCI Off): 900566"
summary(al.nogci$request_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.613 4.000 1327.000
hist(log(al.gci$request_time), breaks=50, main="Overall Latency (GCI ON)", ylab = "Frequency", xlab="Latency in log scale (ms)", xlim=c(0,7))
hist(log(al.nogci$request_time), breaks=50, main="Overall Latency (GCI OFF)",ylab = "Frequency", xlab="Latency in log scale (ms)", xlim=c(0,7))
quantile(al.gci$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 6 16 28 42 60
quantile(al.nogci$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 7.0000 20.0000 33.0000 221.7740 927.9887
# Non-parametric test that estimates the pseudo-median with a confidence interval
# (one-sample Wilcoxon signed-rank test; the linked Mann-Whitney U test is its
# two-sample counterpart).
# https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
wilcox.test(sample(al.gci$request_time, 50000), conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon signed rank test with continuity correction
##
## data: sample(al.gci$request_time, 50000)
## V = 1229300000, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 2.500067 2.500090
## sample estimates:
## (pseudo)median
## 2.500002
wilcox.test(sample(al.nogci$request_time, 50000), conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon signed rank test with continuity correction
##
## data: sample(al.nogci$request_time, 50000)
## V = 1232200000, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 2.999922 2.999999
## sample estimates:
## (pseudo)median
## 2.99997
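The two confidence intervals above do not overlap, which already points to a lower pseudo-median latency with GCI on. A direct two-sample comparison (the Mann-Whitney U / Wilcoxon rank-sum test referenced above) is not part of the original analysis, but a sketch of it, using the same sampling approach, would be:
# Hypothetical two-sample check (Wilcoxon rank-sum / Mann-Whitney U):
# estimates the shift in location between the two latency distributions.
wilcox.test(sample(al.gci$request_time, 50000), sample(al.nogci$request_time, 50000),
            conf.int = TRUE, conf.level = 0.95)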
failed.requests <- filter(al.gci, status != 200)
print(paste("Number of failed requests: ", NROW(failed.requests)))
## [1] "Number of failed requests: 0"
summary(failed.requests$request_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
direct.requests <- filter(al.gci, num_hops==1)
print(paste("Number of requests that have not been shed: ", NROW(direct.requests), "(", (NROW(direct.requests)/NROW(al.gci))*100,"%)"))
## [1] "Number of requests that have not been shed: 892392 ( 99.1568701526701 %)"
summary(direct.requests$request_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 3.107 4.000 217.000
hist(direct.requests$request_time, breaks=50, main="Latency of requests that have not been shed", xlab="Latency (ms)")
quantile(direct.requests$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 6 15 26 38 55
The following requests were shed by the first upstream service and processed by the second.
resent.requests <- filter(al.gci, num_hops > 1)
print(paste("Number of shed requests: ", NROW(resent.requests), "(", (NROW(resent.requests)/NROW(al.gci))*100,"%)"))
## [1] "Number of shed requests: 7588 ( 0.843129847329941 %)"
summary(resent.requests$request_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 10.00 11.16 14.00 67.00
hist(resent.requests$request_time, breaks=50, main="Latency of requests that have been shed", xlab="Latency (ms)")
quantile(resent.requests$request_time, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 20.00000 38.00000 57.41300 63.20650 66.62065
Statistics about the time for a request to reach the first instance (upstream server), go through its queue, and be shed.
summary(resent.requests$hop1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 3.00 4.38 6.00 38.00
hist(resent.requests$hop1, main="Latency of the first upstream service (shed)", xlab="Latency (ms)")
quantile(resent.requests$hop1, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 10.00000 20.00000 36.00000 37.24130 37.92413
After being shed by the first instance, the request is redirected. The following statistics consider only the latency at the second upstream server.
summary(resent.requests$hop2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 6.782 9.000 43.000
hist(resent.requests$hop2, main="Latency of the second upstream service (shed)", xlab="Latency (ms)")
quantile(resent.requests$hop2, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 14 30 40 43 43
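For a shed request, the end-to-end request_time should be approximately the sum of the two upstream hops plus the redirection overhead at the load balancer. A quick consistency check (not part of the original analysis) could be:
# Hypothetical sanity check: difference between end-to-end latency and the sum
# of the two upstream response times, i.e. the overhead added by NGINX itself.
summary(resent.requests$request_time - (resent.requests$hop1 + resent.requests$hop2))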
# Parsing some of the gctrace fields and doing some cleanup.
read.gctrace <- function(f) {
  df <- read.csv(f, sep = " ", colClasses=c("wallclock"="character", "mem"="character"))
  # The wall-clock column is a "+"-separated list of phase durations; sum them.
  df$walltime <- sapply(strsplit(df$wallclock, split="+", fixed=T),
                        function(x) sum(as.numeric(unlist(x))))
  df$time <- as.numeric(gsub("@|s", "", df$time)) * 1000 # Elapsed time in ms.
  df$isforced <- ifelse(is.na(df$isforced), 0, df$isforced)
  # cleaned: difference between the first and last values of the
  # "->"-separated mem field (MB).
  df$cleaned <- sapply(strsplit(df$mem, split="->", fixed=T),
                       function(x) as.numeric(x[1]) - as.numeric(x[3]))
  # Removing the first and last 10 seconds from the experiment results.
  df <- filter(df, time > 10000 & time < (EXPERIMENT_DURATION - 10000))
  return(na.omit(df))
}
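As with the access logs, the gctrace file names are not shown in the report; a hypothetical loading step would be:
# Hypothetical usage; file names are assumptions.
gctrace.gci   <- read.gctrace("gctrace_gci_on.csv")
gctrace.nogci <- read.gctrace("gctrace_gci_off.csv")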
print(paste("Total number of collections considered (GCI On):", NROW(gctrace.gci)))
## [1] "Total number of collections considered (GCI On): 889"
summary(gctrace.gci$walltime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.411 5.126 6.509 6.835 8.073 25.020
print(paste("Total number of collections considered (GCI Off):", NROW(gctrace.nogci)))
## [1] "Total number of collections considered (GCI Off): 193"
summary(gctrace.nogci$walltime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.177 5.251 7.490 8.103 10.100 31.240
hist(gctrace.gci$walltime, breaks=50, main="Garbage Collection Duration (GCI On)", xlab="GC Duration - Wall time (ms)", ylab="Frequency")
hist(gctrace.nogci$walltime, breaks=50, main="Garbage Collection Duration (GCI Off)", xlab="GC Duration - Wall time (ms)", ylab="Frequency")
quantile(gctrace.gci$walltime, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 9.75560 15.02000 24.11835 24.93434 25.01593
quantile(gctrace.nogci$walltime, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 12.86800 19.48832 29.88406 31.10801 31.23040
summary(gctrace.gci$cleaned)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 362.0 362.0 350.3 362.0 543.0
summary(gctrace.nogci$cleaned)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 1671 1671 1566 1672 1675
hist(gctrace.gci$cleaned, breaks=50, main="Amount of garbage cleaned at each collection (GCI On)", xlab="Amount of garbage cleaned (MB)")
hist(gctrace.nogci$cleaned, breaks=50, main="Amount of garbage cleaned at each collection (GCI Off)", xlab="Amount of garbage cleaned (MB)")
quantile(gctrace.gci$cleaned, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 362.0000 362.0000 383.1600 527.0160 541.4016
quantile(gctrace.nogci$cleaned, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 1673 1675 1675 1675 1675
read.cpu <- function(f) {
  df <- read.csv(f)
  # Dropping the first and last 10 samples (warm-up/cool-down), as done for
  # the other data sets.
  return(na.omit(df[11:(NROW(df) - 10), ]))
}
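Again, the CPU-usage file names are assumptions; loading presumably looks like:
# Hypothetical usage; file names are assumptions.
cpu.gci   <- read.cpu("cpu_gci_on.csv")
cpu.nogci <- read.cpu("cpu_gci_off.csv")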
summary(cpu.gci$cpu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.0 107.0 113.0 114.2 122.0 142.0
summary(cpu.nogci$cpu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.0 113.0 120.4 119.8 127.0 174.3
# CPU usage appears to be reported as a per-core percentage (up to 200%,
# suggesting 2 cores); dividing by 200 normalizes it to 0-100%.
hist((cpu.gci$cpu/200)*100, breaks=50, main="CPU Usage (GCI On)", xlab="CPU Usage (%)", xlim = c(0, 100))
hist((cpu.nogci$cpu/200)*100, breaks=50, main="CPU Usage (GCI Off)", xlab="CPU Usage (%)", xlim = c(0, 100))
quantile(cpu.gci$cpu, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 129.0000 139.6880 141.2860 141.9286 141.9929
quantile(cpu.nogci$cpu, c(0.9,0.99,0.999,0.9999, 0.99999))
## 90% 99% 99.9% 99.99% 99.999%
## 134.0000 142.0000 156.2726 172.5333 174.1593
# Non-parametric test that estimates the pseudo-median with a confidence interval
# (one-sample Wilcoxon signed-rank test; the linked Mann-Whitney U test is its
# two-sample counterpart).
# https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
wilcox.test(cpu.gci$cpu, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon signed rank test with continuity correction
##
## data: cpu.gci$cpu
## V = 255970, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 113.4999 115.0000
## sample estimates:
## (pseudo)median
## 114.45
wilcox.test(cpu.nogci$cpu, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon signed rank test with continuity correction
##
## data: cpu.nogci$cpu
## V = 255260, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 119.5 121.0
## sample estimates:
## (pseudo)median
## 120
Note: Using fake IP addresses.
$ INSTANCES="1.1.1.1 1.1.1.2" ./update-msgpush.sh
$ ROUND_START=1 ROUND_END=5 EXPERIMENT_DURATION=80s SUFFIX=_3K_300KB_5K THROUGHPUT=3000 MSG_SIZE=361200 WINDOW_SIZE=5000 USE_GCI=true INSTANCES="1.1.1.1 1.1.1.2" LB=1.1.1.5 ./run-msgpush.sh
$ ROUND_START=1 ROUND_END=5 EXPERIMENT_DURATION=80s SUFFIX=_3K_300KB_5K THROUGHPUT=3000 MSG_SIZE=361200 WINDOW_SIZE=5000 USE_GCI=false INSTANCES="1.1.1.1 1.1.1.2" LB=1.1.1.5 ./run-msgpush.sh
Note: Using fake IP addresses.
user www-data;
worker_processes auto;
pid /run/nginx.pid;

events {
    worker_connections 768;
    # multi_accept on;
}

http {
    upstream backends {
        server 1.1.1.1:3000 max_fails=0 fail_timeout=15s;
        server 1.1.1.2:3000 max_fails=0 fail_timeout=15s;
        #server 1.1.1.3:3000 max_fails=0 fail_timeout=15s;
        #server 1.1.1.4:3000 max_fails=0 fail_timeout=15s;
    }

    server {
        listen 80;
        location / {
            proxy_pass http://backends;
            proxy_next_upstream timeout error http_503;
            proxy_next_upstream_tries 0;
            proxy_read_timeout 5m;
        }
    }

    ##
    # Logging Settings
    ##
    log_format exp '$msec;$status;$request_time;$upstream_response_time';
    access_log /var/log/nginx/access.log exp;
    error_log /var/log/nginx/error.log;

    ##
    # Virtual Host Configs
    ##
    #include /etc/nginx/conf.d/*.conf;
    #include /etc/nginx/sites-enabled/*;
}
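The access_log lines produced by the exp format above are what read.accesslog parses. The sketch below illustrates the mapping; the sample values and the header row are assumptions inferred from the column names used in the analysis, not actual log contents.
# Hypothetical excerpt in the 'exp' format: $msec;$status;$request_time;$upstream_response_time.
# A header row matching the column names used above (timestamp, status,
# request_time, upstream_response_time) is assumed to be prepended before parsing.
sample.log <- "timestamp;status;request_time;upstream_response_time
1514764800.123;200;0.002;0.001
1514764800.456;200;0.015;0.003, 0.009"
al <- read.csv(text = sample.log, sep = ";", colClasses = c("upstream_response_time" = "character"))
al$num_hops <- stringr::str_count(al$upstream_response_time, ',') + 1  # second request used 2 hops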