#=========================================================
There were 20 warnings (use warnings() to see them)
# PREAMBLE
#=========================================================
# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(tikzDevice)
# Use the black-and-white ggplot2 theme for all figures.
theme_set(theme_bw())
# Turn off scientific notation such as 1e+48 when printing numbers.
options(scipen = 999)

# Point size used by the scatter plots (1 was tried as an alternative).
POINT_SIZE <- 0.5
# Timeout as given in the notebook parameters.
TIMEOUT <- params$timeout
# Stand-in value 5 % above the timeout; used for missing times and as the
# upper plot limit further down.
TIMEOUT_VAL <- 1.05 * TIMEOUT
# Smallest time shown, in seconds; zero measurements are raised to it.
TIME_MIN <- 0.01
# FUNCTIONS
# Load a semicolon-separated CSV file with decimal commas into a data frame.
#
# file: path of the CSV file; converted to a string with paste0().
#
# Returns the data frame. Leading/trailing whitespace of unquoted fields is
# stripped and character columns stay plain strings (no factors).
read_file <- function(file) {
  csv_path <- paste0(file)
  # read.csv2() already defaults to header = TRUE, sep = ";", dec = "," and
  # comment.char = "", so only the non-default options are spelled out.
  read.csv2(csv_path, strip.white = TRUE, stringsAsFactors = FALSE)
}
# Build a log-log scatter plot comparing two columns of `df`.
#
# df:      data frame holding the measurements
# xlab:    name (string) of the column shown on the x axis
# ylab:    name (string) of the column shown on the y axis
# xstring: x-axis label; defaults to the column name
# ystring: y-axis label; defaults to the column name
#
# Returns the ggplot object; nothing is drawn here.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  # Both axes are restricted to the same range.
  axis_range <- c(TIME_MIN, TIMEOUT_VAL)

  # Points plus thin guide lines: geom_abline() with its defaults and dashed
  # lines at TIMEOUT_VAL on both axes.
  scatter <- ggplot(df, aes_string(x=xlab, y=ylab)) +
    geom_point(size=POINT_SIZE) +
    geom_abline(size=0.1) +
    geom_vline(size=0.1, xintercept=TIMEOUT_VAL, linetype="dashed") +
    geom_hline(size=0.1, yintercept=TIMEOUT_VAL, linetype="dashed")

  # Logarithmic axes with a fixed aspect ratio.
  scatter <- scatter +
    scale_x_log10() +
    scale_y_log10() +
    coord_fixed(xlim = axis_range, ylim = axis_range)

  # Rotate the y tick labels and apply the axis titles.
  scatter +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    labs(x=xstring, y=ystring)
}
# Render `picture` into a TikZ file.
#
# file:          path of the output file
# picture:       object drawn with plot() (callers in this file pass ggplot
#                objects)
# width, height: device size handed to tikz()
#
# The device opened by tikz() is closed whether or not plot() succeeds, so a
# failing plot does not leave an open graphics device behind.
# Returns (invisibly) the value of dev.off().
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file=file, onefile=TRUE, width=width, height=height)
  closed <- NULL
  # `finally` is evaluated in this function's frame on success and on error
  # alike; an error raised by plot() still propagates to the caller.
  tryCatch(plot(picture), finally = {
    closed <- dev.off()
  })
  invisible(closed)
}
df <- read_file(params$file_cmp)
######################### SANITIZE ###############################
# Columns holding the running times, and the matching "<tool>.matches" columns.
tools.times <- c("re2g", "cad", "grep", "srm", "dot.net")
tools.matches <- paste0(tools.times, ".matches")
# Time columns: replace the first comma by a dot, then coerce to numeric.
# Entries that are not numbers become NA (coercion warnings are muted).
df[tools.times] <- lapply(df[tools.times], function(column) {
  suppressWarnings(as.numeric(sub(",", ".", column)))
})
# Match-count columns: coerce to integer, again turning non-numbers into NA.
df[tools.matches] <- lapply(df[tools.matches], function(column) {
  suppressWarnings(as.integer(column))
})
# Get rid of extremal values: missing times become TIMEOUT_VAL and zero
# times are raised to TIME_MIN.
time_cols <- df[tools.times]
time_cols[is.na(time_cols)] <- TIMEOUT_VAL
time_cols[time_cols == 0] <- TIME_MIN
df[tools.times] <- time_cols
# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new
############################## COUNTING TIMEOUTS #######################
# Logical matrix (one column per tool): TRUE where the time equals TIMEOUT_VAL.
timed.out <- df[tools.times] == TIMEOUT_VAL
# Number of benchmarks on which each tool hit the timeout value.
timeouts.re2 <- nrow(df[timed.out[, "re2g"],])
timeouts.ca <- nrow(df[timed.out[, "cad"],])
timeouts.grep <- nrow(df[timed.out[, "grep"],])
timeouts.srm <- nrow(df[timed.out[, "srm"],])
timeouts.dot.net <- nrow(df[timed.out[, "dot.net"],])
# Benchmarks on which both CA and RE2 hit the timeout value.
timeouts.re2.and.ca <- nrow(df[timed.out[, "cad"] & timed.out[, "re2g"],])
These are the results of the experiments for Counting Set Automata:
| File |
results-10-05-2020/table-ALL.csv |
| Timeout |
300 s |
| Benchmarks |
4315 |
| Timeouts CA |
0 |
| Timeouts RE2 |
25 |
| Timeouts SRM |
25 |
| Timeouts grep |
239 |
| Timeouts .NET |
42 |
df
Sanity checks
# TRUE when RE2's match count differs from that of any other tool. Missing
# counts are not guarded here, so the value can be NA for such rows.
df$inconsistent <- with(df,
  re2g.matches != grep.matches |
    re2g.matches != srm.matches |
    re2g.matches != dot.net.matches |
    re2g.matches != cad.matches)
# grep vs. RE2: both counts present and different.
df$grep.re2.mismatch <- with(df,
  !is.na(re2g.matches) & !is.na(grep.matches) & re2g.matches != grep.matches)
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]
# CA vs. RE2: both counts present and different.
df$re2.ca.mismatch <- with(df,
  !is.na(re2g.matches) & !is.na(cad.matches) & re2g.matches != cad.matches)
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]
| grep and RE2 mismatched |
1110 |
| CA and RE2 mismatched |
103 |
RE2 and CA mismatches
df_re2_ca_mismatch
Scatter Plots
# Build the log-log scatter plot for two columns of `df`, export it as
# figs/<xlab>-vs-<ylab>.tikz (width 4, height 4) and return the plot.
#
# df:               data frame holding the measurements
# xlab, ylab:       column names (strings) for the x and y axes; they also
#                   form the output file name
# xstring, ystring: axis labels; default to the column names
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  scatter <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  tikz_path <- paste0("figs/", xlab, "-vs-", ylab, ".tikz")
  make_tikz(tikz_path, scatter, width=4, height=4)
  scatter
}
# CA against each other tool, with explicit axis labels. Every call also
# writes the corresponding .tikz file under figs/.
plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]")
plot2 <- plot.and.tikz(df, "grep", "cad", xstring="grep [s]", ystring="CA [s]")
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]")
plot4 <- plot.and.tikz(df, "dot.net", "cad", xstring=".NET [s]", ystring="CA [s]")
# Remaining pairwise comparisons; axis labels default to the column names.
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dot.net", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dot.net", "grep")
plot10 <- plot.and.tikz(df, "dot.net", "srm")
# Display the plots side by side, two per figure.
grid.arrange(plot1, plot2, ncol = 2)

grid.arrange(plot3, plot4, ncol = 2)

grid.arrange(plot5, plot6, ncol = 2)

grid.arrange(plot7, plot8, ncol = 2)

grid.arrange(plot9, plot10, ncol = 2)

Histograms
# Components shared by all timing histograms: blue-outlined, light-blue bars
# and a log10 count axis. A function is used so that every plot receives
# freshly created components.
time_hist_layers <- function() {
  list(
    geom_histogram(color="blue", fill="lightblue"),
    scale_y_log10()
  )
}
# One histogram of running times per tool.
hist1 <- ggplot(df, aes(x=re2g)) + time_hist_layers()
hist2 <- ggplot(df, aes(x=cad)) + time_hist_layers()
hist3 <- ggplot(df, aes(x=srm)) + time_hist_layers()
hist4 <- ggplot(df, aes(x=grep)) + time_hist_layers()
hist5 <- ggplot(df, aes(x=dot.net)) + time_hist_layers()
# Display them in a two-column layout.
grid.arrange(hist1, hist2, ncol = 2)

grid.arrange(hist3, hist4, ncol = 2)

grid.arrange(hist5, ncol = 2)

NA
NA
Finding winners
# Fastest time per benchmark over all five tools ...
df$min <- with(df, pmin(grep, srm, re2g, dot.net, cad))
# ... and over the four tools other than CA.
df$enemy.min <- with(df, pmin(grep, srm, re2g, dot.net))
# Number of benchmarks on which each tool's time equals the overall minimum
# (a tie counts for every tool involved).
winners.grep <- nrow(df[with(df, min == grep),])
winners.re2 <- nrow(df[with(df, min == re2g),])
winners.ca <- nrow(df[with(df, min == cad),])
winners.srm <- nrow(df[with(df, min == srm),])
winners.dot.net <- nrow(df[with(df, min == dot.net),])
| CA |
32 |
| RE2 |
3759 |
| SRM |
7 |
| grep |
1736 |
| .NET |
64 |
# CA against the per-benchmark minimum over grep, SRM, RE2 and .NET
# (column enemy.min); also written to figs/enemy.min-vs-cad.tikz.
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")

How much better we are than RE2
# Ratio of RE2 time to CA time per benchmark; values above 1 mean CA was faster.
df$re2.vs.ca <- df$re2g / df$cad
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
10: In readChar(file, size, TRUE) : truncating string with embedded nuls
# Order the benchmarks by the RE2/CA ratio, largest ratio first.
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]
# Print only the selected columns of the sorted table.
df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca")]
Experiments with increasing counter value
# Load the measurements for the increasing-counter experiment.
big <- read_file(params$file_big)
# Replace the first comma in each RE2 time by a dot and coerce to numeric;
# entries that are not numbers become NA (with a coercion warning).
big$re2g <- as.numeric(sub(",", ".", big$re2g))
NAs introduced by coercion
big
# Reshape `big` into long format: one row per counter value and approach,
# with the measured value in `time` and the source column name in `approach`.
long_times <- function(approach) {
  part <- big[, c("Counter", approach)]
  names(part)[2] <- "time"
  part$approach <- approach
  part
}
# Stack the RE2 rows first, then the CA rows.
together <- do.call(rbind, lapply(c("re2g", "ca"), long_times))
# Line plot of time against the counter value, one coloured line per approach.
big_plot <- ggplot(data=together, aes(x=Counter, y=time, colour=approach)) +
  geom_line() +
  # Lower x limit left open (NA), upper limit 1250.
  xlim(NA, 1250) +
  labs(x="$k$", y="time [s]") +
  # Legend placement and appearance; the legend title is removed.
  theme(legend.title = element_blank(),
        legend.position = c(.05, .95),
        legend.justification = c("left", "top"),
        legend.box.just = "right",
        legend.margin = margin(1, 1, 1, 1))
# Export the figure as TikZ, then draw it in the notebook.
make_tikz("figs/big_plot.tikz", big_plot, width=2.7, height=2.7)
plot(big_plot)

# Information about DCAs
# Read the tab-separated DCA table. read.delim() supplies header = TRUE,
# sep = "\t", dec = "." and comment.char = "" by default; quoting is turned
# off and field whitespace is stripped.
df_dcas <- read.delim(params$file_dca,
                      quote="",
                      strip.white=TRUE,
                      stringsAsFactors=FALSE)
# sanitize: treat a missing timeouts.classical entry as 0
df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0
#df_dcas
# Rows whose timeouts.classical entry equals 1.
tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
# Select the rows of `df` whose entry in column `col` equals TIMEOUT_VAL.
#
# df:  data frame to filter
# col: column selector handed to df[, col]
#
# Returns the selected rows.
compute_timeouts <- function(df, col) {
  hit_timeout <- df[, col] == TIMEOUT_VAL
  df[hit_timeout,]
}
