#=========================================================
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
# PREAMBLE
#=========================================================
# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(ggExtra)
library(tikzDevice)
library(Hmisc)
library(pastecs)
# ---- Global plot and measurement settings ----
# Use the black-and-white ggplot2 theme for every figure.
theme_set(theme_bw())
# Print plain decimals instead of scientific notation such as 1e+48.
options(scipen = 999)

# Point size used in the scatter plots (alternative that was tried: 1).
POINT_SIZE <- 0.1

# Timeout comes from the report parameters. TIMEOUT_VAL is 5 % above it and
# is the value later stored for runs that exceeded the timeout or have no time.
TIMEOUT <- params$timeout
TIMEOUT_VAL <- 1.05 * TIMEOUT

# Lower saturation bound for times (alternative that was tried: 0.01).
TIME_MIN <- 0.1 # seconds

# Figure sizes handed to the plotting helpers.
BIG_SIZE <- 3
SMALL_SIZE <- 2
# FUNCTIONS
# Read a semicolon-separated table that uses a decimal comma.
#
# file: path of the CSV file to load.
# Returns a data frame. The first line is the header, strings stay character
# vectors, unquoted fields are stripped of surrounding white space, and no
# character starts a comment.
read_file <- function(file) {
  csv_path <- paste0(file)
  read.csv2(
    csv_path,
    header = TRUE,
    sep = ";",
    dec = ",",
    quote = "\"",
    comment.char = "",
    strip.white = TRUE,
    allowEscapes = FALSE,
    stringsAsFactors = FALSE
  )
}
# Build a log-log scatter plot that compares two columns of `df`.
#
# df:      data frame holding the measurements.
# xlab:    name (string) of the column drawn on the x axis.
# ylab:    name (string) of the column drawn on the y axis.
# xstring: x-axis label; defaults to the column name.
# ystring: y-axis label; defaults to the column name.
#
# Reads the globals POINT_SIZE, TIME_MIN and TIMEOUT_VAL.
# Returns the ggplot object; nothing is drawn here.
plot_scatter_log <- function(df, xlab, ylab, xstring = xlab, ystring = ylab) {
  # Columns are looked up by name through the `.data` pronoun, which replaces
  # the deprecated aes_string().
  pscat <- ggplot(df, aes(x = .data[[xlab]], y = .data[[ylab]])) +
    geom_point(size = POINT_SIZE) +
    # Diagonal (geom_abline defaults: slope 1, intercept 0).
    geom_abline(size = 0.1) +
    # Dashed guides at the value stored for timed-out runs.
    geom_vline(size = 0.1, xintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_hline(size = 0.1, yintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_rug(alpha = 0.2) +
    scale_x_log10() +
    scale_y_log10() +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    # Same limits on both axes so the panel stays square.
    # Earlier variant used ylim = c(0.1, TIMEOUT_VAL).
    coord_fixed(xlim = c(TIME_MIN, TIMEOUT_VAL),
                ylim = c(TIME_MIN, TIMEOUT_VAL)) +
    labs(
      # title = "Title",
      # subtitle = "Subtitle",
      x = xstring,
      y = ystring
    )
  # Disabled experiments kept for reference:
  #   theme(
  #     panel.grid.major = element_blank(),
  #     panel.grid.minor = element_blank(),
  #     panel.background = element_rect(fill = "transparent", colour = NA),
  #     plot.background = element_rect(fill = "transparent", colour = NA)
  #   )
  #   theme_minimal()
  #   theme_bw()
  #   theme(plot.background = element_rect(fill = NA))
  #   pscat <- ggMarginal(pscat, type = "density", size = 10)
  #   pscat <- pscat + theme_bw()
  return(pscat)
}
# Render `picture` into a TikZ file.
#
# file:    output path handed to tikz().
# picture: object drawn with plot().
# width, height: device size handed to tikz().
#
# Returns, invisibly, the value of dev.off(), as before.
# The device is now also closed when plot() fails, so a failed figure no
# longer leaves an open device behind that would capture later plots.
make_tikz <- function(file, picture, width = 2.5, height = 2.5) {
  tikz(file = file, onefile = TRUE, width = width, height = height)
  device_id <- dev.cur()
  closed <- FALSE
  # Safety net for the error path only; the normal path closes the device below.
  on.exit(if (!closed) dev.off(device_id), add = TRUE)
  plot(picture)
  garbage <- dev.off(device_id)
  closed <- TRUE
  invisible(garbage)
}
# Render `picture` into a PDF file.
#
# file:    output path handed to pdf().
# picture: object drawn with plot().
# width, height: device size handed to pdf().
#
# Returns, invisibly, the value of dev.off(), as before.
# The device is now also closed when plot() fails, so a failed figure no
# longer leaves an open device behind that would capture later plots.
make_pdf <- function(file, picture, width = 5, height = 5) {
  # width/height are passed by name instead of relying on positional matching
  # after the named arguments.
  pdf(file = file, onefile = TRUE, width = width, height = height)
  device_id <- dev.cur()
  closed <- FALSE
  # Safety net for the error path only; the normal path closes the device below.
  on.exit(if (!closed) dev.off(device_id), add = TRUE)
  plot(picture)
  garbage <- dev.off(device_id)
  closed <- TRUE
  invisible(garbage)
}
# Load the comparison table named in the report parameters and remember how
# many rows it had before any filtering.
df <- read_file(params$file_cmp)
orig_size <- nrow(df)

# Rename the two .NET columns from "dot.net*" to "dotnet*".
names(df)[names(df) == "dot.net"] <- "dotnet"
names(df)[names(df) == "dot.net.matches"] <- "dotnet.matches"

######################### SANITIZE ###############################
# Drop the rows whose SRM match column contains "File not found".
input_missing <- grepl("File not found", df$srm.matches)
df_new <- df[!input_missing, ]
print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to generation of input text"))
[1] "Removing 2466 lines due to generation of input text"
df <- df_new

# Time columns and the corresponding match-count columns.
# (A variant without grep / grep.matches was used at some point.)
tools.times <- c("re2g", "cad", "grep", "srm", "dotnet")
tools.matches <- c("re2g.matches", "cad.matches", "grep.matches", "srm.matches", "dotnet.matches")

# Count, per tool, the rows whose time column contains "ERR".
has_re2_error <- grepl("ERR", df$re2g)
errors.re2g <- sum(has_re2_error)
errors.grep <- sum(grepl("ERR", df$grep))
errors.srm <- sum(grepl("ERR", df$srm))
errors.cad <- sum(grepl("ERR", df$cad))
errors.dotnet <- sum(grepl("ERR", df$dotnet))

# Only the rows with an RE2 error are removed.
df <- df[!has_re2_error, ]

# Convert every time column to numeric and saturate it:
#   above TIMEOUT or not a number -> TIMEOUT_VAL
#   below TIME_MIN                -> TIME_MIN
for (time_col in tools.times) {
  seconds <- suppressWarnings(as.numeric(sub(",", ".", df[[time_col]])))
  seconds[!is.na(seconds) & seconds > TIMEOUT] <- TIMEOUT_VAL
  seconds[is.na(seconds)] <- TIMEOUT_VAL
  seconds[seconds < TIME_MIN] <- TIME_MIN
  df[[time_col]] <- seconds
}

# Match counts become integers; anything unparsable becomes NA.
df[tools.matches] <- lapply(
  df[tools.matches],
  function(counts) suppressWarnings(as.integer(counts))
)

df$src <- as.factor(df$src)

# Disabled earlier cleaning step:
#   df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#   print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#   df <- df_new

############################## COUNTING TIMEOUTS #######################
# The time columns hold no NA at this point, so the comparisons are plain
# TRUE/FALSE and can be summed directly.
timeouts.re2g <- sum(df$re2g == TIMEOUT_VAL)
timeouts.cad <- sum(df$cad == TIMEOUT_VAL)
timeouts.grep <- sum(df$grep == TIMEOUT_VAL)
timeouts.srm <- sum(df$srm == TIMEOUT_VAL)
timeouts.dotnet <- sum(df$dotnet == TIMEOUT_VAL)
timeouts.re2.and.ca <- sum(df$cad == TIMEOUT_VAL & df$re2g == TIMEOUT_VAL)
# | **Timeouts grep** | `r timeouts.grep` |
These are results of the experiments for Counting Set Automata:
| File | results-12-05-2020/table-ALL-ondra-processed.csv | |
| Timeout | 600 s | |
| TIMEOUT_VAL | 630 s | |
| TIME_MIN | 0.1 | |
| original size | 4397 | |
| Benchmarks | 1866 | |
| Timeouts CA | 0 | |
| Timeouts RE2 | 0 | |
| Timeouts SRM | 10 | |
| Timeouts grep | 52 | |
| Timeouts .NET | 7 | |
| Errors CA | 0 | |
| Errors RE2 | 48 | (removed) |
| Errors SRM | 0 | |
| Errors grep | 51 | |
| Errors .NET | 0 | |
# Print the benchmark table as it stands after the cleaning steps.
df
Summary of benchmarks
# summary() of the `src` factor, wrapped in a data frame and printed.
df_benches <- data.frame(summary(df$src))
df_benches
Sanity checks
# A row is "inconsistent" when the match count of any tool differs from RE2's.
# The value is NA when no difference was found but some count is missing.
df$inconsistent <- df$re2g.matches != df$grep.matches | df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches
# Variant without grep:
#df$inconsistent <- df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

# Definite mismatches only: both counts are present and they differ.
# These flags are never NA.
df$grep.re2.mismatch <- !is.na(df$re2g.matches) & !is.na(df$grep.matches) & df$re2g.matches != df$grep.matches
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]
df$re2.ca.mismatch <- !is.na(df$re2g.matches) & !is.na(df$cad.matches) & df$re2g.matches != df$cad.matches
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]

# Remove the rows where RE2 and CA definitely disagree.
# The previous filter tested is.na(grep.matches) although it compares RE2 with
# CA. That kept real RE2/CA mismatches whenever grep had no count, and a
# missing CA count made the row index NA, which inserts an all-NA row.
# Reusing the NA-free flag computed above avoids both problems.
df <- df[!df$re2.ca.mismatch, ]
| CA and RE2 mismatched | 18 | (removed) |
| grep and RE2 mismatched | 295 | |
RE2 and CA mismatches
# Print the rows where RE2 and CA reported different match counts.
df_re2_ca_mismatch
Scatter Plots
# Build the log-log scatter plot of two columns, write it as a PDF under
# figs/, and return the plot.
#
# df, xlab, ylab, xstring, ystring: forwarded to plot_scatter_log().
# width, height: size forwarded to make_pdf(); height defaults to width.
# The TikZ export is disabled; only the PDF is written.
plot.and.tikz <- function(df, xlab, ylab, xstring = xlab, ystring = ylab,
                          width = 4, height = width) {
  scatter <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  pdf_path <- paste0("figs/", xlab, "-vs-", ylab, ".pdf")
  # make_tikz(paste0("figs/", xlab, "-vs-", ylab, ".tikz"), scatter, width, height)
  make_pdf(pdf_path, scatter, width, height)
  scatter
}
# For the grep comparison keep only the rows where grep and CA report the same
# number of matches, or where one of the two counts is missing.
grep_agrees_with_ca <- is.na(df$grep.matches) | is.na(df$cad.matches) |
  df$grep.matches == df$cad.matches
df_grep <- df[grep_agrees_with_ca, ]

# CA against every other tool (these four get explicit axis labels and sizes).
plot1 <- plot.and.tikz(
  df, "re2g", "cad",
  xstring = "RE2 [s]", ystring = "CA [s]", width = BIG_SIZE
)
plot2 <- plot.and.tikz(
  df_grep, "grep", "cad",
  xstring = "grep [s]", ystring = "CA [s]", width = SMALL_SIZE
)
plot3 <- plot.and.tikz(
  df, "srm", "cad",
  xstring = "SRM [s]", ystring = "CA [s]", width = SMALL_SIZE
)
plot4 <- plot.and.tikz(
  df, "dotnet", "cad",
  xstring = ".NET [s]", ystring = "CA [s]", width = SMALL_SIZE
)

# The remaining tool pairs, with default labels and size.
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dotnet", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dotnet", "grep")
plot10 <- plot.and.tikz(df, "dotnet", "srm")

# Show the plots two per row.
# Pairings tried earlier: (plot1, plot3), (plot4, plot5), (plot7, plot10).
grid.arrange(plot1, plot2, ncol = 2)

grid.arrange(plot3, plot4, ncol = 2)

grid.arrange(plot5, plot6, ncol = 2)

grid.arrange(plot7, plot8, ncol = 2)

grid.arrange(plot9, plot10, ncol = 2)

Histograms
# Layers shared by all histograms: blue bars and a log-scaled count axis.
hist_layers <- list(
  geom_histogram(color = "blue", fill = "lightblue"),
  scale_y_log10()
)

hist1 <- ggplot(df, aes(x = re2g)) + hist_layers
hist2 <- ggplot(df, aes(x = cad)) + hist_layers
hist3 <- ggplot(df, aes(x = srm)) + hist_layers
# The grep histogram is disabled:
# hist4 <- ggplot(df, aes(x = grep)) + hist_layers
hist5 <- ggplot(df, aes(x = dotnet)) + hist_layers

grid.arrange(hist1, hist2, ncol = 2)

grid.arrange(hist3, hist5, ncol = 2)

# grid.arrange(hist5, ncol = 2)
Finding winners
# Fastest time per benchmark over all tools, and over all tools except CA.
df$min <- pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dotnet)
# Variant without grep:
#df$min <- pmin(df$srm, df$re2g, df$dotnet, df$cad)
#df$enemy.min <- pmin(df$srm, df$re2g, df$dotnet)

# Number of benchmarks on which each tool reached the minimum.
# A tie counts for every tool involved.
winners.grep <- nrow(df[df$min == df$grep,])
winners.re2 <- nrow(df[df$min == df$re2g,])
winners.ca <- nrow(df[df$min == df$cad,])
winners.srm <- nrow(df[df$min == df$srm,])
winners.dotnet <- nrow(df[df$min == df$dotnet,])

# Benchmarks where CA is at least as fast / 10x / 100x faster than each tool.
# The grep comparisons run over df_grep, so the rows must be taken from
# df_grep as well. Indexing df with a mask built from the shorter df_grep
# recycled the mask and produced wrong counts.
winners.ca.over.re2 <- nrow(df[df$cad <= df$re2g,])
winners.ca.over.grep <- nrow(df_grep[df_grep$cad <= df_grep$grep,])
winners.ca.over.srm <- nrow(df[df$cad <= df$srm,])
winners.ca.over.dotnet <- nrow(df[df$cad <= df$dotnet,])
winners.10.ca.over.re2 <- nrow(df[10 * df$cad <= df$re2g,])
winners.10.ca.over.grep <- nrow(df_grep[10 * df_grep$cad <= df_grep$grep,])
winners.10.ca.over.srm <- nrow(df[10 * df$cad <= df$srm,])
winners.10.ca.over.dotnet <- nrow(df[10 * df$cad <= df$dotnet,])
winners.100.ca.over.re2 <- nrow(df[100 * df$cad <= df$re2g,])
winners.100.ca.over.grep <- nrow(df_grep[100 * df_grep$cad <= df_grep$grep,])
winners.100.ca.over.srm <- nrow(df[100 * df$cad <= df$srm,])
winners.100.ca.over.dotnet <- nrow(df[100 * df$cad <= df$dotnet,])

# Benchmarks on which each tool needed more than 10 seconds.
longer.than.10.seconds.ca <- nrow(df[df$cad > 10,])
longer.than.10.seconds.re2 <- nrow(df[df$re2g > 10,])
longer.than.10.seconds.srm <- nrow(df[df$srm > 10,])
longer.than.10.seconds.dotnet <- nrow(df[df$dotnet > 10,])
longer.than.10.seconds.grep <- nrow(df[df$grep > 10,])
| CA | 72 |
| RE2 | 1392 |
| SRM | 19 |
| .NET | 45 |
| grep | 833 |
| RE2 | 171 / 1866 |
| SRM | 266 / 1866 |
| .NET | 619 / 1866 |
| grep | 668 / 1581 |
| RE2 | 63 / 1866 |
| SRM | 127 / 1866 |
| .NET | 112 / 1866 |
| grep | 200 / 1581 |
| RE2 | 15 / 1866 |
| SRM | 82 / 1866 |
| .NET | 18 / 1866 |
| grep | 67 / 1581 |
| CA | 20 |
| RE2 | 63 |
| SRM | 127 |
| .NET | 132 |
| grep | 215 |
# Scatter plot of CA against the fastest competing tool on each benchmark
# (column enemy.min); the call also writes figs/enemy.min-vs-cad.pdf.
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")

How much better we are than RE2
# Speed-up of CA over RE2 on each benchmark (RE2 time divided by CA time).
df$re2.vs.ca <- df$re2g / df$cad
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]
#df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca", "re2.ca.mismatch")]

# Keep the ten benchmarks with the largest speed-up. head() returns fewer rows
# when fewer than ten exist, whereas [1:10, ] padded the table with all-NA rows.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
df_sorted

# Write the table as LaTeX to figs/best_results.tex.
haf <- latex(df_sorted,
             file="figs/best_results.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
Summaries
# Descriptive statistics (pastecs::stat.desc) of the five time columns.
summary_columns <- c("re2g", "cad", "srm", "dotnet", "grep")
df_for_summary <- df[, summary_columns]
# Alternative based on summary():
#   df.summary <- do.call(cbind, lapply(df_for_summary, summary))
#   df.summary
desc <- stat.desc(df_for_summary)
desc

# Write the statistics as LaTeX to figs/stats.tex.
haf <- latex(
  desc,
  file = "figs/stats.tex",
  booktabs = TRUE,
  table.env = FALSE,
  center = "none"
)
Experiments with increasing counter value
# Load the counter-scaling measurements. The RE2 column is turned into numbers
# after replacing a decimal comma; entries that are not numbers become NA
# (with a coercion warning).
big <- read_file(params$file_big)
big$re2g <- as.numeric(sub(",", ".", big$re2g))
NAs introduced by coercion
big

# Reshape `big` into long format: one row per (Counter, approach) with the
# measured value in `time`. Names are the legend labels, values are the
# source columns in `big`; the order fixes the row order of the result.
approach_columns <- c(
  "RE2" = "re2g",
  "CA" = "cad",
  "SRM" = "srm",
  ".NET" = "dot.net",
  "grep" = "grep"
)
together <- do.call(rbind, lapply(names(approach_columns), function(label) {
  part <- big[, c("Counter", approach_columns[[label]])]
  names(part)[2] <- "time"
  part$approach <- label
  part
}))

BIG_STEP <- 100
# Thin the data out: keep only counter values that are multiples of BIG_STEP.
together <- together[together$Counter %% BIG_STEP == 0, ]
# Legend drawn inside the panel, anchored near its top-left corner, without a
# title. (A boxed legend via legend.box.background was tried and disabled.)
legend_inside <- theme(
  legend.position = c(.02, .98),
  legend.justification = c("left", "top"),
  legend.box.just = "right",
  legend.margin = margin(1, 1, 1, 1),
  legend.title = element_blank()
)

# Time against counter value, one line and point shape per approach.
# The axes are cut at k = 2000 and 20 s; a dashed line marks y = 0.
big_plot <- ggplot(data = together, aes(x = Counter, y = time, colour = approach)) +
  geom_line() +
  geom_point(aes(shape = approach)) +
  xlim(NA, 2000) +
  ylim(NA, 20) +
  geom_hline(size = 0.1, yintercept = 0, linetype = "dashed") +
  legend_inside +
  labs(
    # title = "Title",
    # subtitle = "Subtitle",
    x = "k",
    y = "time [s]"
  )
# Earlier per-tool variant:
#   geom_line(data = big, aes(x = Counter, y = re2g), color = "red") +
#   geom_line(data = big, aes(x = Counter, y = ca), color = "blue") +
#   xlab('counter value') +
#   ylab('time [s]')

# Save the figure (TikZ export disabled) and show it in the report.
# make_tikz(paste0("figs/big_plot.tikz"), big_plot, width = 2.7, height = 2.7)
make_pdf("figs/big_plot.pdf", big_plot, width = BIG_SIZE, height = BIG_SIZE)
plot(big_plot)

# Information about DCAs
# df_dcas = read.csv2(params$file_dca,
# header=TRUE,
# sep="\t",
# dec=".",
# comment.char="",
# quote="",
# strip.white=TRUE,
# stringsAsFactors=FALSE)
#
# # sanitize
# df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0
#df_dcas
#tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
#compute_timeouts <- function(df, col) {
# tmp <- df[df[, col] == TIMEOUT_VAL,]
# tmp
#}
