#=========================================================
There were 48 warnings (use warnings() to see them)
# PREAMBLE
#=========================================================
# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(ggExtra)
library(tikzDevice)
library(Hmisc)
library(pastecs)
theme_set(theme_bw())
options(scipen=999) # turn-off scientific notation like 1e+48
# size of point for scatterplots
POINT_SIZE = 0.1
#POINT_SIZE = 1
# timeout
TIMEOUT = params$timeout
TIMEOUT_VAL = 1.05 * TIMEOUT
ERR_VAL = 1.3 * TIMEOUT
# saturate
#TIME_MIN = 0.01 # seconds
TIME_MIN = 0.1 # seconds
BIG_SIZE=3
SMALL_SIZE=2
# FUNCTIONS
# Load one semicolon-separated benchmark table (decimal comma) as a data frame.
#
# file: path of the CSV file to read.
# Returns the parsed data frame. Text columns stay character vectors (no
# factor conversion) and whitespace around unquoted fields is stripped.
read_file <- function(file) {
  csv_path <- paste0(file)
  read.csv2(
    csv_path,
    header = TRUE,
    sep = ";",
    dec = ",",
    quote = "\"",
    comment.char = "",  # '#' is ordinary data, not a comment marker
    strip.white = TRUE,
    allowEscapes = FALSE,
    stringsAsFactors = FALSE
  )
}
# Build a log-log scatter plot comparing two timing columns of `df`.
#
# df:      data frame holding the columns named by `xlab` and `ylab`.
# xlab:    name (string) of the column drawn on the x axis.
# ylab:    name (string) of the column drawn on the y axis.
# xstring: x-axis title; defaults to the column name.
# ystring: y-axis title; defaults to the column name.
#
# Reads the globals POINT_SIZE, TIME_MIN and TIMEOUT_VAL.
# Returns the ggplot object without drawing it.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  # aes_string() is deprecated in ggplot2; the columns are looked up by name
  # through the .data pronoun instead.
  pscat <- ggplot(df, aes(x = .data[[xlab]], y = .data[[ylab]])) +
    geom_point(size = POINT_SIZE) +
    # reference diagonal: points on it took the same time with both tools
    geom_abline(size = 0.1) +
    # dashed guides at the value substituted for timed-out runs
    geom_vline(size = 0.1, xintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_hline(size = 0.1, yintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_rug(alpha = 0.2) +
    scale_x_log10() +
    scale_y_log10() +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    # same visible range on both axes
    coord_fixed(xlim = c(TIME_MIN, TIMEOUT_VAL), ylim = c(TIME_MIN, TIMEOUT_VAL)) +
    labs(x = xstring, y = ystring)
  pscat
}
# Render `picture` into a TikZ file.
#
# file:    output path handed to tikz().
# picture: object drawn with plot().
# width, height: figure size passed on to tikz().
#
# Returns (invisibly) the value of dev.off().
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file = file, onefile = TRUE, width = width, height = height)
  # Close the device even if plot() fails, so a broken figure does not leave
  # an open device behind that captures later plots.
  closed <- FALSE
  on.exit(if (!closed) dev.off(), add = TRUE)
  plot(picture)
  garbage <- dev.off()
  closed <- TRUE
  invisible(garbage)
}
# Render `picture` into a PDF file.
#
# file:    output path handed to pdf().
# picture: object drawn with plot().
# width, height: figure size passed on to pdf().
#
# Returns (invisibly) the value of dev.off().
make_pdf <- function(file, picture, width=5, height=5) {
  pdf(file = file, width = width, height = height, onefile = TRUE)
  # Close the device even if plot() fails, so a broken figure does not leave
  # an open device behind that captures later plots.
  closed <- FALSE
  on.exit(if (!closed) dev.off(), add = TRUE)
  plot(picture)
  garbage <- dev.off()
  closed <- TRUE
  invisible(garbage)
}
df <- read_file(params$file_cmp)
orig_size <- nrow(df)
colnames(df)[colnames(df) == "dot.net"] <- "dotnet"
colnames(df)[names(df) == "dot.net.matches"] <- "dotnet.matches"
######################### SANITIZE ###############################
# clean the data
df_new <- df[!grepl("File not found", df$srm.matches),]
print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to generation of input text"))
[1] "Removing 0 lines due to generation of input text"
df <- df_new
tools.times <- c("re2g", "cad", "cam", "grep", "srm", "dotnet")
#tools.times <- c("re2g", "cad", "grep", "srm", "dotnet")
#tools.times <- c("re2g", "cad", "srm", "dotnet")
tools.matches <- c("re2g.matches", "cad.matches", "cam.matches", "grep.matches", "srm.matches", "dotnet.matches")
#tools.matches <- c("re2g.matches", "cad.matches", "grep.matches", "srm.matches", "dotnet.matches")
#tools.matches <- c("re2g.matches", "cad.matches", "srm.matches", "dotnet.matches")
# checking errors
errors.re2g <- nrow(df[grepl('ERR', df$re2g),])
errors.grep <- nrow(df[grepl('ERR', df$grep),])
errors.srm <- nrow(df[grepl('ERR', df$srm),])
errors.cad <- nrow(df[grepl('ERR', df$cad),])
errors.cam <- nrow(df[grepl('ERR', df$cam),])
errors.dotnet <- nrow(df[grepl('ERR', df$dotnet),])
#df <- df[!grepl('ERR', df$re2g),]
#df$re2g[df$re2g == 'ERR'] <- ERR_VAL
df[,tools.times][df[,tools.times] == 'ERR'] <- ERR_VAL
# change the type of columns other than the name
for (i in tools.times) {
df[,i] <- sub(",", ".", df[,i])
suppressWarnings(df[,i] <- as.numeric(df[,i]))
}
for (i in tools.matches) {
suppressWarnings(df[,i] <- as.integer(df[,i]))
}
df$src <- as.factor(df$src)
# get rid of extremal values
#df[,tools.times][df[,tools.times] > TIMEOUT] <- TIMEOUT_VAL
df[tools.times][is.na(df[tools.times])] <- TIMEOUT_VAL
#df[is.na(df)] <- TIMEOUT_VAL
#df[df == 0.00] <- TIME_MIN
df[,tools.times][df[,tools.times] < TIME_MIN] <- TIME_MIN
df <- df[!is.na(df$pattern),]
# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new
############################## COUNTING TIMEOUTS #######################
timeouts.re2g <- nrow(df[df$re2g == TIMEOUT_VAL,])
timeouts.cad <- nrow(df[df$cad == TIMEOUT_VAL,])
timeouts.cam <- nrow(df[df$cam == TIMEOUT_VAL,])
timeouts.grep <- nrow(df[df$grep == TIMEOUT_VAL,])
timeouts.srm <- nrow(df[df$srm == TIMEOUT_VAL,])
timeouts.dotnet <- nrow(df[df$dotnet == TIMEOUT_VAL,])
timeouts.re2.and.ca <- nrow(df[df$cad == TIMEOUT_VAL & df$re2g == TIMEOUT_VAL,])
# | **Timeouts grep** | `r timeouts.grep` |
These are results of the experiments for Counting Set Automata:
| File |
results-03-06-2020/table-ALL-processed.csv |
|
| Timeout |
600 s |
|
| TIMEOUT_VAL |
630 s |
|
| TIME_MIN |
0.1 |
|
| original size |
1764 |
|
| Benchmarks |
1740 |
|
| Timeouts CA |
0 |
|
| Timeouts CAM |
0 |
|
| Timeouts RE2 |
0 |
|
| Timeouts SRM |
12 |
|
| Timeouts grep |
10 |
|
| Timeouts .NET |
10 |
|
| Errors CA |
0 |
|
| Errors CAM |
0 |
|
| Errors RE2 |
74 |
(removed) |
| Errors SRM |
0 |
|
| Errors grep |
58 |
|
| Errors .NET |
0 |
|
df
Summary of benchmarks
df_benches <- data.frame(summary(df$src))
df_benches
Sanity checks
# Flag rows on which RE2 disagrees with any other tool about the match count.
# The flag is NA wherever a compared count is NA and no comparison is TRUE.
df$inconsistent <- df$re2g.matches != df$grep.matches | df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

#df$inconsistent <- df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

# Mismatch flags are only TRUE when both counts are present and differ.
df$grep.re2.mismatch <- !is.na(df$re2g.matches) & !is.na(df$grep.matches) & df$re2g.matches != df$grep.matches
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]

df$re2.ca.mismatch <- !is.na(df$re2g.matches) & !is.na(df$cad.matches) & df$re2g.matches != df$cad.matches
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]

# Drop exactly the rows reported as RE2/CA mismatches. The previous filter
# tested is.na(grep.matches) instead of is.na(cad.matches): a missing CA count
# produced an NA row index (an all-NA row), and a mismatching row with a
# missing grep count was kept.
df <- df[!df$re2.ca.mismatch, ]
| CA and RE2 mismatched |
24 |
(removed) |
| grep and RE2 mismatched |
363 |
|
RE2 and CA mismatches
df_re2_ca_mismatch
Scatter Plots
# Build the log-log scatter plot of two timing columns, save it as
# figs/<xlab>-vs-<ylab>.pdf and return the plot object.
#
# df:      data frame with the columns named by `xlab` and `ylab`.
# xlab, ylab:       column names (strings) for the x and y axes.
# xstring, ystring: axis titles; default to the column names.
# width, height:    size of the saved PDF; height defaults to width.
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab, width=4, height=width) {
  pic <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  # pdf() cannot create missing directories, so make sure figs/ exists.
  dir.create("figs", showWarnings = FALSE)
  #make_tikz(paste0("figs/", xlab, "-vs-", ylab, ".tikz"), pic, width, height)
  make_pdf(file.path("figs", paste0(xlab, "-vs-", ylab, ".pdf")), pic, width, height)
  pic
}
# grep plots only use rows where grep and CA report the same match count
# (or where one of the two counts is missing).
df_grep <- df[is.na(df$grep.matches) | is.na(df$cad.matches) | df$grep.matches == df$cad.matches,]

# CA against each competitor
plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]", width=BIG_SIZE)
plot2 <- plot.and.tikz(df_grep, "grep", "cad", xstring="grep [s]", ystring="CA [s]", width=SMALL_SIZE)
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]", width=SMALL_SIZE)
plot4 <- plot.and.tikz(df, "dotnet", "cad", xstring=".NET [s]", ystring="CA [s]", width=SMALL_SIZE)

# competitors against each other (axis titles default to the column names)
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dotnet", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dotnet", "grep")
plot10 <- plot.and.tikz(df, "dotnet", "srm")

# CAM against each competitor and against CA. plot13/plot14 draw the `cam`
# column on the y axis, so the axis title is "CAM [s]" (it used to say "CA [s]").
plot11 <- plot.and.tikz(df, "re2g", "cam", xstring="RE2 [s]", ystring="CAM [s]", width=BIG_SIZE)
plot12 <- plot.and.tikz(df_grep, "grep", "cam", xstring="grep [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot13 <- plot.and.tikz(df, "srm", "cam", xstring="SRM [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot14 <- plot.and.tikz(df, "dotnet", "cam", xstring=".NET [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot15 <- plot.and.tikz(df, "cad", "cam", xstring="CAD [s]", ystring="CAM [s]", width=BIG_SIZE)

#grid.arrange(plot1, plot3, ncol = 2)
#grid.arrange(plot4, plot5, ncol = 2)
#grid.arrange(plot7, plot10, ncol = 2)

# show the plots two per row
grid.arrange(plot1, plot2, ncol = 2)
grid.arrange(plot3, plot4, ncol = 2)
grid.arrange(plot5, plot6, ncol = 2)
grid.arrange(plot7, plot8, ncol = 2)
grid.arrange(plot9, plot10, ncol = 2)

grid.arrange(plot11, plot12, ncol = 2)
grid.arrange(plot13, plot14, ncol = 2)
grid.arrange(plot15, ncol = 2)

Histograms
hist1 <- ggplot(df, aes(x=re2g)) +
geom_histogram(color="blue", fill="lightblue") +
scale_y_log10()
hist2 <- ggplot(df, aes(x=cad)) +
geom_histogram(color="blue", fill="lightblue") +
scale_y_log10()
hist3 <- ggplot(df, aes(x=srm)) +
geom_histogram(color="blue", fill="lightblue") +
scale_y_log10()
#hist4 <- ggplot(df, aes(x=grep)) +
# geom_histogram(color="blue", fill="lightblue") +
# scale_y_log10()
hist5 <- ggplot(df, aes(x=dotnet)) +
geom_histogram(color="blue", fill="lightblue") +
scale_y_log10()
grid.arrange(hist1, hist2, ncol = 2)

grid.arrange(hist3, hist5, ncol = 2)

#grid.arrange(hist5, ncol = 2)
Finding winners
# Fastest time per benchmark: `min` ranges over every tool, `enemy.min` only
# over the competitors (grep, SRM, RE2, .NET), i.e. without CA and CAM.
df$min <- pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad, df$cam)
#df$min <-pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dotnet)
#df$min <-pmin(df$srm, df$re2g, df$dotnet, df$cad)
#df$enemy.min <- pmin(df$srm, df$re2g, df$dotnet)

# A tool wins a benchmark when its time equals the row minimum; a tie counts
# for every tool that reaches the minimum.
winners.grep <- nrow(df[df$min == df$grep,])
winners.re2 <- nrow(df[df$min == df$re2g,])
winners.ca <- nrow(df[df$min == df$cad,])
winners.cam <- nrow(df[df$min == df$cam,])
winners.srm <- nrow(df[df$min == df$srm,])
winners.dotnet <- nrow(df[df$min == df$dotnet,])

# Pairwise wins. The grep comparisons are counted on df_grep, and the mask must
# index the same data frame it was computed on: indexing `df` with a mask of
# df_grep's length recycles the mask over the extra rows and miscounts.
winners.ca.over.re2 <- nrow(df[df$cad <= df$re2g,])
winners.ca.over.grep <- nrow(df_grep[df_grep$cad <= df_grep$grep,])
winners.ca.over.srm <- nrow(df[df$cad <= df$srm,])
winners.ca.over.dotnet <- nrow(df[df$cad <= df$dotnet,])

winners.cam.over.re2 <- nrow(df[df$cam <= df$re2g,])
winners.cam.over.grep <- nrow(df_grep[df_grep$cam <= df_grep$grep,])
winners.cam.over.srm <- nrow(df[df$cam <= df$srm,])
winners.cam.over.dotnet <- nrow(df[df$cam <= df$dotnet,])
winners.cam.over.cad <- nrow(df[df$cam <= df$cad,])

# CA at least 10x faster
winners.10.ca.over.re2 <- nrow(df[10* df$cad <= df$re2g,])
winners.10.ca.over.grep <- nrow(df_grep[10* df_grep$cad <= df_grep$grep,])
winners.10.ca.over.srm <- nrow(df[10* df$cad <= df$srm,])
winners.10.ca.over.dotnet <- nrow(df[10* df$cad <= df$dotnet,])

# CA at least 100x faster
winners.100.ca.over.re2 <- nrow(df[100* df$cad <= df$re2g,])
winners.100.ca.over.grep <- nrow(df_grep[100* df_grep$cad <= df_grep$grep,])
winners.100.ca.over.srm <- nrow(df[100* df$cad <= df$srm,])
winners.100.ca.over.dotnet <- nrow(df[100* df$cad <= df$dotnet,])

# benchmarks that took a tool more than 10 seconds
longer.than.10.seconds.ca <- nrow(df[df$cad > 10,])
longer.than.10.seconds.re2 <- nrow(df[df$re2g > 10,])
longer.than.10.seconds.srm <- nrow(df[df$srm > 10,])
longer.than.10.seconds.dotnet <- nrow(df[df$dotnet > 10,])
longer.than.10.seconds.grep <- nrow(df[df$grep > 10,])
| CA |
15 |
| RE2 |
1220 |
| SRM |
31 |
| .NET |
79 |
| grep |
670 |
| RE2 |
263 / 1740 |
| SRM |
307 / 1740 |
| .NET |
685 / 1740 |
| grep |
856 / 1366 |
| RE2 |
266 / 1740 |
| SRM |
382 / 1740 |
| .NET |
739 / 1740 |
| grep |
875 / 1366 |
| cad |
1640 / 1740 |
| RE2 |
150 / 1740 |
| SRM |
163 / 1740 |
| .NET |
128 / 1740 |
| grep |
236 / 1366 |
| RE2 |
89 / 1740 |
| SRM |
104 / 1740 |
| .NET |
29 / 1740 |
| grep |
100 / 1366 |
| CA |
29 |
| RE2 |
155 |
| SRM |
163 |
| .NET |
144 |
| grep |
234 |
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")

How much better we are than RE2
# Speed-up of CA over RE2 on each benchmark, largest first.
df$re2.vs.ca <- df$re2g / df$cad
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]

#df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca", "re2.ca.mismatch")]
# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
df_sorted
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/best_results.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
Sorting according to total time
# Sum of the running times of grep, RE2, CA, SRM and .NET per benchmark.
# NOTE(review): `cam` is not part of the sum - confirm this is intended.
df$total.time <- df$grep + df$re2g + df$cad + df$srm + df$dotnet

df_sorted <- df[order(df$total.time, decreasing=TRUE),]
df_sorted

# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/total_time.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
Sorting according to enemy.min
# Benchmarks ordered by the time of the fastest competitor, slowest first.
df_sorted <- df[order(df$enemy.min, decreasing=TRUE),]
df_sorted

# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/enemy_min.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
Summaries
df_for_summary <- df[,c("re2g", "cad", "cam", "srm", "dotnet", "grep")]
#df.summary <- do.call(cbind, lapply(df_for_summary, summary))
#df.summary
desc <- stat.desc(df_for_summary)
desc
haf <- latex(desc,
file="figs/stats.tex",
booktabs=TRUE,
table.env=FALSE,
center="none")
Experiments with increasing counter value
big <- read_file(params$file_big)
big$re2g <- sub(",", ".", big$re2g)
big$re2g <- as.numeric(big$re2g)
NAs introduced by coercion
big
together <- big[, c("Counter", "re2g")]
names(together)[2] <- "time"
together$approach <- "RE2"
tmp <- big[, c("Counter", "cad")]
names(tmp)[2] <- "time"
tmp$approach <- "CA"
together <- rbind(together, tmp)
tmp <- big[, c("Counter", "srm")]
names(tmp)[2] <- "time"
tmp$approach <- "SRM"
together <- rbind(together, tmp)
tmp <- big[, c("Counter", "dot.net")]
names(tmp)[2] <- "time"
tmp$approach <- ".NET"
together <- rbind(together, tmp)
tmp <- big[, c("Counter", "grep")]
names(tmp)[2] <- "time"
tmp$approach <- "grep"
together <- rbind(together, tmp)
BIG_STEP=100
# remove too many points
together <- together[together$Counter %% BIG_STEP == 0,]
big_plot <- ggplot(data=together, aes(x=Counter, y=time, colour=approach)) +
geom_line() +
geom_point(aes(shape=approach)) +
xlim(NA,2000) +
ylim(NA,20) +
geom_hline(size=0.1, yintercept=0, linetype="dashed") +
theme(legend.position = c(.02, .98),
legend.justification = c("left", "top"),
#legend.box.background = element_rect(color="black", size=0.5),
legend.box.just = "right",
legend.margin = margin(1, 1, 1, 1),
legend.title = element_blank()) +
labs(
#title="Title",
#subtitle="Subtitle",
x="k",
y="time [s]")
# geom_line(data = big, aes(x = Counter, y = re2g), color = "red") +
# geom_line(data = big, aes(x = Counter, y = ca), color = "blue") +
# xlab('counter value') +
# ylab('time [s]')
#make_tikz(paste0("figs/big_plot.tikz"), big_plot, width=2.7, height=2.7)
make_pdf(paste0("figs/big_plot.pdf"), big_plot, width=BIG_SIZE, height=BIG_SIZE)
plot(big_plot)

# Information about DCAs
# df_dcas = read.csv2(params$file_dca,
# header=TRUE,
# sep="\t",
# dec=".",
# comment.char="",
# quote="",
# strip.white=TRUE,
# stringsAsFactors=FALSE)
#
# # sanitize
# df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0
#df_dcas
#tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
#compute_timeouts <- function(df, col) {
# tmp <- df[df[, col] == TIMEOUT_VAL,]
# tmp
#}
---
title: "Cnt-Set-Mata Analysis"
params:
  #file_cmp: data/results-23-04-2020.csv
  #file_cmp: results-05-05-2020/between/results.csv
  #file_cmp: results-09-05-2020/between/RESULTS-ALL-between.csv
  #file_cmp: results-10-05-2020/table-ALL.csv
  #file_cmp: results-11-05-2020/table-ALL-ondra-processed.csv
  #file_cmp: results-12-05-2020/table-ALL-ondra-processed.csv
  #file_cmp: results-15-05-2020/nogrep/cut/table-ALL-processed.csv
  #file_cmp: results-15-05-2020/nogrep/nocut/table-ALL-processed.csv
  #file_cmp: results-15-05-2020/nogrep/merged/table-ALL-processed.csv
  #file_cmp: results-15-05-2020/nogrep/mergedResults/table-ALL-processed.csv
  #file_cmp: results-23-05-2020/table-ALL-processed.csv
  #file_big: results-05-05-2020/table-big-25036102.csv
  #file_cmp: results-28-05-2020/table-ALL-processed.csv
  file_cmp: results-03-06-2020/table-ALL-processed.csv
  file_big: results-11-05-2020/graph/table-big-1065289704.csv
  file_dca: DCAs/results-translation.tsv
  timeout: 600 # seconds
output:
  html_notebook:
    code_folding: hide
  pdf_document: default
  html_document:
    df_print: paged
    toc: true
    toc_float: true
---

```{r}
#=========================================================
# PREAMBLE
#=========================================================

# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(ggExtra)
library(tikzDevice)
library(Hmisc)
library(pastecs)



theme_set(theme_bw())

options(scipen=999)  # turn-off scientific notation like 1e+48

# size of point for scatterplots
POINT_SIZE = 0.1
#POINT_SIZE = 1

# timeout
TIMEOUT = params$timeout
TIMEOUT_VAL = 1.05 * TIMEOUT

ERR_VAL = 1.3 * TIMEOUT

# saturate
#TIME_MIN = 0.01 # seconds
TIME_MIN = 0.1 # seconds

BIG_SIZE=3
SMALL_SIZE=2


# FUNCTIONS
# Load one semicolon-separated benchmark table (decimal comma) as a data frame.
#
# file: path of the CSV file to read.
# Returns the parsed data frame. Text columns stay character vectors (no
# factor conversion) and whitespace around unquoted fields is stripped.
read_file <- function(file) {
  csv_path <- paste0(file)
  read.csv2(
    csv_path,
    header = TRUE,
    sep = ";",
    dec = ",",
    quote = "\"",
    comment.char = "",  # '#' is ordinary data, not a comment marker
    strip.white = TRUE,
    allowEscapes = FALSE,
    stringsAsFactors = FALSE
  )
}

# Build a log-log scatter plot comparing two timing columns of `df`.
#
# df:      data frame holding the columns named by `xlab` and `ylab`.
# xlab:    name (string) of the column drawn on the x axis.
# ylab:    name (string) of the column drawn on the y axis.
# xstring: x-axis title; defaults to the column name.
# ystring: y-axis title; defaults to the column name.
#
# Reads the globals POINT_SIZE, TIME_MIN and TIMEOUT_VAL.
# Returns the ggplot object without drawing it.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  # aes_string() is deprecated in ggplot2; the columns are looked up by name
  # through the .data pronoun instead.
  pscat <- ggplot(df, aes(x = .data[[xlab]], y = .data[[ylab]])) +
    geom_point(size = POINT_SIZE) +
    # reference diagonal: points on it took the same time with both tools
    geom_abline(size = 0.1) +
    # dashed guides at the value substituted for timed-out runs
    geom_vline(size = 0.1, xintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_hline(size = 0.1, yintercept = TIMEOUT_VAL, linetype = "dashed") +
    geom_rug(alpha = 0.2) +
    scale_x_log10() +
    scale_y_log10() +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    # same visible range on both axes
    coord_fixed(xlim = c(TIME_MIN, TIMEOUT_VAL), ylim = c(TIME_MIN, TIMEOUT_VAL)) +
    labs(x = xstring, y = ystring)
  pscat
}

# Render `picture` into a TikZ file.
#
# file:    output path handed to tikz().
# picture: object drawn with plot().
# width, height: figure size passed on to tikz().
#
# Returns (invisibly) the value of dev.off().
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file = file, onefile = TRUE, width = width, height = height)
  # Close the device even if plot() fails, so a broken figure does not leave
  # an open device behind that captures later plots.
  closed <- FALSE
  on.exit(if (!closed) dev.off(), add = TRUE)
  plot(picture)
  garbage <- dev.off()
  closed <- TRUE
  invisible(garbage)
}

# Render `picture` into a PDF file.
#
# file:    output path handed to pdf().
# picture: object drawn with plot().
# width, height: figure size passed on to pdf().
#
# Returns (invisibly) the value of dev.off().
make_pdf <- function(file, picture, width=5, height=5) {
  pdf(file = file, width = width, height = height, onefile = TRUE)
  # Close the device even if plot() fails, so a broken figure does not leave
  # an open device behind that captures later plots.
  closed <- FALSE
  on.exit(if (!closed) dev.off(), add = TRUE)
  plot(picture)
  garbage <- dev.off()
  closed <- TRUE
  invisible(garbage)
}
```

```{r}
# ---- Load the comparison table ----
df <- read_file(params$file_cmp)
orig_size <- nrow(df)

# The .NET columns arrive as "dot.net" / "dot.net.matches"; drop the inner dot.
names(df)[names(df) == "dot.net"] <- "dotnet"
names(df)[names(df) == "dot.net.matches"] <- "dotnet.matches"

# ---- Sanitize ----
# Discard rows whose SRM match count reports "File not found".
df_new <- df[!grepl("File not found", df$srm.matches), ]
print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to generation of input text"))
df <- df_new

# Columns holding running times and match counts of the compared tools.
tools.times <- c("re2g", "cad", "cam", "grep", "srm", "dotnet")
#tools.times <- c("re2g", "cad", "grep", "srm", "dotnet")
#tools.times <- c("re2g", "cad", "srm", "dotnet")
tools.matches <- c("re2g.matches", "cad.matches", "cam.matches", "grep.matches", "srm.matches", "dotnet.matches")
#tools.matches <- c("re2g.matches", "cad.matches", "grep.matches", "srm.matches", "dotnet.matches")
#tools.matches <- c("re2g.matches", "cad.matches", "srm.matches", "dotnet.matches")

# Count the runs whose time field mentions "ERR" (the columns are still raw
# text at this point).
errors.re2g <- sum(grepl("ERR", df$re2g))
errors.grep <- sum(grepl("ERR", df$grep))
errors.srm <- sum(grepl("ERR", df$srm))
errors.cad <- sum(grepl("ERR", df$cad))
errors.cam <- sum(grepl("ERR", df$cam))
errors.dotnet <- sum(grepl("ERR", df$dotnet))

#df <- df[!grepl('ERR', df$re2g),]
#df$re2g[df$re2g == 'ERR'] <- ERR_VAL
# Runs recorded exactly as "ERR" get the ERR_VAL sentinel.
df[, tools.times][df[, tools.times] == "ERR"] <- ERR_VAL

# Turn the time columns into numbers (decimal comma -> decimal point);
# anything unparsable becomes NA without a warning.
for (i in tools.times) {
  df[[i]] <- suppressWarnings(as.numeric(sub(",", ".", df[[i]])))
}

# Match counts become integers; unparsable entries become NA.
for (i in tools.matches) {
  df[[i]] <- suppressWarnings(as.integer(df[[i]]))
}

df$src <- as.factor(df$src)

# Clamp the times: missing values count as timeouts, tiny values are raised
# to TIME_MIN.
#df[,tools.times][df[,tools.times] > TIMEOUT] <- TIMEOUT_VAL
df[tools.times][is.na(df[tools.times])] <- TIMEOUT_VAL
#df[is.na(df)] <- TIMEOUT_VAL
#df[df == 0.00] <- TIME_MIN
df[, tools.times][df[, tools.times] < TIME_MIN] <- TIME_MIN

# Rows without a pattern are dropped.
df <- df[!is.na(df$pattern), ]

# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new

# ---- Counting timeouts ----
# A run is a timeout when its time equals the TIMEOUT_VAL sentinel set above.
timeouts.re2g <- sum(df$re2g == TIMEOUT_VAL)
timeouts.cad <- sum(df$cad == TIMEOUT_VAL)
timeouts.cam <- sum(df$cam == TIMEOUT_VAL)
timeouts.grep <- sum(df$grep == TIMEOUT_VAL)
timeouts.srm <- sum(df$srm == TIMEOUT_VAL)
timeouts.dotnet <- sum(df$dotnet == TIMEOUT_VAL)
timeouts.re2.and.ca <- sum(df$cad == TIMEOUT_VAL & df$re2g == TIMEOUT_VAL)

# | **Timeouts grep**    | `r timeouts.grep`  |
```

These are results of the experiments for Counting Set Automata:

|                     |                     | |
|---------------------|--------------------:|-|
| **File**            | `r params$file_cmp` |
| **Timeout**         | `r TIMEOUT` s       |
| **TIMEOUT_VAL**     | `r TIMEOUT_VAL` s   |
| **TIME_MIN**        | `r TIME_MIN`        |
| **original size**   | `r orig_size`       |
| **Benchmarks**      | `r nrow(df)`        |
| **Timeouts CA**     | `r timeouts.cad`     |
| **Timeouts CAM**     | `r timeouts.cam`     |
| **Timeouts RE2**    | `r timeouts.re2g`    |
| **Timeouts SRM**    | `r timeouts.srm`    | 
| **Timeouts grep**   | `r timeouts.grep`    | 
| **Timeouts .NET**   | `r timeouts.dotnet`  |
| **Errors CA**     | `r errors.cad`     |
| **Errors CAM**     | `r errors.cam`     |
| **Errors RE2**    | `r errors.re2g`    | (removed) |
| **Errors SRM**    | `r errors.srm`    | 
| **Errors grep**   | `r errors.grep`    | 
| **Errors .NET**   | `r errors.dotnet`  |



```{r}
df
```

# Summary of benchmarks

```{r}
df_benches <- data.frame(summary(df$src))
df_benches
```

# Sanity checks

```{r}
# Flag rows on which RE2 disagrees with any other tool about the match count.
# The flag is NA wherever a compared count is NA and no comparison is TRUE.
df$inconsistent <- df$re2g.matches != df$grep.matches | df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

#df$inconsistent <- df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

# Mismatch flags are only TRUE when both counts are present and differ.
df$grep.re2.mismatch <- !is.na(df$re2g.matches) & !is.na(df$grep.matches) & df$re2g.matches != df$grep.matches
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]

df$re2.ca.mismatch <- !is.na(df$re2g.matches) & !is.na(df$cad.matches) & df$re2g.matches != df$cad.matches
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]

# Drop exactly the rows reported as RE2/CA mismatches. The previous filter
# tested is.na(grep.matches) instead of is.na(cad.matches): a missing CA count
# produced an NA row index (an all-NA row), and a mismatching row with a
# missing grep count was kept.
df <- df[!df$re2.ca.mismatch, ]

```

|                             |                                |         |
|-----------------------------|-------------------------------:| -------:|
| **CA and RE2 mismatched**   | `r nrow(df_re2_ca_mismatch)`   | (removed) |
| **grep and RE2 mismatched** | `r nrow(df_grep_re2_mismatch)` |


## RE2 and CA mismatches
```{r}
df_re2_ca_mismatch
```



# Scatter Plots

```{r}

# Build the log-log scatter plot of two timing columns, save it as
# figs/<xlab>-vs-<ylab>.pdf and return the plot object.
#
# df:      data frame with the columns named by `xlab` and `ylab`.
# xlab, ylab:       column names (strings) for the x and y axes.
# xstring, ystring: axis titles; default to the column names.
# width, height:    size of the saved PDF; height defaults to width.
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab, width=4, height=width) {
  pic <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  # pdf() cannot create missing directories, so make sure figs/ exists.
  dir.create("figs", showWarnings = FALSE)
  #make_tikz(paste0("figs/", xlab, "-vs-", ylab, ".tikz"), pic, width, height)
  make_pdf(file.path("figs", paste0(xlab, "-vs-", ylab, ".pdf")), pic, width, height)
  pic
}

# grep plots only use rows where grep and CA report the same match count
# (or where one of the two counts is missing).
df_grep <- df[is.na(df$grep.matches) | is.na(df$cad.matches) | df$grep.matches == df$cad.matches,]

# CA against each competitor
plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]", width=BIG_SIZE)
plot2 <- plot.and.tikz(df_grep, "grep", "cad", xstring="grep [s]", ystring="CA [s]", width=SMALL_SIZE)
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]", width=SMALL_SIZE)
plot4 <- plot.and.tikz(df, "dotnet", "cad", xstring=".NET [s]", ystring="CA [s]", width=SMALL_SIZE)

# competitors against each other (axis titles default to the column names)
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dotnet", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dotnet", "grep")
plot10 <- plot.and.tikz(df, "dotnet", "srm")

# CAM against each competitor and against CA. plot13/plot14 draw the `cam`
# column on the y axis, so the axis title is "CAM [s]" (it used to say "CA [s]").
plot11 <- plot.and.tikz(df, "re2g", "cam", xstring="RE2 [s]", ystring="CAM [s]", width=BIG_SIZE)
plot12 <- plot.and.tikz(df_grep, "grep", "cam", xstring="grep [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot13 <- plot.and.tikz(df, "srm", "cam", xstring="SRM [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot14 <- plot.and.tikz(df, "dotnet", "cam", xstring=".NET [s]", ystring="CAM [s]", width=SMALL_SIZE)
plot15 <- plot.and.tikz(df, "cad", "cam", xstring="CAD [s]", ystring="CAM [s]", width=BIG_SIZE)


#grid.arrange(plot1, plot3, ncol = 2)
#grid.arrange(plot4, plot5, ncol = 2)
#grid.arrange(plot7, plot10, ncol = 2)

# show the plots two per row
grid.arrange(plot1, plot2, ncol = 2)
grid.arrange(plot3, plot4, ncol = 2)
grid.arrange(plot5, plot6, ncol = 2)
grid.arrange(plot7, plot8, ncol = 2)
grid.arrange(plot9, plot10, ncol = 2)

grid.arrange(plot11, plot12, ncol = 2)
grid.arrange(plot13, plot14, ncol = 2)
grid.arrange(plot15, ncol = 2)
```

# Histograms

```{r}
# Histograms of the running times, one per tool, with a log-scaled count axis.

# RE2
hist1 <- ggplot(data = df, mapping = aes(x = re2g)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  scale_y_log10()

# CA
hist2 <- ggplot(data = df, mapping = aes(x = cad)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  scale_y_log10()

# SRM
hist3 <- ggplot(data = df, mapping = aes(x = srm)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  scale_y_log10()

# grep (disabled)
#hist4 <- ggplot(df, aes(x=grep)) +
#  geom_histogram(color="blue", fill="lightblue") +
#  scale_y_log10()

# .NET
hist5 <- ggplot(data = df, mapping = aes(x = dotnet)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  scale_y_log10()

# two histograms per row
grid.arrange(hist1, hist2, ncol = 2)
grid.arrange(hist3, hist5, ncol = 2)
#grid.arrange(hist5, ncol = 2)


```

# Finding winners

```{r}
# Fastest time per benchmark: `min` ranges over every tool, `enemy.min` only
# over the competitors (grep, SRM, RE2, .NET), i.e. without CA and CAM.
df$min <- pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad, df$cam)
#df$min <-pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dotnet)
#df$min <-pmin(df$srm, df$re2g, df$dotnet, df$cad)
#df$enemy.min <- pmin(df$srm, df$re2g, df$dotnet)

# A tool wins a benchmark when its time equals the row minimum; a tie counts
# for every tool that reaches the minimum.
winners.grep <- nrow(df[df$min == df$grep,])
winners.re2 <- nrow(df[df$min == df$re2g,])
winners.ca <- nrow(df[df$min == df$cad,])
winners.cam <- nrow(df[df$min == df$cam,])
winners.srm <- nrow(df[df$min == df$srm,])
winners.dotnet <- nrow(df[df$min == df$dotnet,])

# Pairwise wins. The grep comparisons are counted on df_grep, and the mask must
# index the same data frame it was computed on: indexing `df` with a mask of
# df_grep's length recycles the mask over the extra rows and miscounts.
winners.ca.over.re2 <- nrow(df[df$cad <= df$re2g,])
winners.ca.over.grep <- nrow(df_grep[df_grep$cad <= df_grep$grep,])
winners.ca.over.srm <- nrow(df[df$cad <= df$srm,])
winners.ca.over.dotnet <- nrow(df[df$cad <= df$dotnet,])

winners.cam.over.re2 <- nrow(df[df$cam <= df$re2g,])
winners.cam.over.grep <- nrow(df_grep[df_grep$cam <= df_grep$grep,])
winners.cam.over.srm <- nrow(df[df$cam <= df$srm,])
winners.cam.over.dotnet <- nrow(df[df$cam <= df$dotnet,])
winners.cam.over.cad <- nrow(df[df$cam <= df$cad,])

# CA at least 10x faster
winners.10.ca.over.re2 <- nrow(df[10* df$cad <= df$re2g,])
winners.10.ca.over.grep <- nrow(df_grep[10* df_grep$cad <= df_grep$grep,])
winners.10.ca.over.srm <- nrow(df[10* df$cad <= df$srm,])
winners.10.ca.over.dotnet <- nrow(df[10* df$cad <= df$dotnet,])

# CA at least 100x faster
winners.100.ca.over.re2 <- nrow(df[100* df$cad <= df$re2g,])
winners.100.ca.over.grep <- nrow(df_grep[100* df_grep$cad <= df_grep$grep,])
winners.100.ca.over.srm <- nrow(df[100* df$cad <= df$srm,])
winners.100.ca.over.dotnet <- nrow(df[100* df$cad <= df$dotnet,])

# benchmarks that took a tool more than 10 seconds
longer.than.10.seconds.ca <- nrow(df[df$cad > 10,])
longer.than.10.seconds.re2 <- nrow(df[df$re2g > 10,])
longer.than.10.seconds.srm <- nrow(df[df$srm > 10,])
longer.than.10.seconds.dotnet <- nrow(df[df$dotnet > 10,])
longer.than.10.seconds.grep <- nrow(df[df$grep > 10,])


```

| **Winner**          |                     |
|---------------------|--------------------:|
| **CA**     | `r winners.ca`     |
| **CAM**    | `r winners.cam`    |
| **RE2**    | `r winners.re2`    |
| **SRM**    | `r winners.srm`    | 
| **.NET**    | `r winners.dotnet`  |
| **grep**    | `r winners.grep`  |

| **Wins of CA over**          |                     |
|---------------------|--------------------:|
| **RE2**    | `r winners.ca.over.re2` / `r nrow(df)`   |
| **SRM**    | `r winners.ca.over.srm` / `r nrow(df)`    | 
| **.NET**    | `r winners.ca.over.dotnet` / `r nrow(df)`  |
| **grep**    | `r winners.ca.over.grep` / `r nrow(df_grep)`  |

| **Wins of CAM over**          |                     |
|---------------------|--------------------:|
| **RE2**    | `r winners.cam.over.re2` / `r nrow(df)`   |
| **SRM**    | `r winners.cam.over.srm` / `r nrow(df)`    | 
| **.NET**    | `r winners.cam.over.dotnet` / `r nrow(df)`  |
| **grep**    | `r winners.cam.over.grep` / `r nrow(df_grep)`  |
| **cad**    | `r winners.cam.over.cad` / `r nrow(df)`  |

| **Wins of at least 10 times of CA over**          |                     |
|---------------------|--------------------:|
| **RE2**    | `r winners.10.ca.over.re2` / `r nrow(df)`   |
| **SRM**    | `r winners.10.ca.over.srm` / `r nrow(df)`    | 
| **.NET**    | `r winners.10.ca.over.dotnet` / `r nrow(df)`  |
| **grep**    | `r winners.10.ca.over.grep` / `r nrow(df_grep)`  |

| **Wins of at least 100 times of CA over**          |                     |
|---------------------|--------------------:|
| **RE2**    | `r winners.100.ca.over.re2` / `r nrow(df)`   |
| **SRM**    | `r winners.100.ca.over.srm` / `r nrow(df)`    | 
| **.NET**    | `r winners.100.ca.over.dotnet` / `r nrow(df)`  |
| **grep**    | `r winners.100.ca.over.grep` / `r nrow(df_grep)`  |

| **Longer than 10 s**          |                     |
|---------------------|--------------------:|
| **CA**     | `r longer.than.10.seconds.ca`     |
| **RE2**    | `r longer.than.10.seconds.re2`    |
| **SRM**    | `r longer.than.10.seconds.srm`    | 
| **.NET**    | `r longer.than.10.seconds.dotnet`  |
| **grep**    | `r longer.than.10.seconds.grep`  |



```{r}
# Scatter plot of CA against the fastest competitor on each benchmark
# (enemy.min is the row-wise minimum of grep, SRM, RE2 and .NET); the figure
# is also saved under figs/ by plot.and.tikz.
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")
```

# How much better we are than RE2

```{r}
# Speed-up of CA over RE2 on each benchmark, largest first.
df$re2.vs.ca <- df$re2g / df$cad
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]

#df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca", "re2.ca.mismatch")]
# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
df_sorted
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/best_results.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
```

# Hardest cases for various tools
```{r}
# Pre-sort by every tool's time in turn (descending). Each later ordering then
# starts from the result of the previous one.
df.pre.sort <- Reduce(
  function(acc, col) acc[order(acc[[col]], decreasing = TRUE), ],
  c("re2g", "cad", "grep", "srm", "dotnet"),
  df
)

# One table per tool, slowest benchmarks of that tool first.
df.re2.hard <- df.pre.sort[order(df.pre.sort[["re2g"]], decreasing = TRUE), ]
df.ca.hard <- df.pre.sort[order(df.pre.sort[["cad"]], decreasing = TRUE), ]
df.grep.hard <- df.pre.sort[order(df.pre.sort[["grep"]], decreasing = TRUE), ]
df.srm.hard <- df.pre.sort[order(df.pre.sort[["srm"]], decreasing = TRUE), ]
df.dotnet.hard <- df.pre.sort[order(df.pre.sort[["dotnet"]], decreasing = TRUE), ]

# print the tables
df.re2.hard
df.ca.hard
df.grep.hard
df.srm.hard
df.dotnet.hard

```

# Sorting according to total time
```{r}
# Sum of the running times of grep, RE2, CA, SRM and .NET per benchmark.
# NOTE(review): `cam` is not part of the sum - confirm this is intended.
df$total.time <- df$grep + df$re2g + df$cad + df$srm + df$dotnet

df_sorted <- df[order(df$total.time, decreasing=TRUE),]
df_sorted

# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/total_time.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
```
# Sorting according to enemy.min
```{r}
# Benchmarks ordered by the time of the fastest competitor, slowest first.
df_sorted <- df[order(df$enemy.min, decreasing=TRUE),]
df_sorted

# head() keeps at most 10 rows; `[1:10, ]` would pad the table with all-NA
# rows if fewer than 10 benchmarks were left.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/enemy_min.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")
```

# Summaries

```{r}
# Descriptive statistics of the running-time columns, shown in the notebook
# and exported as a LaTeX table.
df_for_summary <- df[c("re2g", "cad", "cam", "srm", "dotnet", "grep")]
#df.summary <- do.call(cbind, lapply(df_for_summary, summary))
#df.summary

desc <- stat.desc(df_for_summary)
desc
haf <- latex(
  desc,
  file = "figs/stats.tex",
  booktabs = TRUE,
  table.env = FALSE,
  center = "none"
)

```

# Experiments with increasing counter value

```{r}
# Load the increasing-counter experiment. The re2g column is converted to
# numbers after replacing the decimal comma; entries that are not numbers
# become NA (with a coercion warning).
big <- read_file(params$file_big)
big$re2g <- as.numeric(sub(",", ".", big$re2g))

big
```


```{r}

# Reshape `big` into long form: one (Counter, time, approach) row per tool and
# counter value, stacked in the order RE2, CA, SRM, .NET, grep.
together <- do.call(
  rbind,
  lapply(
    list(
      c("re2g", "RE2"),
      c("cad", "CA"),
      c("srm", "SRM"),
      c("dot.net", ".NET"),
      c("grep", "grep")
    ),
    function(spec) {
      part <- big[, c("Counter", spec[1])]
      names(part)[2] <- "time"
      part$approach <- spec[2]
      part
    }
  )
)

BIG_STEP=100

# thin out the points: keep only counters that are multiples of BIG_STEP
together <- together[together$Counter %% BIG_STEP == 0, ]

# Time against counter value, one line per approach; the legend sits inside
# the panel at the top-left corner.
big_plot <- ggplot(data = together, aes(x = Counter, y = time, colour = approach)) +
  geom_line() +
  geom_point(aes(shape = approach)) +
  xlim(NA, 2000) +
  ylim(NA, 20) +
  geom_hline(size = 0.1, yintercept = 0, linetype = "dashed") +
  theme(
    legend.position = c(.02, .98),
    legend.justification = c("left", "top"),
    #legend.box.background = element_rect(color="black", size=0.5),
    legend.box.just = "right",
    legend.margin = margin(1, 1, 1, 1),
    legend.title = element_blank()
  ) +
  labs(x = "k", y = "time [s]")

#make_tikz(paste0("figs/big_plot.tikz"), big_plot, width=2.7, height=2.7)
make_pdf("figs/big_plot.pdf", big_plot, width = BIG_SIZE, height = BIG_SIZE)

plot(big_plot)
```


```{r}
# Information about DCAs

# df_dcas = read.csv2(params$file_dca,
#                   header=TRUE,
#                   sep="\t",
#                   dec=".",
#                   comment.char="",
#                   quote="",
#                   strip.white=TRUE,
#                   stringsAsFactors=FALSE)
# 
# # sanitize
# df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0

#df_dcas
```

```{r}
#tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
```



```{r}
#compute_timeouts <- function(df, col) {
#  tmp <- df[df[, col] == TIMEOUT_VAL,]
#  tmp
#}

```

