#=========================================================
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
# PREAMBLE
#=========================================================

# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(ggExtra)
library(tikzDevice)
library(Hmisc)
library(pastecs)



# use the black-and-white ggplot2 theme for every plot of the report
theme_set(theme_bw())

# print numbers in fixed notation instead of scientific notation (1e+48)
options(scipen = 999)

# size of the points in the scatter plots
POINT_SIZE <- 0.1
# POINT_SIZE <- 1

# timeout taken from the report parameters; times above TIMEOUT (and
# missing times) are later replaced by TIMEOUT_VAL
TIMEOUT <- params$timeout
TIMEOUT_VAL <- 1.05 * TIMEOUT

# lower bound (in seconds) to which smaller times are raised
# TIME_MIN <- 0.01
TIME_MIN <- 0.1

# widths passed to the PDF export of the figures
BIG_SIZE <- 3
SMALL_SIZE <- 2


# FUNCTIONS
# Read a semicolon-separated table with decimal commas into a data frame.
#
# file: path of the CSV file to read.
#
# Returns the data frame. Surrounding white space of unquoted fields is
# stripped, "#" is not treated as a comment character, and strings are
# kept as character vectors (not factors).
read_file <- function(file) {
  read.csv2(
    as.character(file),
    header = TRUE,
    sep = ";",
    dec = ",",
    comment.char = "",
    quote = "\"",
    strip.white = TRUE,
    allowEscapes = FALSE,
    stringsAsFactors = FALSE
  )
}

# Build a log-log scatter plot comparing two columns of a data frame.
#
# df:      data frame holding the two columns
# xlab:    name of the column drawn on the x axis
# ylab:    name of the column drawn on the y axis
# xstring: label of the x axis (defaults to the column name)
# ystring: label of the y axis (defaults to the column name)
#
# Returns the ggplot object. Both axes are restricted to
# [TIME_MIN, TIMEOUT_VAL] and dashed lines are drawn at TIMEOUT_VAL.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  axis_range <- c(TIME_MIN, TIMEOUT_VAL)

  # the data points
  pscat <- ggplot(df, aes_string(x = xlab, y = ylab))
  pscat <- pscat + geom_point(size = POINT_SIZE)

  # reference lines: geom_abline() with its default slope/intercept, and
  # dashed lines at the value used for timed-out runs
  pscat <- pscat + geom_abline(size = 0.1)
  pscat <- pscat +
    geom_vline(size = 0.1, xintercept = TIMEOUT_VAL, linetype = "dashed")
  pscat <- pscat +
    geom_hline(size = 0.1, yintercept = TIMEOUT_VAL, linetype = "dashed")

  # marginal rugs
  pscat <- pscat + geom_rug(alpha = 0.2)

  # logarithmic axes with a fixed aspect ratio and a common range
  pscat <- pscat + scale_x_log10() + scale_y_log10()
  pscat <- pscat + theme(axis.text.y = element_text(angle = 90, hjust = 0.5))
  pscat <- pscat + coord_fixed(xlim = axis_range, ylim = axis_range)

  pscat + labs(x = xstring, y = ystring)
}

# Render a plot into a TikZ file.
#
# file:    path of the output file
# picture: object to draw; it is passed to plot()
# width:   width handed to tikz()
# height:  height handed to tikz()
#
# Returns (invisibly) the value of dev.off(). The device opened here is
# closed even when plotting fails, so a failing plot no longer leaves a
# dangling graphics device behind.
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file = file, onefile = TRUE, width = width, height = height)
  device <- dev.cur()
  # safety net: only fires if plot() below raised an error
  on.exit(if (device %in% dev.list()) dev.off(which = device), add = TRUE)
  plot(picture)
  invisible(dev.off(which = device))
}

# Render a plot into a PDF file.
#
# file:    path of the output file
# picture: object to draw; it is passed to plot()
# width:   width handed to pdf()
# height:  height handed to pdf()
#
# Returns (invisibly) the value of dev.off(). The device opened here is
# closed even when plotting fails, so a failing plot no longer leaves a
# dangling graphics device behind.
make_pdf <- function(file, picture, width=5, height=5) {
  pdf(file = file, width = width, height = height, onefile = TRUE)
  device <- dev.cur()
  # safety net: only fires if plot() below raised an error
  on.exit(if (device %in% dev.list()) dev.off(which = device), add = TRUE)
  plot(picture)
  invisible(dev.off(which = device))
}
# load the comparison table and remember how many rows it had originally
df <- read_file(params$file_cmp)
orig_size <- dim(df)[1]

# use dot-free column names for the .NET results
names(df)[names(df) == "dot.net"] <- "dotnet"
names(df)[names(df) == "dot.net.matches"] <- "dotnet.matches"

######################### SANITIZE ###############################
# drop the rows whose SRM match column reports "File not found"
input_missing <- grepl("File not found", df$srm.matches)
df_new <- df[!input_missing, ]
print(paste0("Removing ", sum(input_missing), " lines due to generation of input text"))
[1] "Removing 2682 lines due to generation of input text"
df <- df_new

# timing columns of the compared tools; each tool also has a
# "<tool>.matches" column with its match count
# (remove "grep" from tools.times to analyse the variant without grep)
tools.times <- c("re2g", "cad", "grep", "srm", "dotnet")
tools.matches <- paste0(tools.times, ".matches")

# checking errors: rows whose timing cell contains "ERR"
re2g_failed <- grepl("ERR", df$re2g)
errors.re2g <- sum(re2g_failed)
errors.grep <- sum(grepl("ERR", df$grep))
errors.srm <- sum(grepl("ERR", df$srm))
errors.cad <- sum(grepl("ERR", df$cad))
errors.dotnet <- sum(grepl("ERR", df$dotnet))

# only the RE2 errors are removed from the data
df <- df[!re2g_failed, ]


# convert the timing columns to numbers: decimal commas become points,
# cells that are not numbers become NA (the coercion warning is muted)
df[tools.times] <- lapply(df[tools.times], function(column) {
  suppressWarnings(as.numeric(sub(",", ".", column)))
})

# convert the match counts to integers in the same way
df[tools.matches] <- lapply(df[tools.matches], function(column) {
  suppressWarnings(as.integer(column))
})

df$src <- as.factor(df$src)

# get rid of extremal values: times above the timeout and missing times
# are set to TIMEOUT_VAL, times below TIME_MIN are raised to TIME_MIN
df[tools.times] <- lapply(df[tools.times], function(seconds) {
  seconds[seconds > TIMEOUT] <- TIMEOUT_VAL
  seconds[is.na(seconds)] <- TIMEOUT_VAL
  seconds[seconds < TIME_MIN] <- TIME_MIN
  seconds
})



# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new



############################## COUNTING TIMEOUTS #######################
# a run counts as a timeout when its time was set to TIMEOUT_VAL above;
# the timing columns contain no NA at this point, so the masks are NA-free
re2g_timed_out <- df$re2g == TIMEOUT_VAL
cad_timed_out <- df$cad == TIMEOUT_VAL

timeouts.re2g <- sum(re2g_timed_out)
timeouts.cad <- sum(cad_timed_out)
timeouts.grep <- sum(df$grep == TIMEOUT_VAL)
timeouts.srm <- sum(df$srm == TIMEOUT_VAL)
timeouts.dotnet <- sum(df$dotnet == TIMEOUT_VAL)
timeouts.re2.and.ca <- sum(cad_timed_out & re2g_timed_out)

# | **Timeouts grep**    | `r timeouts.grep`  |

These are results of the experiments for Counting Set Automata:

File results-15-05-2020/nogrep/mergedResults/table-ALL-processed.csv
Timeout 600 s
TIMEOUT_VAL 630 s
TIME_MIN 0.1
original size 4405
Benchmarks 1644
Timeouts CA 0
Timeouts RE2 1
Timeouts SRM 9
Timeouts grep 47
Timeouts .NET 2
Errors CA 0
Errors RE2 60 (removed)
Errors SRM 0
Errors grep 46
Errors .NET 0
# print the sanitized benchmark table
df

Summary of benchmarks

# number of rows per level of the factor df$src, as a one-column data frame
df_benches <- data.frame(summary(df$src))
# print the table
df_benches

Sanity checks

# TRUE when any tool reports a match count different from RE2
# (NA when a count involved in the comparison is missing)
df$inconsistent <- df$re2g.matches != df$grep.matches | df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

#df$inconsistent <- df$re2g.matches != df$srm.matches | df$re2g.matches != df$dotnet.matches | df$re2g.matches != df$cad.matches

# rows where grep and RE2 both report a match count and the counts differ
df$grep.re2.mismatch <- !is.na(df$re2g.matches) & !is.na(df$grep.matches) & df$re2g.matches != df$grep.matches
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]

# rows where CA and RE2 both report a match count and the counts differ
df$re2.ca.mismatch <- !is.na(df$re2g.matches) & !is.na(df$cad.matches) & df$re2g.matches != df$cad.matches
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]

# Remove the RE2/CA mismatches. The flag above is never NA, so negating it
# keeps exactly the rows where a count is missing or both counts agree.
# (The previous filter tested is.na(df$grep.matches) instead of the CA
# count: mismatching rows with a missing grep count were kept, and a
# missing CA count produced an NA index, i.e. an all-NA row.)
df <- df[!df$re2.ca.mismatch, ]
CA and RE2 mismatched 19 (removed)
grep and RE2 mismatched 338

RE2 and CA mismatches

df_re2_ca_mismatch

Scatter Plots


# Create the log-log scatter plot of two columns and export it as a PDF.
#
# df:      data frame holding the two columns
# xlab:    column drawn on the x axis (also used in the file name)
# ylab:    column drawn on the y axis (also used in the file name)
# xstring: label of the x axis (defaults to xlab)
# ystring: label of the y axis (defaults to ylab)
# width:   width of the exported figure
# height:  height of the exported figure (defaults to width)
#
# Writes figs/<xlab>-vs-<ylab>.pdf and returns the plot object.
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab, width=4, height=width) {
  scatter <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  # TikZ export is disabled; make_tikz() takes the same arguments
  pdf_path <- paste0("figs/", xlab, "-vs-", ylab, ".pdf")
  make_pdf(pdf_path, scatter, width, height)
  scatter
}

# benchmarks on which the grep and CA match counts agree, or on which at
# least one of the two counts is missing
df_grep <- df[is.na(df$grep.matches) | is.na(df$cad.matches) | df$grep.matches == df$cad.matches,]

# Pairwise comparisons of the tools; every call also writes the PDF
# figs/<x>-vs-<y>.pdf. Plots 1-4 compare CA against the other tools with
# explicit axis labels and sizes, plots 5-10 use the column names as labels
# and the default size.
# NOTE(review): only plot2 uses df_grep; plots 6, 8 and 9 also involve grep
# but use the unfiltered df -- confirm this is intended.
plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]", width=BIG_SIZE)
plot2 <- plot.and.tikz(df_grep, "grep", "cad", xstring="grep [s]", ystring="CA [s]", width=SMALL_SIZE)
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]", width=SMALL_SIZE)
plot4 <- plot.and.tikz(df, "dotnet", "cad", xstring=".NET [s]", ystring="CA [s]", width=SMALL_SIZE)
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dotnet", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dotnet", "grep")
plot10 <- plot.and.tikz(df, "dotnet", "srm")



# alternative pairings, currently disabled
#grid.arrange(plot1, plot3, ncol = 2)
#grid.arrange(plot4, plot5, ncol = 2)
#grid.arrange(plot7, plot10, ncol = 2)

# show the plots side by side, two per row
grid.arrange(plot1, plot2, ncol = 2)

grid.arrange(plot3, plot4, ncol = 2)

grid.arrange(plot5, plot6, ncol = 2)

grid.arrange(plot7, plot8, ncol = 2)

grid.arrange(plot9, plot10, ncol = 2)

Histograms

# layers shared by all histograms: light blue bars with a blue outline
# and a logarithmic count axis
hist_layers <- list(
  geom_histogram(color = "blue", fill = "lightblue"),
  scale_y_log10()
)

# one histogram of the running times per tool
hist1 <- ggplot(df, aes(x = re2g)) + hist_layers
hist2 <- ggplot(df, aes(x = cad)) + hist_layers
hist3 <- ggplot(df, aes(x = srm)) + hist_layers
# the grep histogram is disabled
#hist4 <- ggplot(df, aes(x = grep)) + hist_layers
hist5 <- ggplot(df, aes(x = dotnet)) + hist_layers

# show the histograms side by side, two per row
grid.arrange(hist1, hist2, ncol = 2)

grid.arrange(hist3, hist5, ncol = 2)

#grid.arrange(hist5, ncol = 2)

Finding winners

# fastest time per benchmark over all tools (min) and over all tools
# except CA (enemy.min)
df$min <- pmin(df$grep, df$srm, df$re2g, df$dotnet, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dotnet)
#df$min <- pmin(df$srm, df$re2g, df$dotnet, df$cad)
#df$enemy.min <- pmin(df$srm, df$re2g, df$dotnet)

# number of benchmarks on which each tool achieves the fastest time
# (ties are counted for every tool involved)
winners.grep <- nrow(df[df$min == df$grep, ])
winners.re2 <- nrow(df[df$min == df$re2g, ])
winners.ca <- nrow(df[df$min == df$cad, ])
winners.srm <- nrow(df[df$min == df$srm, ])
winners.dotnet <- nrow(df[df$min == df$dotnet, ])

# Number of benchmarks on which CA is at least as fast as the other tool.
# The grep comparisons are made on df_grep; the mask and the indexed data
# frame must be the same object. Indexing df with a mask built from
# df_grep, as before, recycles the shorter mask over the rows of df and
# yields a wrong count.
winners.ca.over.re2 <- nrow(df[df$cad <= df$re2g, ])
winners.ca.over.grep <- nrow(df_grep[df_grep$cad <= df_grep$grep, ])
winners.ca.over.srm <- nrow(df[df$cad <= df$srm, ])
winners.ca.over.dotnet <- nrow(df[df$cad <= df$dotnet, ])

# ... CA at least 10 times faster
winners.10.ca.over.re2 <- nrow(df[10 * df$cad <= df$re2g, ])
winners.10.ca.over.grep <- nrow(df_grep[10 * df_grep$cad <= df_grep$grep, ])
winners.10.ca.over.srm <- nrow(df[10 * df$cad <= df$srm, ])
winners.10.ca.over.dotnet <- nrow(df[10 * df$cad <= df$dotnet, ])

# ... CA at least 100 times faster
winners.100.ca.over.re2 <- nrow(df[100 * df$cad <= df$re2g, ])
winners.100.ca.over.grep <- nrow(df_grep[100 * df_grep$cad <= df_grep$grep, ])
winners.100.ca.over.srm <- nrow(df[100 * df$cad <= df$srm, ])
winners.100.ca.over.dotnet <- nrow(df[100 * df$cad <= df$dotnet, ])

# number of benchmarks on which a tool needs more than 10 seconds
longer.than.10.seconds.ca <- nrow(df[df$cad > 10, ])
longer.than.10.seconds.re2 <- nrow(df[df$re2g > 10, ])
longer.than.10.seconds.srm <- nrow(df[df$srm > 10, ])
longer.than.10.seconds.dotnet <- nrow(df[df$dotnet > 10, ])
longer.than.10.seconds.grep <- nrow(df[df$grep > 10, ])
Winner
CA 73
RE2 1211
SRM 13
.NET 39
grep 680
Wins of CA over
RE2 175 / 1644
SRM 265 / 1644
.NET 657 / 1644
grep 693 / 1311
Wins of at least 10 times of CA over
RE2 63 / 1644
SRM 128 / 1644
.NET 125 / 1644
grep 170 / 1311
Wins of at least 100 times of CA over
RE2 11 / 1644
SRM 83 / 1644
.NET 12 / 1644
grep 56 / 1311
Longer than 10 s
CA 13
RE2 64
SRM 128
.NET 146
grep 190
# CA against the fastest of the other tools on each benchmark; also writes
# figs/enemy.min-vs-cad.pdf
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")

How much better we are than RE2

# speed-up of CA over RE2 on every benchmark
df$re2.vs.ca <- df$re2g / df$cad
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]

#df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca", "re2.ca.mismatch")]
# Keep the (at most) ten benchmarks with the largest speed-up. head() is
# used instead of indexing with 1:10, which would append all-NA rows when
# fewer than ten benchmarks are available.
df_sorted <- head(df_sorted[, c("src", "pattern", "file", tools.times)], 10)
df_sorted
# export the table as LaTeX
haf <- latex(df_sorted,
             file="figs/best_results.tex",
             booktabs=TRUE,
             table.env=FALSE,
             center="none")

Summaries

# descriptive statistics of the running times of all tools
df_for_summary <- df[c("re2g", "cad", "srm", "dotnet", "grep")]

desc <- stat.desc(df_for_summary)
desc

# export the statistics as LaTeX
haf <- latex(
  desc,
  file = "figs/stats.tex",
  booktabs = TRUE,
  table.env = FALSE,
  center = "none"
)

Experiments with increasing counter value

# load the counter experiment and convert the RE2 times to numbers
# (decimal comma -> point; cells that are not numbers become NA)
big <- read_file(params$file_big)
big$re2g <- as.numeric(sub(",", ".", big$re2g))
NAs introduced by coercion
# print the table of the counter experiment
big

# Reshape the table into long format: one row per (Counter, approach) pair
# with the measured value in the column "time".
approach_by_column <- c(
  re2g = "RE2",
  cad = "CA",
  srm = "SRM",
  dot.net = ".NET",
  grep = "grep"
)

pieces <- lapply(names(approach_by_column), function(column) {
  piece <- big[, c("Counter", column)]
  names(piece)[2] <- "time"
  piece$approach <- approach_by_column[[column]]
  piece
})

# stack the per-tool tables in the order listed above
together <- Reduce(rbind, pieces)

BIG_STEP <- 100

# remove too many points: keep only counter values divisible by BIG_STEP
keep_row <- together$Counter %% BIG_STEP == 0
together <- together[keep_row, ]

# legend placed inside the panel at its top-left corner, without a title
legend_theme <- theme(legend.position = c(.02, .98),
                      legend.justification = c("left", "top"),
                      legend.box.just = "right",
                      legend.margin = margin(1, 1, 1, 1),
                      legend.title = element_blank())

axis_labels <- labs(x = "k", y = "time [s]")

# time against counter value, one line (and point shape) per approach;
# the axes are cut at 2000 (x) and 20 (y), a dashed line marks y = 0
big_plot <- ggplot(data = together,
                   aes(x = Counter, y = time, colour = approach)) +
  geom_line() +
  geom_point(aes(shape = approach)) +
  xlim(NA, 2000) +
  ylim(NA, 20) +
  geom_hline(size = 0.1, yintercept = 0, linetype = "dashed") +
  legend_theme +
  axis_labels

# TikZ export is disabled:
#make_tikz(paste0("figs/big_plot.tikz"), big_plot, width=2.7, height=2.7)
make_pdf("figs/big_plot.pdf", big_plot, width = BIG_SIZE, height = BIG_SIZE)


plot(big_plot)

# Information about DCAs

# df_dcas = read.csv2(params$file_dca,
#                   header=TRUE,
#                   sep="\t",
#                   dec=".",
#                   comment.char="",
#                   quote="",
#                   strip.white=TRUE,
#                   stringsAsFactors=FALSE)
# 
# # sanitize
# df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0

#df_dcas
#tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
#compute_timeouts <- function(df, col) {
#  tmp <- df[df[, col] == TIMEOUT_VAL,]
#  tmp
#}
