#=========================================================
There were 20 warnings (use warnings() to see them)
# PREAMBLE
#=========================================================

# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(tikzDevice)


theme_set(theme_bw())

options(scipen=999)  # turn-off scientific notation like 1e+48

# size of point for scatterplots
POINT_SIZE = 0.5
#POINT_SIZE = 1

# timeout
TIMEOUT = params$timeout
TIMEOUT_VAL = 1.05 * TIMEOUT

TIME_MIN = 0.01 # seconds

# FUNCTIONS
# Read a benchmark table from `file` into a data frame.
#
# The file is parsed as semicolon-separated values with a decimal comma and a
# header row. '#' is not treated as a comment character, surrounding white
# space of unquoted fields is stripped, and text columns stay character
# vectors (no factors).
read_file <- function(file) {
  read.csv2(paste0(file),
            header=TRUE,
            sep=";",
            dec=",",
            comment.char="",
            strip.white=TRUE,
            stringsAsFactors=FALSE)
}

# Scatter plot of column `xlab` against column `ylab` of `df` on log10 axes.
#
# xlab, ylab:       column names (strings) mapped to the x and y aesthetics
# xstring, ystring: axis titles; default to the column names
#
# Besides the points the plot contains geom_abline() with its default
# slope/intercept and dashed lines at TIMEOUT_VAL on both axes. Both axes are
# limited to [TIME_MIN, TIMEOUT_VAL]. Returns the ggplot object.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  axis_range <- c(TIME_MIN, TIMEOUT_VAL)

  pscat <- ggplot(df, aes_string(x=xlab, y=ylab))
  pscat <- pscat + geom_point(size=POINT_SIZE)

  # reference lines: the diagonal and the two timeout markers
  pscat <- pscat +
    geom_abline(size=0.1) +
    geom_vline(size=0.1, xintercept=TIMEOUT_VAL, linetype="dashed") +
    geom_hline(size=0.1, yintercept=TIMEOUT_VAL, linetype="dashed")

  pscat <- pscat + scale_x_log10() + scale_y_log10()

  pscat <- pscat +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    coord_fixed(xlim = axis_range, ylim = axis_range) +
    labs(x=xstring, y=ystring)

  pscat
}

# Draw `picture` into a TikZ file.
#
# file:          output path passed to tikz()
# picture:       object drawn with plot() while the TikZ device is active
# width, height: device size forwarded to tikz()
#
# The device is closed even when drawing fails, so a broken plot does not
# leave a dangling graphics device behind that would capture later plots.
# Returns the value of dev.off(), invisibly.
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file=file, onefile=TRUE, width=width, height=height)
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add=TRUE)

  plot(picture)

  # regular path: close the device ourselves and disarm the exit handler
  device_open <- FALSE
  invisible(dev.off())
}
df <- read_file(params$file_cmp)

######################### SANITIZE ###############################
tools.times <- c("re2g", "cad", "grep", "srm", "dot.net")
tools.matches <- c("re2g.matches", "cad.matches", "grep.matches", "srm.matches", "dot.net.matches")

# change the type of columns other than the name
for (i in tools.times) {
  df[,i] <- sub(",", ".", df[,i])
  suppressWarnings(df[,i] <- as.numeric(df[,i]))
}

for (i in tools.matches) {
  suppressWarnings(df[,i] <- as.integer(df[,i]))
}

# get rid of extremal values
df[tools.times][is.na(df[tools.times])] <- TIMEOUT_VAL
#df[is.na(df)] <- TIMEOUT_VAL
#df[df == 0.00] <- TIME_MIN
df[,tools.times][df[,tools.times] == 0] <- TIME_MIN

# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new



############################## COUNTING TIMEOUTS #######################
timeouts.re2 <- nrow(df[df$re2g == TIMEOUT_VAL,])
timeouts.ca <- nrow(df[df$cad == TIMEOUT_VAL,])
timeouts.grep <- nrow(df[df$grep == TIMEOUT_VAL,])
timeouts.srm <- nrow(df[df$srm == TIMEOUT_VAL,])
timeouts.dot.net <- nrow(df[df$dot.net == TIMEOUT_VAL,])
timeouts.re2.and.ca <- nrow(df[df$cad == TIMEOUT_VAL & df$re2g == TIMEOUT_VAL,])

These are results of the experiments for Counting Set Automata:

File results-10-05-2020/table-ALL.csv
Timeout 300 s
Benchmarks 4315
Timeouts CA 0
Timeouts RE2 25
Timeouts SRM 25
Timeouts grep 239
Timeouts .NET 42
df

Sanity checks

df$inconsistent <- df$re2g.matches != df$grep.matches | df$re2g.matches != df$srm.matches | df$re2g.matches != df$dot.net.matches | df$re2g.matches != df$cad.matches

df$grep.re2.mismatch <- !is.na(df$re2g.matches) & !is.na(df$grep.matches) & df$re2g.matches != df$grep.matches
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]

df$re2.ca.mismatch <- !is.na(df$re2g.matches) & !is.na(df$cad.matches) & df$re2g.matches != df$cad.matches
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]
grep and RE2 mismatched 1110
CA and RE2 mismatched 103

RE2 and CA mismatches

df_re2_ca_mismatch

Scatter Plots


# Build the log-log scatter plot of `xlab` vs. `ylab`, export it to
# figs/<xlab>-vs-<ylab>.tikz (4 x 4 device) and return the plot object.
#
# xstring, ystring: axis titles; default to the column names
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  scatter <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  tikz_path <- paste0("figs/", xlab, "-vs-", ylab, ".tikz")
  make_tikz(tikz_path, scatter, width=4, height=4)
  scatter
}

plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]")
plot2 <- plot.and.tikz(df, "grep", "cad", xstring="grep [s]", ystring="CA [s]")
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]")
plot4 <- plot.and.tikz(df, "dot.net", "cad", xstring=".NET [s]", ystring="CA [s]")
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dot.net", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dot.net", "grep")
plot10 <- plot.and.tikz(df, "dot.net", "srm")



grid.arrange(plot1, plot2, ncol = 2)

grid.arrange(plot3, plot4, ncol = 2)

grid.arrange(plot5, plot6, ncol = 2)

grid.arrange(plot7, plot8, ncol = 2)

grid.arrange(plot9, plot10, ncol = 2)

Histograms

hist1 <- ggplot(df, aes(x=re2g)) +
  geom_histogram(color="blue", fill="lightblue") +
  scale_y_log10()

hist2 <- ggplot(df, aes(x=cad)) + 
  geom_histogram(color="blue", fill="lightblue") +
  scale_y_log10()

hist3 <- ggplot(df, aes(x=srm)) + 
  geom_histogram(color="blue", fill="lightblue") +
  scale_y_log10()

hist4 <- ggplot(df, aes(x=grep)) + 
  geom_histogram(color="blue", fill="lightblue") +
  scale_y_log10()

hist5 <- ggplot(df, aes(x=dot.net)) + 
  geom_histogram(color="blue", fill="lightblue") +
  scale_y_log10()

grid.arrange(hist1, hist2, ncol = 2)

grid.arrange(hist3, hist4, ncol = 2)

grid.arrange(hist5, ncol = 2)

NA
NA

Finding winners

df$min <-pmin(df$grep, df$srm, df$re2g, df$dot.net, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dot.net)

winners.grep <- nrow(df[df$min == df$grep,])
winners.re2 <- nrow(df[df$min == df$re2g,])
winners.ca <- nrow(df[df$min == df$cad,])
winners.srm <- nrow(df[df$min == df$srm,])
winners.dot.net <- nrow(df[df$min == df$dot.net,])
Winner
CA 32
RE2 3759
SRM 7
grep 1736
.NET 64
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")

How much better we are than RE2

df$re2.vs.ca <- df$re2g / df$cad
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
10: In readChar(file, size, TRUE) : truncating string with embedded nuls
df_sorted <- df[order(df$re2.vs.ca, decreasing=TRUE),]

df_sorted[,c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca")]

Experiments with increasing counter value

big <- read_file(params$file_big)
big$re2g <- sub(",", ".", big$re2g)
big$re2g <- as.numeric(big$re2g)
NAs introduced by coercion
big

together <- big[, c("Counter", "re2g")]
names(together)[2] <- "time"
together$approach <- "re2g"

tmp <- big[, c("Counter", "ca")]
names(tmp)[2] <- "time"
tmp$approach <- "ca"

together <- rbind(together, tmp)

big_plot <- ggplot(data=together, aes(x=Counter, y=time, colour=approach)) +
  geom_line() +
  xlim(NA,1250) +
  theme(legend.position = c(.05, .95),
        legend.justification = c("left", "top"),
        #legend.box.background = element_rect(color="black", size=0.5),
        legend.box.just = "right",
        legend.margin = margin(1, 1, 1, 1),
        legend.title = element_blank()) +
  labs(
      #title="Title",
      #subtitle="Subtitle",
      x="$k$",
      y="time [s]")
        

  # geom_line(data = big, aes(x = Counter, y = re2g), color = "red") +
  # geom_line(data = big, aes(x = Counter, y = ca), color = "blue") +
  # xlab('counter value') +
  # ylab('time [s]')

make_tikz(paste0("figs/big_plot.tikz"), big_plot, width=2.7, height=2.7)

plot(big_plot)

# Information about DCAs

df_dcas = read.csv2(params$file_dca,
                  header=TRUE,
                  sep="\t",
                  dec=".",
                  comment.char="",
                  quote="",
                  strip.white=TRUE,
                  stringsAsFactors=FALSE)

# sanitize
df_dcas$timeouts.classical[is.na(df_dcas$timeouts.classical)] <- 0

#df_dcas
tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
# Return the rows of `df` whose column `col` equals TIMEOUT_VAL.
#
# df:  data frame
# col: a single column name or index
#
# which() drops NA comparison results, so rows with a missing value in `col`
# are left out instead of showing up as all-NA rows. drop = FALSE keeps the
# result a data frame even when `df` has only one column.
compute_timeouts <- function(df, col) {
  timed_out <- which(df[, col] == TIMEOUT_VAL)
  df[timed_out, , drop = FALSE]
}
---
title: "Cnt-Set-Mata Analysis"
params:
  #file_cmp: data/results-23-04-2020.csv
  #file_cmp: results-05-05-2020/between/results.csv
  #file_cmp: results-09-05-2020/between/RESULTS-ALL-between.csv
  file_cmp: results-10-05-2020/table-ALL.csv
  file_big: results-05-05-2020/table-big-25036102.csv
  file_dca: DCAs/results-translation.tsv
  timeout: 300 # seconds
output:
  html_notebook:
    code_folding: hide
  pdf_document: default
  html_document:
    df_print: paged
    toc: true
    toc_float: true
---

```{r}
#=========================================================
# PREAMBLE
#=========================================================

# load the plotting library
suppressMessages(library(ggplot2))
library(gridExtra)
library(tikzDevice)


theme_set(theme_bw())

options(scipen=999)  # turn-off scientific notation like 1e+48

# size of point for scatterplots
POINT_SIZE = 0.5
#POINT_SIZE = 1

# timeout
TIMEOUT = params$timeout
TIMEOUT_VAL = 1.05 * TIMEOUT

TIME_MIN = 0.01 # seconds

# FUNCTIONS
# Read a benchmark table from `file` into a data frame.
#
# The file is parsed as semicolon-separated values with a decimal comma and a
# header row. '#' is not treated as a comment character, surrounding white
# space of unquoted fields is stripped, and text columns stay character
# vectors (no factors).
read_file <- function(file) {
  read.csv2(paste0(file),
            header=TRUE,
            sep=";",
            dec=",",
            comment.char="",
            strip.white=TRUE,
            stringsAsFactors=FALSE)
}

# Scatter plot of column `xlab` against column `ylab` of `df` on log10 axes.
#
# xlab, ylab:       column names (strings) mapped to the x and y aesthetics
# xstring, ystring: axis titles; default to the column names
#
# Besides the points the plot contains geom_abline() with its default
# slope/intercept and dashed lines at TIMEOUT_VAL on both axes. Both axes are
# limited to [TIME_MIN, TIMEOUT_VAL]. Returns the ggplot object.
plot_scatter_log <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  axis_range <- c(TIME_MIN, TIMEOUT_VAL)

  pscat <- ggplot(df, aes_string(x=xlab, y=ylab))
  pscat <- pscat + geom_point(size=POINT_SIZE)

  # reference lines: the diagonal and the two timeout markers
  pscat <- pscat +
    geom_abline(size=0.1) +
    geom_vline(size=0.1, xintercept=TIMEOUT_VAL, linetype="dashed") +
    geom_hline(size=0.1, yintercept=TIMEOUT_VAL, linetype="dashed")

  pscat <- pscat + scale_x_log10() + scale_y_log10()

  pscat <- pscat +
    theme(axis.text.y = element_text(angle = 90, hjust = 0.5)) +
    coord_fixed(xlim = axis_range, ylim = axis_range) +
    labs(x=xstring, y=ystring)

  pscat
}

# Draw `picture` into a TikZ file.
#
# file:          output path passed to tikz()
# picture:       object drawn with plot() while the TikZ device is active
# width, height: device size forwarded to tikz()
#
# The device is closed even when drawing fails, so a broken plot does not
# leave a dangling graphics device behind that would capture later plots.
# Returns the value of dev.off(), invisibly.
make_tikz <- function(file, picture, width=2.5, height=2.5) {
  tikz(file=file, onefile=TRUE, width=width, height=height)
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add=TRUE)

  plot(picture)

  # regular path: close the device ourselves and disarm the exit handler
  device_open <- FALSE
  invisible(dev.off())
}
```

```{r}
# Load the comparison table named by the `file_cmp` parameter.
df <- read_file(params$file_cmp)

######################### SANITIZE ###############################
# One runtime column and one match-count column per tool.
tools.times <- c("re2g", "cad", "grep", "srm", "dot.net")
tools.matches <- paste0(tools.times, ".matches")

# Runtime columns: turn a decimal comma into a dot and coerce to numeric.
# Entries that are not numbers become NA (the coercion warning is silenced).
df[tools.times] <- lapply(df[tools.times], function(column) {
  suppressWarnings(as.numeric(sub(",", ".", column)))
})

# Match-count columns: coerce to integer; non-numeric entries become NA.
df[tools.matches] <- lapply(df[tools.matches], function(column) {
  suppressWarnings(as.integer(column))
})

# get rid of extremal values:
# missing runtimes are replaced by TIMEOUT_VAL, zero runtimes by TIME_MIN
tool_times <- df[tools.times]
tool_times[is.na(tool_times)] <- TIMEOUT_VAL
tool_times[tool_times == 0] <- TIME_MIN
df[tools.times] <- tool_times

# clean the data
#df_new <- df[df$Lines != "ERROR WHILE CONVERTING TO DCA.",]
#print(paste0("Removing ", nrow(df) - nrow(df_new), " lines due to converting to DCA error"))
#df <- df_new



############################## COUNTING TIMEOUTS #######################
# Number of benchmarks whose runtime equals the TIMEOUT_VAL sentinel.
timeouts.re2 <- sum(df$re2g == TIMEOUT_VAL)
timeouts.ca <- sum(df$cad == TIMEOUT_VAL)
timeouts.grep <- sum(df$grep == TIMEOUT_VAL)
timeouts.srm <- sum(df$srm == TIMEOUT_VAL)
timeouts.dot.net <- sum(df$dot.net == TIMEOUT_VAL)
timeouts.re2.and.ca <- sum(df$cad == TIMEOUT_VAL & df$re2g == TIMEOUT_VAL)
```

These are results of the experiments for Counting Set Automata:

|                     |                     |
|---------------------|--------------------:|
| **File**            | `r params$file_cmp` |
| **Timeout**         | `r TIMEOUT` s       |
| **Benchmarks**      | `r nrow(df)`        |
| **Timeouts CA**     | `r timeouts.ca`     |
| **Timeouts RE2**    | `r timeouts.re2`    |
| **Timeouts SRM**    | `r timeouts.srm`    | 
| **Timeouts grep**    | `r timeouts.grep`  |
| **Timeouts .NET**    | `r timeouts.dot.net`  |


```{r}
df
```

# Sanity checks

```{r}
# TRUE where both match counts are known (not NA) and differ.
# A missing count never counts as a mismatch.
counts.differ <- function(a, b) {
  !is.na(a) & !is.na(b) & a != b
}

# A benchmark is inconsistent when some tool reports a known match count that
# differs from RE2's known match count. The NA guard keeps this flag strictly
# TRUE/FALSE, like the two mismatch flags below.
df$inconsistent <- counts.differ(df$re2g.matches, df$grep.matches) |
  counts.differ(df$re2g.matches, df$srm.matches) |
  counts.differ(df$re2g.matches, df$dot.net.matches) |
  counts.differ(df$re2g.matches, df$cad.matches)

# rows where grep and RE2 disagree on the number of matches
df$grep.re2.mismatch <- counts.differ(df$re2g.matches, df$grep.matches)
df_grep_re2_mismatch <- df[df$grep.re2.mismatch,]

# rows where CA and RE2 disagree on the number of matches
df$re2.ca.mismatch <- counts.differ(df$re2g.matches, df$cad.matches)
df_re2_ca_mismatch <- df[df$re2.ca.mismatch,]

```

|                             |                                |
|-----------------------------|-------------------------------:|
| **grep and RE2 mismatched** | `r nrow(df_grep_re2_mismatch)` |
| **CA and RE2 mismatched**   | `r nrow(df_re2_ca_mismatch)`   |

## RE2 and CA mismatches
```{r}
df_re2_ca_mismatch
```



# Scatter Plots

```{r}

# Build the log-log scatter plot of `xlab` vs. `ylab`, export it to
# figs/<xlab>-vs-<ylab>.tikz (4 x 4 device) and return the plot object.
#
# xstring, ystring: axis titles; default to the column names
plot.and.tikz <- function(df, xlab, ylab, xstring=xlab, ystring=ylab) {
  scatter <- plot_scatter_log(df, xlab, ylab, xstring, ystring)
  tikz_path <- paste0("figs/", xlab, "-vs-", ylab, ".tikz")
  make_tikz(tikz_path, scatter, width=4, height=4)
  scatter
}

# Every tool against CA (y axis), with human-readable axis titles.
# Each call also writes figs/<x>-vs-<y>.tikz as a side effect.
plot1 <- plot.and.tikz(df, "re2g", "cad", xstring="RE2 [s]", ystring="CA [s]")
plot2 <- plot.and.tikz(df, "grep", "cad", xstring="grep [s]", ystring="CA [s]")
plot3 <- plot.and.tikz(df, "srm", "cad", xstring="SRM [s]", ystring="CA [s]")
plot4 <- plot.and.tikz(df, "dot.net", "cad", xstring=".NET [s]", ystring="CA [s]")
# Remaining tool pairs; axis titles default to the column names.
plot5 <- plot.and.tikz(df, "srm", "re2g")
plot6 <- plot.and.tikz(df, "grep", "re2g")
plot7 <- plot.and.tikz(df, "dot.net", "re2g")
plot8 <- plot.and.tikz(df, "srm", "grep")
plot9 <- plot.and.tikz(df, "dot.net", "grep")
plot10 <- plot.and.tikz(df, "dot.net", "srm")



# Show the plots in the notebook, two per row.
grid.arrange(plot1, plot2, ncol = 2)
grid.arrange(plot3, plot4, ncol = 2)
grid.arrange(plot5, plot6, ncol = 2)
grid.arrange(plot7, plot8, ncol = 2)
grid.arrange(plot9, plot10, ncol = 2)
```


# Histograms

```{r}
# Components shared by all runtime histograms: blue-outlined bars and a log10
# count axis. A fresh list is built per plot so no component object is shared.
hist_layers <- function() {
  list(
    geom_histogram(color="blue", fill="lightblue"),
    scale_y_log10()
  )
}

# one histogram of the runtime column per tool
hist1 <- ggplot(df, aes(x=re2g)) + hist_layers()
hist2 <- ggplot(df, aes(x=cad)) + hist_layers()
hist3 <- ggplot(df, aes(x=srm)) + hist_layers()
hist4 <- ggplot(df, aes(x=grep)) + hist_layers()
hist5 <- ggplot(df, aes(x=dot.net)) + hist_layers()

# Show them two per row; the last one is alone in a two-column layout.
grid.arrange(hist1, hist2, ncol = 2)
grid.arrange(hist3, hist4, ncol = 2)
grid.arrange(hist5, ncol = 2)


```

# Finding winners

```{r}
# Per-benchmark best runtime over all tools, and over all tools except CA.
df$min <- pmin(df$grep, df$srm, df$re2g, df$dot.net, df$cad)
df$enemy.min <- pmin(df$grep, df$srm, df$re2g, df$dot.net)

# Number of benchmarks on which each tool attains the best runtime.
# A benchmark with tied best runtimes is counted for every tied tool.
winners.grep <- sum(df$min == df$grep)
winners.re2 <- sum(df$min == df$re2g)
winners.ca <- sum(df$min == df$cad)
winners.srm <- sum(df$min == df$srm)
winners.dot.net <- sum(df$min == df$dot.net)
```

| **Winner**          |                     |
|---------------------|--------------------:|
| **CA**     | `r winners.ca`     |
| **RE2**    | `r winners.re2`    |
| **SRM**    | `r winners.srm`    | 
| **grep**    | `r winners.grep`  |
| **.NET**    | `r winners.dot.net`  |

```{r}
# CA against the fastest other tool per benchmark (column enemy.min);
# also writes figs/enemy.min-vs-cad.tikz. The returned plot is the chunk output.
plot.and.tikz(df, "enemy.min", "cad", xstring="best enemy", ystring="CA [s]")
```

# How much better we are than RE2

```{r}
# Ratio of the RE2 runtime to the CA runtime; values above 1 mean CA was faster.
df$re2.vs.ca <- df$re2g / df$cad

# Benchmarks ordered from the largest ratio to the smallest.
by_ratio <- order(df$re2.vs.ca, decreasing=TRUE)
df_sorted <- df[by_ratio,]

# Show only the identifying columns and the two runtimes with their ratio.
shown_columns <- c("src", "pattern", "file", "re2g", "cad", "re2.vs.ca")
df_sorted[,shown_columns]
```


# Experiments with increasing counter value

```{r}
# Load the table named by the `file_big` parameter.
big <- read_file(params$file_big)

# RE2 runtimes: turn a decimal comma into a dot and coerce to numeric.
# Entries that are not numbers become NA, with the usual coercion warning.
big$re2g <- as.numeric(sub(",", ".", big$re2g))

big
```


```{r}

# Long-format slice of `big` for one runtime column: the counter value, the
# measured time, and the column name as the `approach` label (kept character).
long_times <- function(column) {
  data.frame(Counter=big[, "Counter"],
             time=big[, column],
             approach=column,
             stringsAsFactors=FALSE)
}

# RE2 rows first, then CA rows.
together <- rbind(long_times("re2g"), long_times("ca"))

# Legend placed inside the panel near the top-left corner, without a title.
legend_theme <- theme(legend.position = c(.05, .95),
                      legend.justification = c("left", "top"),
                      #legend.box.background = element_rect(color="black", size=0.5),
                      legend.box.just = "right",
                      legend.margin = margin(1, 1, 1, 1),
                      legend.title = element_blank())

# One line per approach; the x axis is capped at 1250.
big_plot <- ggplot(data=together, aes(x=Counter, y=time, colour=approach)) +
  geom_line() +
  xlim(NA,1250) +
  legend_theme +
  labs(x="$k$", y="time [s]")

  # geom_line(data = big, aes(x = Counter, y = re2g), color = "red") +
  # geom_line(data = big, aes(x = Counter, y = ca), color = "blue") +
  # xlab('counter value') +
  # ylab('time [s]')

# Export the figure, then show it in the notebook.
make_tikz("figs/big_plot.tikz", big_plot, width=2.7, height=2.7)

plot(big_plot)
```


```{r}
# Information about DCAs

# Tab-separated table with a header row and '.' as the decimal mark.
# Quote characters are taken literally, white space around fields is
# stripped, and text columns stay character vectors.
df_dcas <- read.delim(params$file_dca,
                      header=TRUE,
                      quote="",
                      strip.white=TRUE,
                      stringsAsFactors=FALSE)

# sanitize: a missing timeout flag is stored as 0
flag_missing <- is.na(df_dcas$timeouts.classical)
df_dcas$timeouts.classical[flag_missing] <- 0

#df_dcas
```

```{r}
# Rows of the DCA table whose timeouts.classical flag equals 1.
tms.classical <- df_dcas[df_dcas$timeouts.classical == 1,]
```



```{r}
# Return the rows of `df` whose column `col` equals TIMEOUT_VAL.
#
# df:  data frame
# col: a single column name or index
#
# which() drops NA comparison results, so rows with a missing value in `col`
# are left out instead of showing up as all-NA rows. drop = FALSE keeps the
# result a data frame even when `df` has only one column.
compute_timeouts <- function(df, col) {
  timed_out <- which(df[, col] == TIMEOUT_VAL)
  df[timed_out, , drop = FALSE]
}

```

