library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Report setup: surface the working directory, then configure knitr.
#
# The current working directory is raised as a warning; the two `##` lines
# that follow are the captured text of that warning from the knit run.
# NOTE(review): message(getwd()) would report the same path without signalling
# a warning condition -- confirm nothing relies on the warning before changing.
warning(getwd())
## Warning: C:/Users/Benoit/Insync/UHasselt/Workspaces/RStudio/
## processdiscoverybenchmark/analysis
# Set knitr's package-level `root.dir` option to the parent directory.
# presumably so chunks resolve paths from the project root rather than from
# the analysis/ folder shown in the warning above -- verify against the project layout.
knitr::opts_knit$set(root.dir="..")
# Set `echo = FALSE` as the default chunk option (knitr then omits chunk
# source code from the rendered report unless a chunk overrides it).
knitr::opts_chunk$set(echo = FALSE)

Exploratory analysis

Here are some plots to get a first impression of the data.

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

Models

Regression model 1

## 
## Call:
## lm(formula = avg_recall ~ ReocurringTasks + InfrequentPaths + 
##     Miner, data = summarized_results)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82221 -0.17471  0.04338  0.28616  0.52807 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.713838   0.019090  37.393  < 2e-16 ***
## ReocurringTasks0.05 -0.059788   0.021537  -2.776  0.00553 ** 
## ReocurringTasks0.1  -0.109908   0.021537  -5.103 3.52e-07 ***
## ReocurringTasks0.15 -0.184704   0.021537  -8.576  < 2e-16 ***
## ReocurringTasks0.2  -0.171076   0.021537  -7.943 2.64e-15 ***
## ReocurringTasks0.25 -0.173407   0.021537  -8.052 1.11e-15 ***
## ReocurringTasks0.3  -0.241906   0.021537 -11.232  < 2e-16 ***
## InfrequentPathsTRUE  0.005599   0.011512   0.486  0.62671    
## MinerHeuristics      0.102774   0.016280   6.313 3.09e-10 ***
## MinerILP             0.416193   0.016280  25.564  < 2e-16 ***
## MinerInductive       0.041874   0.016280   2.572  0.01015 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3392 on 3461 degrees of freedom
## Multiple R-squared:  0.2215, Adjusted R-squared:  0.2192 
## F-statistic: 98.44 on 10 and 3461 DF,  p-value: < 2.2e-16

Regression model 2

Now we are going to treat ReocurringTasks as a continuous variable

## 
## Call:
## lm(formula = avg_recall ~ as.numeric(as.character(ReocurringTasks)) + 
##     InfrequentPaths + Miner, data = summarized_results)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79647 -0.17916  0.04542  0.28157  0.52922 
## 
## Coefficients:
##                                            Estimate Std. Error t value
## (Intercept)                                0.688096   0.015530  44.307
## as.numeric(as.character(ReocurringTasks)) -0.724374   0.057677 -12.559
## InfrequentPathsTRUE                        0.005599   0.011535   0.485
## MinerHeuristics                            0.102774   0.016314   6.300
## MinerILP                                   0.416193   0.016314  25.512
## MinerInductive                             0.041874   0.016314   2.567
##                                           Pr(>|t|)    
## (Intercept)                                < 2e-16 ***
## as.numeric(as.character(ReocurringTasks))  < 2e-16 ***
## InfrequentPathsTRUE                         0.6274    
## MinerHeuristics                           3.35e-10 ***
## MinerILP                                   < 2e-16 ***
## MinerInductive                              0.0103 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3399 on 3466 degrees of freedom
## Multiple R-squared:  0.2171, Adjusted R-squared:  0.216 
## F-statistic: 192.3 on 5 and 3466 DF,  p-value: < 2.2e-16

ANOVA 1

##                                         Df Sum Sq Mean Sq F value Pr(>F)
## ReocurringTasks                          6   20.4   3.405  30.497 <2e-16
## InfrequentPaths                          1    0.0   0.027   0.244 0.6215
## Miner                                    3   92.8  30.929 277.047 <2e-16
## ReocurringTasks:InfrequentPaths          6    0.7   0.109   0.979 0.4377
## ReocurringTasks:Miner                   18   14.1   0.782   7.006 <2e-16
## InfrequentPaths:Miner                    3    1.0   0.325   2.914 0.0331
## ReocurringTasks:InfrequentPaths:Miner   18    1.1   0.059   0.526 0.9477
## Residuals                             3416  381.4   0.112               
##                                          
## ReocurringTasks                       ***
## InfrequentPaths                          
## Miner                                 ***
## ReocurringTasks:InfrequentPaths          
## ReocurringTasks:Miner                 ***
## InfrequentPaths:Miner                 *  
## ReocurringTasks:InfrequentPaths:Miner    
## Residuals                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## InfrequentPaths
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## ReocurringTasks, InfrequentPaths
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## InfrequentPaths, Miner
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## ReocurringTasks, InfrequentPaths, Miner
## Warning in TukeyHSD.aov(model3, conf.level = 0.95): 'which' specified some
## non-factors which will be dropped

Regression model 3

With some interaction terms chosen based on the ANOVA results

## 
## Call:
## lm(formula = avg_recall ~ as.numeric(as.character(ReocurringTasks)) + 
##     InfrequentPaths + Miner + Miner * InfrequentPaths + Miner * 
##     as.numeric(as.character(ReocurringTasks)), data = summarized_results)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79387 -0.19095  0.00221  0.27727  0.61505 
## 
## Coefficients:
##                                                           Estimate
## (Intercept)                                                0.72961
## as.numeric(as.character(ReocurringTasks))                 -1.08145
## InfrequentPathsTRUE                                        0.02970
## MinerHeuristics                                           -0.01222
## MinerILP                                                   0.26934
## MinerInductive                                             0.13767
## InfrequentPathsTRUE:MinerHeuristics                        0.01068
## InfrequentPathsTRUE:MinerILP                              -0.03149
## InfrequentPathsTRUE:MinerInductive                        -0.07560
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics  0.73099
## as.numeric(as.character(ReocurringTasks)):MinerILP         1.08399
## as.numeric(as.character(ReocurringTasks)):MinerInductive  -0.38666
##                                                           Std. Error
## (Intercept)                                                  0.02342
## as.numeric(as.character(ReocurringTasks))                    0.11361
## InfrequentPathsTRUE                                          0.02272
## MinerHeuristics                                              0.03312
## MinerILP                                                     0.03312
## MinerInductive                                               0.03312
## InfrequentPathsTRUE:MinerHeuristics                          0.03213
## InfrequentPathsTRUE:MinerILP                                 0.03213
## InfrequentPathsTRUE:MinerInductive                           0.03213
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics    0.16067
## as.numeric(as.character(ReocurringTasks)):MinerILP           0.16067
## as.numeric(as.character(ReocurringTasks)):MinerInductive     0.16067
##                                                           t value Pr(>|t|)
## (Intercept)                                                31.151  < 2e-16
## as.numeric(as.character(ReocurringTasks))                  -9.519  < 2e-16
## InfrequentPathsTRUE                                         1.307   0.1913
## MinerHeuristics                                            -0.369   0.7123
## MinerILP                                                    8.132 5.85e-16
## MinerInductive                                              4.156 3.31e-05
## InfrequentPathsTRUE:MinerHeuristics                         0.332   0.7396
## InfrequentPathsTRUE:MinerILP                               -0.980   0.3271
## InfrequentPathsTRUE:MinerInductive                         -2.353   0.0187
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics   4.550 5.56e-06
## as.numeric(as.character(ReocurringTasks)):MinerILP          6.747 1.77e-11
## as.numeric(as.character(ReocurringTasks)):MinerInductive   -2.407   0.0162
##                                                              
## (Intercept)                                               ***
## as.numeric(as.character(ReocurringTasks))                 ***
## InfrequentPathsTRUE                                          
## MinerHeuristics                                              
## MinerILP                                                  ***
## MinerInductive                                            ***
## InfrequentPathsTRUE:MinerHeuristics                          
## InfrequentPathsTRUE:MinerILP                                 
## InfrequentPathsTRUE:MinerInductive                        *  
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics ***
## as.numeric(as.character(ReocurringTasks)):MinerILP        ***
## as.numeric(as.character(ReocurringTasks)):MinerInductive  *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3347 on 3460 degrees of freedom
## Multiple R-squared:  0.2419, Adjusted R-squared:  0.2395 
## F-statistic: 100.4 on 11 and 3460 DF,  p-value: < 2.2e-16

Now, let’s have a look at some diagnostic plots

Robust regression model

## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Call: rlm(formula = avg_recall ~ as.numeric(as.character(ReocurringTasks)) + 
##     InfrequentPaths + Miner + Miner * InfrequentPaths + Miner * 
##     as.numeric(as.character(ReocurringTasks)), data = summarized_results)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.878210 -0.215771  0.002209  0.217716  0.660602 
## 
## Coefficients:
##                                                           Value   
## (Intercept)                                                 0.8037
## as.numeric(as.character(ReocurringTasks))                  -1.5476
## InfrequentPathsTRUE                                         0.0745
## MinerHeuristics                                            -0.0453
## MinerILP                                                    0.1953
## MinerInductive                                              0.0874
## InfrequentPathsTRUE:MinerHeuristics                        -0.0171
## InfrequentPathsTRUE:MinerILP                               -0.0763
## InfrequentPathsTRUE:MinerInductive                         -0.1099
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics   1.2177
## as.numeric(as.character(ReocurringTasks)):MinerILP          1.5501
## as.numeric(as.character(ReocurringTasks)):MinerInductive    0.0058
##                                                           Std. Error
## (Intercept)                                                 0.0242  
## as.numeric(as.character(ReocurringTasks))                   0.1176  
## InfrequentPathsTRUE                                         0.0235  
## MinerHeuristics                                             0.0343  
## MinerILP                                                    0.0343  
## MinerInductive                                              0.0343  
## InfrequentPathsTRUE:MinerHeuristics                         0.0333  
## InfrequentPathsTRUE:MinerILP                                0.0333  
## InfrequentPathsTRUE:MinerInductive                          0.0333  
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics   0.1663  
## as.numeric(as.character(ReocurringTasks)):MinerILP          0.1663  
## as.numeric(as.character(ReocurringTasks)):MinerInductive    0.1663  
##                                                           t value 
## (Intercept)                                                33.1605
## as.numeric(as.character(ReocurringTasks))                 -13.1642
## InfrequentPathsTRUE                                         3.1698
## MinerHeuristics                                            -1.3216
## MinerILP                                                    5.6971
## MinerInductive                                              2.5504
## InfrequentPathsTRUE:MinerHeuristics                        -0.5155
## InfrequentPathsTRUE:MinerILP                               -2.2953
## InfrequentPathsTRUE:MinerInductive                         -3.3054
## as.numeric(as.character(ReocurringTasks)):MinerHeuristics   7.3241
## as.numeric(as.character(ReocurringTasks)):MinerILP          9.3238
## as.numeric(as.character(ReocurringTasks)):MinerInductive    0.0350
## 
## Residual standard error: 0.3222 on 3460 degrees of freedom