Confidence Intervals

Code for calculating confidence intervals for the GECCO benchmarks paper.

Data

# Raw counts for the three methods being compared; each vector has 29 entries
# (presumably one per benchmark problem -- confirm against the paper).
# The prop.test() calls later in this file treat every entry as a count out of
# 100 trials.
lex <- c(
  98, 5, 1, 7, 6, 0, 51, 0, 2, 0, 66, 21, 16, 8, 78, 0, 6, 1, 8, 0, 45, 2, 0, 0, 7, 4, 45, 81, 18
)
tourney <- c(
  68, 3, 0, 3, 0, 0, 8, 0, 0, 0, 7, 8, 14, 0, 46, 0, 2, 0, 0, 0, 10, 0, 0, 0, 0, 0, 7, 75, 1
)
ifs <- c(
  72, 3, 0, 6, 0, 0, 16, 0, 0, 0, 10, 4, 13, 0, 64, 0, 0, 0, 0, 0, 8, 0, 0, 0, 1, 0, 43, 98, 7
)

Differences

# Element-wise difference between the lex and tourney counts, divided by 100.
(lex - tourney) / 100
##  [1] 0.30 0.02 0.01 0.04 0.06 0.00 0.43 0.00 0.02 0.00 0.59 0.13 0.02 0.08
## [15] 0.32 0.00 0.04 0.01 0.08 0.00 0.35 0.02 0.00 0.00 0.07 0.04 0.38 0.06
## [29] 0.17
# Element-wise difference between the lex and ifs counts, divided by 100.
(lex - ifs) / 100
##  [1]  0.26  0.02  0.01  0.01  0.06  0.00  0.35  0.00  0.02  0.00  0.56
## [12]  0.17  0.03  0.08  0.14  0.00  0.06  0.01  0.08  0.00  0.37  0.02
## [23]  0.00  0.00  0.06  0.04  0.02 -0.17  0.11

Confidence Intervals of Differences

# Lower bounds of the confidence intervals for the difference in proportions,
# lexicase vs. tourney. prop.test() is run once per position of lex/tourney
# with 100 trials per group, at prop.test's default confidence level;
# conf.int[1] is the lower limit. vapply() pins the result to one numeric per
# position, and the index range follows the data length rather than a
# hard-coded 29.
vapply(
  seq_along(lex),
  function(x) prop.test(c(lex[x], tourney[x]), c(100, 100))$conf.int[1],
  numeric(1)
)
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
##  [1]  0.194543516 -0.044245345 -0.019501395 -0.030155251  0.003453434
##  [6]  0.000000000  0.308523010  0.000000000 -0.017439496  0.000000000
## [11]  0.474543738  0.024081838 -0.088934509  0.016827510  0.182979816
## [16]  0.000000000 -0.024032478 -0.019501395  0.016827510  0.000000000
## [21]  0.226136382 -0.017439496  0.000000000  0.000000000  0.009992104
## [26] -0.008407293  0.260417153 -0.064519607  0.082216341
# Upper bounds of the confidence intervals for the difference in proportions,
# lexicase vs. tourney. prop.test() is run once per position of lex/tourney
# with 100 trials per group, at prop.test's default confidence level;
# conf.int[2] is the upper limit. vapply() pins the result to one numeric per
# position, and the index range follows the data length rather than a
# hard-coded 29.
vapply(
  seq_along(lex),
  function(x) prop.test(c(lex[x], tourney[x]), c(100, 100))$conf.int[2],
  numeric(1)
)
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], tourney[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
##  [1] 0.40545648 0.08424535 0.03950140 0.11015525 0.11654657 0.00000000
##  [7] 0.55147699 0.00000000 0.05743950 0.00000000 0.70545626 0.23591816
## [13] 0.12893451 0.14317249 0.45702018 0.00000000 0.10403248 0.03950140
## [19] 0.14317249 0.00000000 0.47386362 0.05743950 0.00000000 0.00000000
## [25] 0.13000790 0.08840729 0.49958285 0.18451961 0.25778366
# Lower bounds of the confidence intervals for the difference in proportions,
# lexicase vs. ifs. prop.test() is run once per position of lex/ifs with 100
# trials per group, at prop.test's default confidence level; conf.int[1] is
# the lower limit. vapply() pins the result to one numeric per position, and
# the index range follows the data length rather than a hard-coded 29.
vapply(
  seq_along(lex),
  function(x) prop.test(c(lex[x], ifs[x]), c(100, 100))$conf.int[1],
  numeric(1)
)
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
##  [1]  0.157819162 -0.044245345 -0.019501395 -0.068318171  0.003453434
##  [6]  0.000000000  0.218498040  0.000000000 -0.017439496  0.000000000
## [11]  0.440102109  0.071410495 -0.077506977  0.016827510  0.005731482
## [16]  0.000000000  0.003453434 -0.019501395  0.016827510  0.000000000
## [21]  0.248937296 -0.017439496  0.000000000  0.000000000 -0.003675824
## [26] -0.008407293 -0.127560991 -0.261639029  0.009607617
# Upper bounds of the confidence intervals for the difference in proportions,
# lexicase vs. ifs. prop.test() is run once per position of lex/ifs with 100
# trials per group, at prop.test's default confidence level; conf.int[2] is
# the upper limit. vapply() pins the result to one numeric per position, and
# the index range follows the data length rather than a hard-coded 29.
vapply(
  seq_along(lex),
  function(x) prop.test(c(lex[x], ifs[x]), c(100, 100))$conf.int[2],
  numeric(1)
)
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
## Warning in prop.test(c(lex[x], ifs[x]), c(100, 100)): Chi-squared
## approximation may be incorrect
##  [1]  0.36218084  0.08424535  0.03950140  0.08831817  0.11654657
##  [6]  0.00000000  0.48150196  0.00000000  0.05743950  0.00000000
## [11]  0.67989789  0.26858950  0.13750698  0.14317249  0.27426852
## [16]  0.00000000  0.11654657  0.03950140  0.14317249  0.00000000
## [21]  0.49106270  0.05743950  0.00000000  0.00000000  0.12367582
## [26]  0.08840729  0.16756099 -0.07836097  0.21039238

Clojure code to transform the results above into LaTeX table rows

;; Print one LaTeX table row per position of the vectors below: the lex-tourney
;; difference followed by its confidence interval, then the lex-ifs difference
;; followed by its confidence interval. The numbers are pasted from the R
;; output earlier in this file.
(let [lex-tourn-diff [0.30 0.02 0.01 0.04 0.06 0.00 0.43 0.00 0.02 0.00 0.59 0.13 0.02 0.08 0.32 0.00 0.04 0.01 0.08 0.00 0.35 0.02 0.00 0.00 0.07 0.04 0.38 0.06 0.17]
      lex-ifs-diff [0.26  0.02  0.01  0.01  0.06  0.00  0.35  0.00  0.02  0.00  0.56  0.17  0.03  0.08  0.14  0.00  0.06  0.01  0.08  0.00  0.37  0.02  0.00  0.00  0.06  0.04  0.02 -0.17  0.11]
      lex-tourn-low [0.194543516 -0.044245345 -0.019501395 -0.030155251  0.003453434  0.000000000  0.308523010
                     0.000000000 -0.017439496  0.000000000  0.474543738  0.024081838 -0.088934509  0.016827510
                     0.182979816  0.000000000 -0.024032478 -0.019501395  0.016827510  0.000000000  0.226136382
                     -0.017439496  0.000000000  0.000000000  0.009992104 -0.008407293  0.260417153 -0.064519607  0.082216341]
      lex-tourn-up [0.40545648 0.08424535 0.03950140 0.11015525 0.11654657 0.00000000 0.55147699 0.00000000 0.05743950 0.00000000
                    0.70545626 0.23591816 0.12893451 0.14317249 0.45702018 0.00000000 0.10403248 0.03950140 0.14317249 0.00000000
                    0.47386362 0.05743950 0.00000000 0.00000000 0.13000790 0.08840729 0.49958285 0.18451961 0.25778366]
      lex-ifs-low [0.157819162 -0.044245345 -0.019501395 -0.068318171  0.003453434  0.000000000  0.218498040  0.000000000
                   -0.017439496  0.000000000  0.440102109  0.071410495 -0.077506977  0.016827510  0.005731482  0.000000000
                   0.003453434 -0.019501395  0.016827510  0.000000000  0.248937296 -0.017439496  0.000000000  0.000000000
                   -0.003675824 -0.008407293 -0.127560991 -0.261639029  0.009607617]
      lex-ifs-up [0.36218084  0.08424535  0.03950140  0.08831817  0.11654657  0.00000000  0.48150196  0.00000000  0.05743950
                  0.00000000  0.67989789  0.26858950  0.13750698  0.14317249  0.27426852  0.00000000  0.11654657  0.03950140
                  0.14317249  0.00000000  0.49106270  0.05743950  0.00000000  0.00000000  0.12367582  0.08840729  0.16756099
                  -0.07836097  0.21039238]
      row-format "$%.2f$ & $[%.2f, %.2f]$ & $%.2f$ & $[%.2f, %.2f]$ \\tabularnewline"
      ;; Columns listed in the same order as the placeholders in row-format.
      columns [lex-tourn-diff lex-tourn-low lex-tourn-up
               lex-ifs-diff lex-ifs-low lex-ifs-up]]
  ;; Transpose the columns into per-row vectors, then format and print each row.
  (doseq [row (apply map vector columns)]
    (println (apply format row-format row))))

Friedman’s test

Friedman’s rank sum test, treating each of the 29 rows of `benchResults` as a block and each of the three columns (lexicase, tourney, ifs) as a treatment.

# Assemble the counts into a matrix with one row per entry of lex/tourney/ifs
# and one column per method. The row count and row names are derived from the
# data instead of a hard-coded 29, and the three vectors are checked to have
# equal length so matrix() cannot silently misalign the columns.
stopifnot(length(tourney) == length(lex), length(ifs) == length(lex))
benchResults <- matrix(
  c(lex, tourney, ifs),
  nrow = length(lex),
  dimnames = list(seq_along(lex), c("lexicase", "tourney", "ifs"))
)

# Rank the three methods within each row of benchResults. The counts are
# negated so that rank 1 goes to the largest count; tied counts share the
# average rank (rank()'s default).
apply(-benchResults, MARGIN = 1, FUN = rank)
##          1   2   3 4   5 6 7 8   9 10 11 12 13  14 15 16 17  18  19 20 21
## lexicase 1 1.0 1.0 1 1.0 2 1 2 1.0  2  1  1  1 1.0  1  2  1 1.0 1.0  2  1
## tourney  3 2.5 2.5 3 2.5 2 3 2 2.5  2  3  2  2 2.5  3  2  2 2.5 2.5  2  2
## ifs      2 2.5 2.5 2 2.5 2 2 2 2.5  2  2  3  3 2.5  2  2  3 2.5 2.5  2  3
##           22 23 24 25  26 27 28 29
## lexicase 1.0  2  2  1 1.0  1  2  1
## tourney  2.5  2  2  3 2.5  3  3  3
## ifs      2.5  2  2  2 2.5  2  1  2
# Mean within-row rank of each method across all rows of benchResults
# (rank 1 is assigned to the largest count in a row).
rowMeans(apply(-benchResults, MARGIN = 1, FUN = rank))
## lexicase  tourney      ifs 
## 1.275862 2.465517 2.258621
# Friedman rank sum test on the benchResults matrix; for a matrix argument,
# friedman.test() takes the rows as blocks and the columns as groups.
friedman.test(benchResults)
## 
##  Friedman rank sum test
## 
## data:  benchResults
## Friedman chi-squared = 34.4051, df = 2, p-value = 3.381e-08