build-vs-recall

CAGRA

Setup

library(readr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

# Load the raw search results (one row per measured search run) for the
# raft_cagra_hnswlib index at k = 100, batch size 1.
dat <- read_csv(
  file = "h100/sift-128-euclidean/result/search/raft_cagra_hnswlib,base,k100,bs1,raw.csv"
)
Rows: 720 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): algo_name, index_name, label
dbl (17): recall, throughput, latency, threads, cpu_time, GPU, ef, end_to_en...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Derived columns used by the plots and models below.
# `build GPU` is the build-time column of the search CSV; give it a
# syntactic name.
dat[["build_time"]] <- dat[["build GPU"]]
# Build time relative to the fastest build (fastest == 1).
dat[["build_time0"]] <- dat[["build_time"]] / min(dat[["build_time"]])
# Logit of recall: -log(1 / r - 1) == log(r / (1 - r)).
dat[["logit"]] <- -log(1 / dat[["recall"]] - 1)
# Treat ef as a categorical level rather than a numeric covariate.
dat[["ef"]] <- factor(dat[["ef"]])
# Label for each build configuration: "<graph_degree>-i<intermediate_graph_degree>".
dat[["gd"]] <- paste0(
  dat[["graph_degree"]],
  "-i",
  dat[["intermediate_graph_degree"]]
)

# Load the index build results (one row per build configuration).
build <- read_csv(
  file = "h100/sift-128-euclidean/result/build/raft_cagra_hnswlib,base.csv"
)
Rows: 16 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): algo_name, index_name, label
dbl (6): time, threads, cpu_time, GPU, graph_degree, intermediate_graph_degree

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Collapse the search runs to one row per build configuration (`gd` label),
# averaging recall and build time over all runs of that configuration.
gd <- summarize(
  group_by(dat, gd),
  recall = mean(recall),
  build_time = mean(build_time)
)

# Mean recall against mean build time, one point per configuration, with a
# smoother overlaid.
ggplot(data = gd, mapping = aes(x = build_time, y = recall)) +
  geom_point() +
  geom_smooth(span = 0.8) +
  ggtitle(
    label = "Average Recall vs Average Build Time",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Rank configurations by recall and by build time. rank() is ascending, so
# rank 1 is the lowest recall / the shortest build.
gd[["recall_rnk"]] <- rank(gd[["recall"]])
gd[["build_time_rnk"]] <- rank(gd[["build_time"]])
# Ratio > 1: the configuration ranks higher on recall than on build time.
gd[["ratio"]] <- gd[["recall_rnk"]] / gd[["build_time_rnk"]]
# Show the configurations from best to worst ratio.
ratio_order <- order(gd[["ratio"]], decreasing = TRUE)
gd[ratio_order, ]
# A tibble: 16 × 6
   gd       recall build_time recall_rnk build_time_rnk ratio
   <chr>     <dbl>      <dbl>      <dbl>          <dbl> <dbl>
 1 64-i32    0.989       3.08          2              1 2    
 2 128-i64   0.998       4.27          8              5 1.6  
 3 256-i32   0.989       3.16          3              2 1.5  
 4 256-i64   0.998       4.36          9              6 1.5  
 5 256-i96   0.999       4.99         13             10 1.3  
 6 64-i96    0.998       4.84         11              9 1.22 
 7 128-i96   0.999       5.11         14             12 1.17 
 8 256-i128  1.00        6.03         16             15 1.07 
 9 32-i32    0.989       3.43          4              4 1    
10 32-i64    0.990       4.39          7              7 1    
11 128-i128  1.00        6.22         15             16 0.938
12 64-i64    0.998       5.05         10             11 0.909
13 64-i128   0.998       5.68         12             14 0.857
14 32-i96    0.990       4.46          6              8 0.75 
15 32-i128   0.989       5.44          5             13 0.385
16 128-i32   0.989       3.21          1              3 0.333
# Recall rank against build-time rank, one point per build configuration.
# Every layer, including ggtitle(), must be chained with `+`; a ggtitle() call
# left on its own line is evaluated as a separate statement, so the plot gets
# no title and the labels object is printed instead.
ggplot(gd, aes(build_time_rnk, recall_rnk)) +
  geom_point() +
  geom_smooth(span = 0.8) +
  ggtitle(
    "Average Recall Ranks vs Average Build Time Ranks",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Plots

# Recall against build time for every search run, coloured by ef, with one
# smoother per ef level.
ggplot(data = dat, mapping = aes(x = build_time, y = recall, color = ef)) +
  geom_point() +
  geom_smooth(span = 0.8) +
  ggtitle(
    label = "Recall vs Build Time by ef",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same data as the coloured plot, but with one panel per ef level.
ggplot(data = dat, mapping = aes(x = build_time, y = recall)) +
  geom_point() +
  geom_smooth(span = 0.8) +
  facet_wrap(~ef) +
  ggtitle(
    label = "Recall vs Build Time by ef",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Linear model of logit(recall) on ef (categorical) plus build time.
lm1 <- lm(formula = logit ~ ef + build_time, data = dat)
summary(lm1)

Call:
lm(formula = logit ~ ef + build_time, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0813 -0.3888  0.0784  0.5396  1.6171 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.62993    0.13762   4.577 5.56e-06 ***
ef200        1.26344    0.08301  15.220  < 2e-16 ***
ef400        2.95730    0.08301  35.625  < 2e-16 ***
ef600        3.65674    0.08301  44.051  < 2e-16 ***
ef800        4.06981    0.08301  49.027  < 2e-16 ***
build_time   0.82072    0.02702  30.377  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7044 on 714 degrees of freedom
Multiple R-squared:  0.8584,    Adjusted R-squared:  0.8574 
F-statistic: 865.4 on 5 and 714 DF,  p-value: < 2.2e-16
# Fitted values of lm1 on the logit scale, then mapped back to the recall
# scale with the inverse logit 1 / (1 + exp(-x)).
dat[["plogit"]] <- predict(lm1)
dat[["p"]] <- 1 / (1 + exp(-dat[["plogit"]]))

# Fitted recall against build time, one line per ef level.
ggplot(data = dat, mapping = aes(x = build_time, y = p, color = ef)) +
  geom_line() +
  ggtitle(
    label = "Recall vs Build Time by ef",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )

# Recall against build time for every search run, coloured by build
# configuration; the per-configuration means from `gd` are overlaid as large
# crosses (shape 3).
# ggtitle() has to be chained with `+`; on its own line it is evaluated as a
# separate statement, leaving the plot untitled and printing the labels object.
ggplot(dat, aes(build_time, recall, color = gd)) +
  geom_point() +
  geom_point(data = gd, size = 4, shape = 3) +
  ggtitle(
    "Recall vs Build Time by Graph Degree",
    subtitle = "Sift 128 Euclidean RAFT CAGRA HNSW k100 bs1"
  )

How do you get low build time and high recall?

  • High recall ~ high ef + high graph_degree + high intermediate_graph_degree

  • Low build time ~ low intermediate_graph_degree (in the build-time model below, graph_degree has no significant effect)

# Linear model of logit(recall) on ef, graph degree and intermediate graph
# degree, all treated as categorical factors.
lm2 <- lm(
  formula = logit ~ ef + factor(graph_degree) + factor(intermediate_graph_degree),
  data = dat
)
summary(lm2)

Call:
lm(formula = logit ~ ef + factor(graph_degree) + factor(intermediate_graph_degree), 
    data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.0795 -0.3208 -0.0930  0.2863  1.4358 

Coefficients:
                                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)                           2.23736    0.06418   34.86   <2e-16 ***
ef200                                 1.26344    0.06119   20.65   <2e-16 ***
ef400                                 2.95730    0.06119   48.33   <2e-16 ***
ef600                                 3.65674    0.06119   59.76   <2e-16 ***
ef800                                 4.06981    0.06119   66.51   <2e-16 ***
factor(graph_degree)64                1.07026    0.05473   19.55   <2e-16 ***
factor(graph_degree)128               1.40340    0.05473   25.64   <2e-16 ***
factor(graph_degree)256               1.41816    0.05473   25.91   <2e-16 ***
factor(intermediate_graph_degree)64   1.26139    0.05473   23.05   <2e-16 ***
factor(intermediate_graph_degree)96   1.68729    0.05473   30.83   <2e-16 ***
factor(intermediate_graph_degree)128  1.85474    0.05473   33.89   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5193 on 709 degrees of freedom
Multiple R-squared:  0.9236,    Adjusted R-squared:  0.9225 
F-statistic: 856.7 on 10 and 709 DF,  p-value: < 2.2e-16
# Linear model of build time on graph degree and intermediate graph degree
# (both categorical), fitted on the build results table.
lm3 <- lm(
  formula = time ~ factor(graph_degree) + factor(intermediate_graph_degree),
  data = build
)
summary(lm3)

Call:
lm(formula = time ~ factor(graph_degree) + factor(intermediate_graph_degree), 
    data = build)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.34442 -0.19748 -0.07847  0.15978  0.47934 

Coefficients:
                                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)                            3.0434     0.2070  14.705 1.34e-07 ***
factor(graph_degree)64                 0.2319     0.2213   1.048 0.321871    
factor(graph_degree)128                0.2717     0.2213   1.228 0.250654    
factor(graph_degree)256                0.2045     0.2213   0.924 0.379530    
factor(intermediate_graph_degree)64    1.2962     0.2213   5.858 0.000241 ***
factor(intermediate_graph_degree)96    1.6313     0.2213   7.373 4.22e-05 ***
factor(intermediate_graph_degree)128   2.6198     0.2213  11.841 8.63e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3129 on 9 degrees of freedom
Multiple R-squared:  0.9417,    Adjusted R-squared:  0.9028 
F-statistic: 24.21 on 6 and 9 DF,  p-value: 4.536e-05
# Mean build time for each graph_degree (rows) x intermediate_graph_degree
# (columns) combination.
with(
  dat,
  tapply(build_time, list(graph_degree, intermediate_graph_degree), mean)
)
          32       64       96      128
32  3.428088 4.389299 4.464433 5.439049
64  3.082024 5.050846 4.839544 5.676099
128 3.213393 4.266828 5.108272 6.219005
256 3.157951 4.359390 4.994607 6.026751

Example

  • Graph degree = 64

  • Intermediate graph degree = 128

  • EF = 800

# Row masks for the example configuration (graph degree 64, intermediate
# graph degree 128): ind1 in the build table, ind2 in the search table at
# ef = 800.
ind1 <- build$graph_degree == 64 & build$intermediate_graph_degree == 128
ind2 <- dat$graph_degree == 64 & dat$intermediate_graph_degree == 128 &
  dat$ef == 800
# The search table has 720 rows for 16 configurations and 5 ef levels, so
# ind2 presumably matches several rows. Comparing dat$recall against that
# whole subset would silently recycle it element by element, so reduce it to
# a single reference value first.
ref_recall <- mean(dat$recall[ind2])
# Share of all search runs whose recall is below the example's recall.
mean(dat$recall < ref_recall)
[1] 0.8680556
# Share of build configurations that build faster than the example
# configuration. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
mean(build$time < build[ind1, "time", drop = TRUE])
[1] 0.8125