library(bigMap)
# load aux. stuff 
source('../primes.R')

Load data

The Primes1G data set is a structured representation of the first 10^6 integers based on their prime factor decomposition. This is a highly sparse matrix with 78498 dimensions (primes lower than 10^6) where the values are the prime factorisation powers. We use a compact matrix format where odd columns hold the values of the columns (primes) in the original matrix and even columns hold the powers themselves. This results in a matrix with 10^6 rows and 14 columns.

This data set is inspired by the work of John Williamson https://johnhw.github.io/umap_primes/index.md.html, though we note some differences:

load('../P1M.RData')

Run pt-SNE

We use the script below to run pt-SNE on this data set using a HPC platform with 101 cores and 8GB/core,

# ./P1M_start.R: 
# Run pt-SNE on Primes1M
# Compute kNP and HL-correlation

# +++ load package
library(bigMap)

# +++ load data
load('./P1M.RData')

# +++ start MPI cluster
threads <- 100
mpi.cl <- bdm.mpi.start(threads)
if (is.null(mpi.cl)) return()

# +++ run pt-SNE
ppx.list <- c(1000, 5000, 10000, 20000, 30000, 40000, 50000, 100000)
m.list <- lapply(ppx.list, function(ppx){
  # +++ compute betas
  m <- bdm.init(P1M, is.sparse = T, ppx = ppx, threads = threads, mpi.cl = mpi.cl)
  # +++ skip re-exporting data to workers
  P1M <- NULL
  # +++ run ptSNE
  m <- bdm.ptsne(P1M, m, theta = 0.5, layers = 2, threads = threads, mpi.cl = mpi.cl)
  # +++ compute kNP
  m <- bdm.knp(P1M, m, k.max = 250000, sampling = 0.25, threads = threads, mpi.cl = mpi.cl)
  # +++ compute hlC
  m <- bdm.hlCorr(P1M, m, threads = threads, mpi.cl = mpi.cl)
  # +++ return embedding
  m
})

# +++ save 
save(m.list, file = './P1M_list.RData')

# +++ stop cluster
bdm.mpi.stop(mpi.cl)

Submit:

$ qsub -pe ompi 101 -l h_vmem=8G Rsnow ~/P1M_start.R

Load results

load('./P1M_list.RData')

Embedding cost/size function

nulL <- lapply(m.list, function(m) bdm.cost(m))

Output

nulL <- lapply(m.list, function(m) primes.plot(P1M, m))

hl-Correlation

hlTable <- sapply(m.list, function(m) mean(m$hlC))
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- c('1k', '5k', '10k', '20k', '30k', '40k', '50k', '100k')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
1k 5k 10k 20k 30k 40k 50k 100k
-0.0129 -0.1178 -0.1117 -0.043 -0.0449 0.0984 0.2043 0.4649

k-ary neighborhood preservation

bdm.knp.plot(m.list)

Running Times

rTimes <- sapply(m.list, function(m) c(m$ppx$t[3], m$t$epoch, m$t$ptsne[3], (m$ppx$t[3] +m$t$ptsne[3])))
rTimes <- round(rTimes /60, 1)
colnames(rTimes) <- c('1k$^{(1)}$', '5k$^{(1)}$', '10k$^{(1)}$', '20k$^{(1)}$', '30k$^{(2)}$', '40k$^{(2)}$', '50k$^{(2)}$', '100k$^{(3)}$')
rownames(rTimes) <- c('betas', 'epoch', 'pt-SNE', 'total')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
  kable_styling(full_width = F)
Computation times (min)
1k\(^{(1)}\) 5k\(^{(1)}\) 10k\(^{(1)}\) 20k\(^{(1)}\) 30k\(^{(2)}\) 40k\(^{(2)}\) 50k\(^{(2)}\) 100k\(^{(3)}\)
betas 17.8 51.6 45.8 56.3 55.6 44.9 50.0 75.3
epoch 2.1 2.2 3.6 3.6 3.5 4.0 4.0 4.2
pt-SNE 115.5 124.0 202.5 203.2 197.6 223.6 226.4 232.3
total 133.4 175.5 248.3 259.5 253.2 268.5 276.5 307.6

Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 101 cores. (1) 4GB/core. (2) 6GB/core. (3) 8GB/core.