library(bigMap)
# load aux. stuff 
source('../primes.R')

Load data

load('../P1M.RData')

Run FIt-SNE

Use the script below to run FIt-SNE on a HPC platform with 20 cores and 8GB/core,

# ./FItSNE/P1M_start.py: 
# Run FIt-SNE on Primes1M

import numpy as np
import scipy.sparse
import sympy
import json
import time

from openTSNE import TSNEEmbedding
from openTSNE.affinity import PerplexityBasedNN
from openTSNE import initialization

# silence warnings
import warnings
warnings.filterwarnings("ignore")

# +++ data
max_ints = 1000000
primes = list(sympy.primerange(2, max_ints +1))
prime_to_column = {p:i for i, p in enumerate(primes)}

X_rows = []
X_data = []
for n in range(2, max_ints +1):
    f = sympy.factorint(n)
    X_rows.append([prime_to_column[p] for p in f.keys()])
    X_data.append([v for v in f.values()])

X = scipy.sparse.X_matrix((len(X_rows), len(primes)), dtype=np.float32)
X.rows = np.array(X_rows)
X.data = np.array(X_data)

output = {}
# +++ perplexities
for ppx in [50, 100, 200, 300, 400]:

  # +++ affinities
  strt = time.time()
  X_affi = PerplexityBasedNN(X, perplexity = ppx, metric = "euclidean", n_jobs = -1, random_state = 55, verbose = True)
  tX2P = time.time() - strt
  
  # +++ FIt-SNE
  strt = time.time()
  Y_init = initialization.random(max_ints, random_state = 55, verbose = T)
  Emb = TSNEEmbedding(Y_init, X_affi, negative_gradient_method = "fft", n_jobs = -1, verbose = True)
  Emb = Emb.optimize(n_iter = 250, exaggeration = 12, momentum = 0.5)
  Emb = Emb.optimize(n_iter = 750, momentum = 0.8)
  t = time.time() -strt
  
  output[str(ppx)] = {'Y': Emb.tolist(), 'ppx': ppx, 'tX2P': tX2P, 't': t}
  # +++ save up to here
  with open('./FItSNE/P1M_fitsne.json', 'w') as g:
          json.dump(output, g)

Submit:

$ qsub -pe make 20 -l h_vmem=8G pySckt ~/FItSNE/P1M_start.py

Convert from json to RData

json.list <- rjson::fromJSON(file = './FItSNE/P1M_fitsne.json')
m.list <- lapply(json.list, function(j){
  m <- list(dSet = 'P1M', is.distance = F, is.sparse = T, normalize = T)
  m$ppx <- list(ppx = j$ppx)
  m$ptsne <- list(threads = 20, GB = 160, Y = matrix(unlist(j$Y), ncol = 2, byrow = T))
  m$t <- list(affinities = j$tX2P, fitsne = j$t)
  m
})
save(m.list, file = './P1M_fitsne.RData')
# load m.list 
load('./P1M_fitsne.RData')

Output

nulL <- lapply(m.list, function(m) primes.plot(P1M, m))

hl-Correlation

hlTable <- sapply(m.list, function(m) summary(m$hlC)[4])
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
50 100 200 300 400
<hlC> 0.3528 0.3883 0.4061 0.4339 0.4446

k-ary neighborhood preservation

bdm.knp.plot(m.list, ppxfrmt = 0)

Running Times

rTimes <- sapply(m.list, function(m) c(m$t$affinities, m$t$fitsne))
rTimes <- round(rbind(rTimes, apply(rTimes, 2, sum)) /60, 2)
colnames(rTimes) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(rTimes) <- c('affinities', 'FIt-SNE', 'total')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
  kable_styling(full_width = F)
Computation times (min)
50 100 200 300 400
affinities 271.52 302.93 374.57 407.52 438.35
FIt-SNE 13.78 19.83 30.02 38.99 52.57
total 285.30 322.77 404.59 446.51 490.91

Run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 20 cores, 8GB/core RAM.