library(bigMap)
# load aux. stuff
source('../primes.R')
load('../P1M.RData')
Use the script below to run FIt-SNE on an HPC platform with 20 cores and 8 GB of RAM per core:
# ./FItSNE/P1M_start.py:
# Run FIt-SNE on Primes1M
import numpy as np
import scipy.sparse
import sympy
import json
import time
from openTSNE import TSNEEmbedding
from openTSNE.affinity import PerplexityBasedNN
from openTSNE import initialization
# silence warnings
import warnings
warnings.filterwarnings("ignore")
# +++ data
max_ints = 1000000
primes = list(sympy.primerange(2, max_ints +1))
prime_to_column = {p:i for i, p in enumerate(primes)}
X_rows = []
X_data = []
for n in range(2, max_ints + 1):
    f = sympy.factorint(n)
    X_rows.append([prime_to_column[p] for p in f.keys()])
    X_data.append([v for v in f.values()])
# sparse matrix of prime exponents: one row per integer, one column per prime
X = scipy.sparse.lil_matrix((len(X_rows), len(primes)), dtype=np.float32)
X.rows = np.array(X_rows, dtype=object)
X.data = np.array(X_data, dtype=object)
output = {}
# +++ perplexities
for ppx in [50, 100, 200, 300, 400]:
    # +++ affinities
    strt = time.time()
    X_affi = PerplexityBasedNN(X, perplexity = ppx, metric = "euclidean", n_jobs = -1, random_state = 55, verbose = True)
    tX2P = time.time() - strt
    # +++ FIt-SNE
    strt = time.time()
    Y_init = initialization.random(X.shape[0], random_state = 55, verbose = True)
    Emb = TSNEEmbedding(Y_init, X_affi, negative_gradient_method = "fft", n_jobs = -1, verbose = True)
    # early exaggeration phase, then standard phase
    Emb = Emb.optimize(n_iter = 250, exaggeration = 12, momentum = 0.5)
    Emb = Emb.optimize(n_iter = 750, momentum = 0.8)
    t = time.time() - strt
    output[str(ppx)] = {'Y': Emb.tolist(), 'ppx': ppx, 'tX2P': tX2P, 't': t}
    # +++ save results up to here
    with open('./FItSNE/P1M_fitsne.json', 'w') as g:
        json.dump(output, g)
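As a quick sanity check of the encoding built above, the sketch below (hypothetical, not part of P1M_start.py) rebuilds the same prime-exponent matrix for the toy range 2..12; the row for 12 = 2^2 * 3 should read [2, 1, 0, 0, 0] over the prime columns [2, 3, 5, 7, 11].
# toy-scale sanity check of the sparse prime-exponent encoding (assumed, not part of P1M_start.py)
import numpy as np
import scipy.sparse
import sympy
max_ints = 12
primes = list(sympy.primerange(2, max_ints + 1))          # [2, 3, 5, 7, 11]
prime_to_column = {p: i for i, p in enumerate(primes)}
X = scipy.sparse.lil_matrix((max_ints - 1, len(primes)), dtype=np.float32)
for row, n in enumerate(range(2, max_ints + 1)):
    for p, e in sympy.factorint(n).items():               # e.g. 12 -> {2: 2, 3: 1}
        X[row, prime_to_column[p]] = e
print(X.toarray()[-1])                                    # [2. 1. 0. 0. 0.]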
Submit the full script to the queue:
$ qsub -pe make 20 -l h_vmem=8G pySckt ~/FItSNE/P1M_start.py
# read the FIt-SNE output and convert it to bigMap's m.list format
json.list <- rjson::fromJSON(file = './FItSNE/P1M_fitsne.json')
m.list <- lapply(json.list, function(j){
m <- list(dSet = 'P1M', is.distance = F, is.sparse = T, normalize = T)
m$ppx <- list(ppx = j$ppx)
m$ptsne <- list(threads = 20, GB = 160, Y = matrix(unlist(j$Y), ncol = 2, byrow = T))
m$t <- list(affinities = j$tX2P, fitsne = j$t)
m
})
save(m.list, file = './P1M_fitsne.RData')
# load m.list
load('./P1M_fitsne.RData')
# plot each embedding
nulL <- lapply(m.list, function(m) primes.plot(P1M, m))
# mean hl-correlation per perplexity (element 4 of summary() is the mean)
hlTable <- sapply(m.list, function(m) summary(m$hlC)[4])
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kableExtra::kable_styling(full_width = F)
|       |     50 |    100 |    200 |    300 |    400 |
|:------|-------:|-------:|-------:|-------:|-------:|
| <hlC> | 0.3528 | 0.3883 | 0.4061 | 0.4339 | 0.4446 |
bdm.knp.plot(m.list, ppxfrmt = 0)
rTimes <- sapply(m.list, function(m) c(m$t$affinities, m$t$fitsne))
rTimes <- round(rbind(rTimes, apply(rTimes, 2, sum)) /60, 2)
colnames(rTimes) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(rTimes) <- c('affinities', 'FIt-SNE', 'total')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
  kableExtra::kable_styling(full_width = F)
|            |     50 |    100 |    200 |    300 |    400 |
|:-----------|-------:|-------:|-------:|-------:|-------:|
| affinities | 271.52 | 302.93 | 374.57 | 407.52 | 438.35 |
| FIt-SNE    |  13.78 |  19.83 |  30.02 |  38.99 |  52.57 |
| total      | 285.30 | 322.77 | 404.59 | 446.51 | 490.91 |
Run on: Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz, 32MB cache, 20 cores, 8GB RAM per core.