tSNE_introduction

Load Required Packages

library(readr)
library(Rtsne)

Load data

# Path to the training CSV, kept in one place so it is easy to repoint.
train_csv <- "/Users/mshruti/GIT/Rcodes/MINST/train.csv"
# Every column parses as an integer, so state the column spec explicitly
# instead of letting readr guess it (and report the guess) on every run.
training_set <- read_csv(train_csv, col_types = cols(.default = col_integer()))

## Parsed with column specification:
## cols(
##   .default = col_integer()
## )

## See spec(...) for full column specifications.

# Treat the label column as a categorical variable rather than an integer.
training_set[["label"]] <- factor(training_set[["label"]])
# Show the size of the full data set (rows, columns).
dim(training_set)

## [1] 42000   785

# Work on a random subset of the rows so the t-SNE runs finish quickly.
# The request is capped at the number of available rows, because sampling
# without replacement fails when more rows are requested than exist.
numTrain <- min(5000, nrow(training_set))
set.seed(1) # make the subset reproducible
# seq_len() stays correct for zero rows, where 1:nrow() would give c(1, 0).
rows <- sample(seq_len(nrow(training_set)), numTrain)
train <- training_set[rows, ]

Running tSNE

# Embed the sampled rows (every column except the first) in two dimensions.
set.seed(1) # fixed seed so the embedding is repeatable
tsne <- Rtsne(
  train[, -1],
  dims = 2,
  perplexity = 30,
  max_iter = 500,
  verbose = TRUE
)

## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.75 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 2.60 seconds)
## Iteration 100: error is 83.069224 (50 iterations in 4.34 seconds)
## Iteration 150: error is 81.788717 (50 iterations in 2.44 seconds)
## Iteration 200: error is 81.725660 (50 iterations in 2.36 seconds)
## Iteration 250: error is 81.723231 (50 iterations in 2.32 seconds)
## Iteration 300: error is 2.524986 (50 iterations in 1.91 seconds)
## Iteration 350: error is 2.170094 (50 iterations in 1.99 seconds)
## Iteration 400: error is 1.990478 (50 iterations in 2.04 seconds)
## Iteration 450: error is 1.878187 (50 iterations in 2.21 seconds)
## Iteration 500: error is 1.803537 (50 iterations in 2.20 seconds)
## Fitting performed in 24.41 seconds.

# visualizing
# One colour per label level, named by level and stored in level order.
# Indexing with a factor uses its integer level codes, not its labels, so a
# palette named in order of first appearance (unique()) ends up with names that
# do not match the colours actually drawn. Naming by levels() keeps name-based
# and code-based lookups in agreement.
label_levels <- levels(train$label)
colors <- rainbow(length(label_levels))
names(colors) <- label_levels
# Move the axis titles slightly closer to the axes than the default.
par(mgp = c(2.5, 1, 0))
# type = "n" sets up the axes without drawing points; each observation is then
# drawn as its label text, coloured by label.
plot(tsne$Y, type = "n", main = "tSNE", xlab = "tSNE dimension 1",
     ylab = "tSNE dimension 2", cex.main = 2, cex.lab = 1.5)
text(tsne$Y, labels = train$label, col = colors[as.character(train$label)])

Compare with PCA

# Scores on the first two principal components of the same feature columns.
pca <- princomp(train[, -1])$scores[, 1:2]
# Empty frame first, then every observation drawn as its coloured label.
plot(pca, type = "n", main = "pca", cex.main = 2, cex.lab = 1.5)
text(pca, labels = train$label, col = colors[train$label])

Parameter tweaking

# Run t-SNE on the global `train` sample with the given settings and plot the
# two-dimensional embedding, drawing each observation as its label.
#
# Args:
#   perpl:      value passed to Rtsne() as `perplexity`.
#   iterations: value passed to Rtsne() as `max_iter`.
#   learning:   value passed to Rtsne() as `eta` (learning rate).
#
# Reads `train` and `colors` from the enclosing environment. Called for its
# side effects (console output and a plot); the value is whatever text() returns.
tsne_plot <- function(perpl = 30, iterations = 500, learning = 200) {
  set.seed(1) # same seed for every run so only the settings differ
  embedding <- Rtsne(
    train[, -1],
    dims = 2,
    perplexity = perpl,
    max_iter = iterations,
    eta = learning,
    verbose = TRUE
  )
  plot_title <- paste0(
    "perplexity = ", perpl,
    ", max_iter = ", iterations,
    ", learning rate = ", learning
  )
  # Echo the settings to the console once the fit has finished.
  print(plot_title)
  plot(
    embedding$Y,
    type = "n",
    main = plot_title,
    xlab = "tSNE dimension 1",
    ylab = "tSNE dimension 2",
    cex.main = 1,
    cex.lab = 1.5
  )
  text(embedding$Y, labels = train$label, col = colors[train$label])
}

perplexity_values <- c(2, 5, 30, 50, 100)
# tsne_plot() is called only for its plots, so use a plain loop: sapply()
# would collect the NULL return values and print them as a list.
for (perpl_value in perplexity_values) {
  tsne_plot(perpl = perpl_value)
}

## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 2.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.44 seconds (sparsity = 0.001715)!
## Learning embedding...
## Iteration 50: error is 120.053079 (50 iterations in 2.46 seconds)
## Iteration 100: error is 104.452633 (50 iterations in 2.39 seconds)
## Iteration 150: error is 97.712370 (50 iterations in 2.14 seconds)
## Iteration 200: error is 94.370307 (50 iterations in 2.12 seconds)
## Iteration 250: error is 92.213103 (50 iterations in 1.95 seconds)
## Iteration 300: error is 4.128365 (50 iterations in 2.05 seconds)
## Iteration 350: error is 3.380734 (50 iterations in 2.53 seconds)
## Iteration 400: error is 2.901471 (50 iterations in 2.50 seconds)
## Iteration 450: error is 2.572941 (50 iterations in 2.63 seconds)
## Iteration 500: error is 2.340770 (50 iterations in 2.66 seconds)
## Fitting performed in 23.44 seconds.

## [1] "perplexity = 2, max_iter = 500, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 5.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 3.09 seconds (sparsity = 0.004179)!
## Learning embedding...
## Iteration 50: error is 110.020009 (50 iterations in 1.93 seconds)
## Iteration 100: error is 96.789592 (50 iterations in 2.15 seconds)
## Iteration 150: error is 92.620157 (50 iterations in 2.16 seconds)
## Iteration 200: error is 91.143382 (50 iterations in 1.97 seconds)
## Iteration 250: error is 90.381350 (50 iterations in 1.99 seconds)
## Iteration 300: error is 3.661096 (50 iterations in 1.83 seconds)
## Iteration 350: error is 3.037770 (50 iterations in 1.91 seconds)
## Iteration 400: error is 2.675372 (50 iterations in 2.00 seconds)
## Iteration 450: error is 2.434342 (50 iterations in 2.17 seconds)
## Iteration 500: error is 2.262674 (50 iterations in 2.24 seconds)
## Fitting performed in 20.36 seconds.

## [1] "perplexity = 5, max_iter = 500, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 3.66 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 2.71 seconds)
## Iteration 100: error is 83.069224 (50 iterations in 3.71 seconds)
## Iteration 150: error is 81.788717 (50 iterations in 2.37 seconds)
## Iteration 200: error is 81.725660 (50 iterations in 2.34 seconds)
## Iteration 250: error is 81.723231 (50 iterations in 2.30 seconds)
## Iteration 300: error is 2.524986 (50 iterations in 1.94 seconds)
## Iteration 350: error is 2.170094 (50 iterations in 1.94 seconds)
## Iteration 400: error is 1.990478 (50 iterations in 1.99 seconds)
## Iteration 450: error is 1.878187 (50 iterations in 2.02 seconds)
## Iteration 500: error is 1.803537 (50 iterations in 2.06 seconds)
## Fitting performed in 23.36 seconds.

## [1] "perplexity = 30, max_iter = 500, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 3.18 seconds (sparsity = 0.040749)!
## Learning embedding...
## Iteration 50: error is 83.303538 (50 iterations in 2.90 seconds)
## Iteration 100: error is 79.895090 (50 iterations in 3.68 seconds)
## Iteration 150: error is 78.683152 (50 iterations in 2.94 seconds)
## Iteration 200: error is 78.606307 (50 iterations in 2.56 seconds)
## Iteration 250: error is 78.586525 (50 iterations in 2.55 seconds)
## Iteration 300: error is 2.193748 (50 iterations in 2.12 seconds)
## Iteration 350: error is 1.890925 (50 iterations in 2.09 seconds)
## Iteration 400: error is 1.748897 (50 iterations in 2.34 seconds)
## Iteration 450: error is 1.665720 (50 iterations in 2.13 seconds)
## Iteration 500: error is 1.613797 (50 iterations in 2.19 seconds)
## Fitting performed in 25.50 seconds.

## [1] "perplexity = 50, max_iter = 500, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 5.32 seconds (sparsity = 0.082373)!
## Learning embedding...
## Iteration 50: error is 75.043971 (50 iterations in 3.33 seconds)
## Iteration 100: error is 74.838775 (50 iterations in 4.66 seconds)
## Iteration 150: error is 73.729076 (50 iterations in 3.37 seconds)
## Iteration 200: error is 73.729182 (50 iterations in 3.55 seconds)
## Iteration 250: error is 73.744873 (50 iterations in 4.38 seconds)
## Iteration 300: error is 1.895279 (50 iterations in 3.26 seconds)
## Iteration 350: error is 1.618559 (50 iterations in 2.60 seconds)
## Iteration 400: error is 1.500955 (50 iterations in 2.64 seconds)
## Iteration 450: error is 1.436220 (50 iterations in 2.65 seconds)
## Iteration 500: error is 1.399407 (50 iterations in 2.63 seconds)
## Fitting performed in 33.07 seconds.

## [1] "perplexity = 100, max_iter = 500, learning rate = 200"

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL

iteration_values <- c(10, 50, 100, 1000)
# tsne_plot() is called only for its plots, so use a plain loop: sapply()
# would collect the NULL return values and print them as a list.
for (iter_value in iteration_values) {
  tsne_plot(iterations = iter_value)
}

## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.86 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 10: error is 89.338428 (50 iterations in 0.40 seconds)
## Fitting performed in 0.40 seconds.

## [1] "perplexity = 30, max_iter = 10, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.79 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 2.55 seconds)
## Fitting performed in 2.55 seconds.

## [1] "perplexity = 30, max_iter = 50, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.22 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 2.49 seconds)
## Iteration 100: error is 83.069224 (50 iterations in 3.68 seconds)
## Fitting performed in 6.17 seconds.

## [1] "perplexity = 30, max_iter = 100, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 3.58 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 3.40 seconds)
## Iteration 100: error is 83.069224 (50 iterations in 4.02 seconds)
## Iteration 150: error is 81.788717 (50 iterations in 2.33 seconds)
## Iteration 200: error is 81.725660 (50 iterations in 2.39 seconds)
## Iteration 250: error is 81.723231 (50 iterations in 2.33 seconds)
## Iteration 300: error is 2.524986 (50 iterations in 1.96 seconds)
## Iteration 350: error is 2.170094 (50 iterations in 1.96 seconds)
## Iteration 400: error is 1.990478 (50 iterations in 2.00 seconds)
## Iteration 450: error is 1.878187 (50 iterations in 2.00 seconds)
## Iteration 500: error is 1.803537 (50 iterations in 2.04 seconds)
## Iteration 550: error is 1.751621 (50 iterations in 2.09 seconds)
## Iteration 600: error is 1.716791 (50 iterations in 2.32 seconds)
## Iteration 650: error is 1.694353 (50 iterations in 2.06 seconds)
## Iteration 700: error is 1.679573 (50 iterations in 2.09 seconds)
## Iteration 750: error is 1.669893 (50 iterations in 2.11 seconds)
## Iteration 800: error is 1.662544 (50 iterations in 2.10 seconds)
## Iteration 850: error is 1.655562 (50 iterations in 2.10 seconds)
## Iteration 900: error is 1.649253 (50 iterations in 2.12 seconds)
## Iteration 950: error is 1.643419 (50 iterations in 2.11 seconds)
## Iteration 1000: error is 1.637823 (50 iterations in 2.09 seconds)
## Fitting performed in 45.62 seconds.

## [1] "perplexity = 30, max_iter = 1000, learning rate = 200"

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

learning_values <- c(20, 200, 2000)
# tsne_plot() is called only for its plots, so use a plain loop: sapply()
# would collect the NULL return values and print them as a list.
for (eta_value in learning_values) {
  tsne_plot(learning = eta_value)
}

## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.65 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338428 (50 iterations in 1.91 seconds)
## Iteration 100: error is 89.338426 (50 iterations in 2.25 seconds)
## Iteration 150: error is 89.256937 (50 iterations in 2.92 seconds)
## Iteration 200: error is 83.709663 (50 iterations in 2.86 seconds)
## Iteration 250: error is 81.886814 (50 iterations in 2.30 seconds)
## Iteration 300: error is 2.869761 (50 iterations in 1.91 seconds)
## Iteration 350: error is 2.552185 (50 iterations in 1.90 seconds)
## Iteration 400: error is 2.375511 (50 iterations in 1.88 seconds)
## Iteration 450: error is 2.254268 (50 iterations in 1.88 seconds)
## Iteration 500: error is 2.163340 (50 iterations in 1.92 seconds)
## Fitting performed in 21.71 seconds.

## [1] "perplexity = 30, max_iter = 500, learning rate = 20"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.98 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 89.338270 (50 iterations in 2.45 seconds)
## Iteration 100: error is 83.069224 (50 iterations in 3.67 seconds)
## Iteration 150: error is 81.788717 (50 iterations in 2.27 seconds)
## Iteration 200: error is 81.725660 (50 iterations in 2.25 seconds)
## Iteration 250: error is 81.723231 (50 iterations in 2.26 seconds)
## Iteration 300: error is 2.524986 (50 iterations in 1.87 seconds)
## Iteration 350: error is 2.170094 (50 iterations in 1.86 seconds)
## Iteration 400: error is 1.990478 (50 iterations in 1.90 seconds)
## Iteration 450: error is 1.878187 (50 iterations in 1.95 seconds)
## Iteration 500: error is 1.803537 (50 iterations in 1.99 seconds)
## Fitting performed in 22.49 seconds.

## [1] "perplexity = 30, max_iter = 500, learning rate = 200"
## Read the 5000 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 5000
## Done in 2.76 seconds (sparsity = 0.024410)!
## Learning embedding...
## Iteration 50: error is 88.069153 (50 iterations in 2.49 seconds)
## Iteration 100: error is 82.226399 (50 iterations in 2.19 seconds)
## Iteration 150: error is 81.991775 (50 iterations in 2.17 seconds)
## Iteration 200: error is 81.952705 (50 iterations in 2.17 seconds)
## Iteration 250: error is 81.824926 (50 iterations in 2.46 seconds)
## Iteration 300: error is 1.991787 (50 iterations in 2.23 seconds)
## Iteration 350: error is 1.760548 (50 iterations in 2.01 seconds)
## Iteration 400: error is 1.689184 (50 iterations in 2.00 seconds)
## Iteration 450: error is 1.654980 (50 iterations in 2.05 seconds)
## Iteration 500: error is 1.632784 (50 iterations in 2.09 seconds)
## Fitting performed in 21.86 seconds.

## [1] "perplexity = 30, max_iter = 500, learning rate = 2000"

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL