rm(list = ls())
##############compare PCA, pcoA and TSNE
#PCA (linear)
#t-SNE (non-parametric/ nonlinear)
library(ggplot2)
mydata = iris
dim(mydata); head(mydata)
## [1] 150 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# PCA
pca = prcomp(mydata[,-5],
center = T,
scale. = T)
screeplot(pca,type = "line")

#plot
data_plot <- as.data.frame(pca$x)
data_plot$source <- mydata$Species
#data_plot$source <- gsub("_.*.", "", data_plot$source )
variance.percent_1 <- round(factoextra::get_eig(pca)$variance.percent,digits = 2)
pca.plot <- ggplot(data_plot,aes(x=PC1,y=PC2,color=source)) +
geom_point(aes(alpha=0.5)) +
theme(panel.background = element_blank(),
legend.position = "right",
panel.border=element_rect(fill=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background=element_blank(),
axis.text.x=element_text(colour="black"),
axis.text.y=element_text(colour="black"),
axis.ticks=element_line(colour="black"),
plot.margin=unit(c(1,1,1,1),"line")) +
xlab(paste0("PC1"," (",variance.percent_1[1],"%)")) +
ylab(paste0("PC1"," (",variance.percent_1[2],"%)")) +
scale_alpha_continuous(guide=FALSE) +
stat_ellipse( level = 0.95)+
labs(title = "PCA")
pca.plot

# PCoA
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
pcoa = cmdscale(vegdist(mydata[,-5], method = 'bray'), k = nrow(mydata) - 1, eig = T)
## Warning in cmdscale(vegdist(mydata[, -5], method = "bray"), k = nrow(mydata) - :
## only 58 of the first 149 eigenvalues are > 0
pcoa_eig <- (pcoa$eig)[1:2]/sum(pcoa$eig)
pcoa.plot = ggplot(data = as.data.frame(pcoa$points),mapping = aes(x = V1, y = V2, col = mydata$Species))+
geom_point()+
labs(x = paste('PCoA 1:', round(pcoa_eig[1],4)*100,'%'),
y = paste('PCoA 2:', round(pcoa_eig[2],4)*100,'%'),
title = 'PCoA')
pcoa.plot

# t-SNE
library(Rtsne)
mydata.tsne = unique(iris)
str(mydata.tsne); dim(mydata.tsne)
## 'data.frame': 149 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## [1] 149 5
head(mydata.tsne)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
mydata.tsne$Species <- as.factor(mydata.tsne$Species)
class(mydata.tsne)
## [1] "data.frame"
tsne = Rtsne(mydata.tsne[,-5], dims = 2, perplexity = 30,
verbose=TRUE, max_iter = 1000, pca = T)
## Performing PCA
## Read the 149 x 4 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.02 seconds (sparsity = 0.709067)!
## Learning embedding...
## Iteration 50: error is 44.611234 (50 iterations in 0.02 seconds)
## Iteration 100: error is 44.086156 (50 iterations in 0.02 seconds)
## Iteration 150: error is 43.944513 (50 iterations in 0.02 seconds)
## Iteration 200: error is 43.540655 (50 iterations in 0.02 seconds)
## Iteration 250: error is 43.552800 (50 iterations in 0.02 seconds)
## Iteration 300: error is 0.214138 (50 iterations in 0.02 seconds)
## Iteration 350: error is 0.120802 (50 iterations in 0.02 seconds)
## Iteration 400: error is 0.116130 (50 iterations in 0.01 seconds)
## Iteration 450: error is 0.112603 (50 iterations in 0.01 seconds)
## Iteration 500: error is 0.111436 (50 iterations in 0.02 seconds)
## Iteration 550: error is 0.111781 (50 iterations in 0.02 seconds)
## Iteration 600: error is 0.111771 (50 iterations in 0.02 seconds)
## Iteration 650: error is 0.111656 (50 iterations in 0.01 seconds)
## Iteration 700: error is 0.111192 (50 iterations in 0.01 seconds)
## Iteration 750: error is 0.111053 (50 iterations in 0.02 seconds)
## Iteration 800: error is 0.111214 (50 iterations in 0.02 seconds)
## Iteration 850: error is 0.111079 (50 iterations in 0.02 seconds)
## Iteration 900: error is 0.110364 (50 iterations in 0.02 seconds)
## Iteration 950: error is 0.110592 (50 iterations in 0.03 seconds)
## Iteration 1000: error is 0.110569 (50 iterations in 0.02 seconds)
## Fitting performed in 0.36 seconds.
class(tsne); names(tsne)
## [1] "list"
## [1] "N" "Y" "costs"
## [4] "itercosts" "origD" "perplexity"
## [7] "theta" "max_iter" "stop_lying_iter"
## [10] "mom_switch_iter" "momentum" "final_momentum"
## [13] "eta" "exaggeration_factor"
###plOt TSNE
tsne.plot = ggplot(as.data.frame(tsne[["Y"]]), mapping = aes(x = V1, y = V2,
col = mydata.tsne$Species))+
geom_point()+
labs(x = 't-SNE Dimension 1', y = 't-SNE Dimension 2', title = 't-SNE')
tsne.plot

# visualizing
colors = rainbow(length(unique(mydata.tsne$Species)))
names(colors) = unique(mydata.tsne$Species)
par(mgp=c(2.5,1,0))
plot(tsne$Y, t='n', main="tSNE", xlab="tSNE dimension 1", ylab="tSNE dimension 2", "cex.main"=2, "cex.lab"=1.5)
text(tsne$Y, labels=mydata.tsne$Species, col=colors[mydata.tsne$Species])

##REF https://rpubs.com/marwahsi/tnse
##https://rpubs.com/TX-YXL/660194
##https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/