compare_PCA_PCoA

rm(list = ls())
##############compare PCA, pcoA and TSNE
#PCA (linear)
#t-SNE (non-parametric/ nonlinear)
library(ggplot2)
mydata = iris
dim(mydata); head(mydata)

## [1] 150   5

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# PCA
pca = prcomp(mydata[,-5],
             center = T,
             scale. = T)

screeplot(pca,type = "line")

#plot
data_plot <- as.data.frame(pca$x)
data_plot$source <- mydata$Species
#data_plot$source <- gsub("_.*.", "", data_plot$source ) 
variance.percent_1 <- round(factoextra::get_eig(pca)$variance.percent,digits = 2)
pca.plot <- ggplot(data_plot,aes(x=PC1,y=PC2,color=source)) + 
  geom_point(aes(alpha=0.5)) +
  theme(panel.background = element_blank(),
        legend.position = "right",
        panel.border=element_rect(fill=NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background=element_blank(),
        axis.text.x=element_text(colour="black"),
        axis.text.y=element_text(colour="black"),
        axis.ticks=element_line(colour="black"),
        plot.margin=unit(c(1,1,1,1),"line")) +
  xlab(paste0("PC1"," (",variance.percent_1[1],"%)")) +
  ylab(paste0("PC1"," (",variance.percent_1[2],"%)")) +
  scale_alpha_continuous(guide=FALSE) +
  stat_ellipse( level = 0.95)+
  labs(title = "PCA")
pca.plot

# PCoA
library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-6

pcoa = cmdscale(vegdist(mydata[,-5], method = 'bray'), k = nrow(mydata) - 1, eig = T)

## Warning in cmdscale(vegdist(mydata[, -5], method = "bray"), k = nrow(mydata) - :
## only 58 of the first 149 eigenvalues are > 0

pcoa_eig <- (pcoa$eig)[1:2]/sum(pcoa$eig)

pcoa.plot = ggplot(data = as.data.frame(pcoa$points),mapping = aes(x = V1, y = V2, col = mydata$Species))+
  geom_point()+
  labs(x = paste('PCoA 1:', round(pcoa_eig[1],4)*100,'%'),
       y = paste('PCoA 2:', round(pcoa_eig[2],4)*100,'%'),
       title = 'PCoA')
pcoa.plot

# t-SNE
library(Rtsne)
mydata.tsne = unique(iris)
str(mydata.tsne); dim(mydata.tsne)

## 'data.frame':    149 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

## [1] 149   5

head(mydata.tsne)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

mydata.tsne$Species <- as.factor(mydata.tsne$Species)
class(mydata.tsne)

## [1] "data.frame"

tsne = Rtsne(mydata.tsne[,-5], dims = 2, perplexity = 30, 
             verbose=TRUE, max_iter = 1000, pca = T)

## Performing PCA
## Read the 149 x 4 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.02 seconds (sparsity = 0.709067)!
## Learning embedding...
## Iteration 50: error is 44.611234 (50 iterations in 0.02 seconds)
## Iteration 100: error is 44.086156 (50 iterations in 0.02 seconds)
## Iteration 150: error is 43.944513 (50 iterations in 0.02 seconds)
## Iteration 200: error is 43.540655 (50 iterations in 0.02 seconds)
## Iteration 250: error is 43.552800 (50 iterations in 0.02 seconds)
## Iteration 300: error is 0.214138 (50 iterations in 0.02 seconds)
## Iteration 350: error is 0.120802 (50 iterations in 0.02 seconds)
## Iteration 400: error is 0.116130 (50 iterations in 0.01 seconds)
## Iteration 450: error is 0.112603 (50 iterations in 0.01 seconds)
## Iteration 500: error is 0.111436 (50 iterations in 0.02 seconds)
## Iteration 550: error is 0.111781 (50 iterations in 0.02 seconds)
## Iteration 600: error is 0.111771 (50 iterations in 0.02 seconds)
## Iteration 650: error is 0.111656 (50 iterations in 0.01 seconds)
## Iteration 700: error is 0.111192 (50 iterations in 0.01 seconds)
## Iteration 750: error is 0.111053 (50 iterations in 0.02 seconds)
## Iteration 800: error is 0.111214 (50 iterations in 0.02 seconds)
## Iteration 850: error is 0.111079 (50 iterations in 0.02 seconds)
## Iteration 900: error is 0.110364 (50 iterations in 0.02 seconds)
## Iteration 950: error is 0.110592 (50 iterations in 0.03 seconds)
## Iteration 1000: error is 0.110569 (50 iterations in 0.02 seconds)
## Fitting performed in 0.36 seconds.

class(tsne); names(tsne)

## [1] "list"

##  [1] "N"                   "Y"                   "costs"              
##  [4] "itercosts"           "origD"               "perplexity"         
##  [7] "theta"               "max_iter"            "stop_lying_iter"    
## [10] "mom_switch_iter"     "momentum"            "final_momentum"     
## [13] "eta"                 "exaggeration_factor"

###plOt TSNE
tsne.plot = ggplot(as.data.frame(tsne[["Y"]]), mapping = aes(x = V1, y = V2,
            col = mydata.tsne$Species))+
  geom_point()+
  labs(x = 't-SNE Dimension 1', y = 't-SNE Dimension 2', title = 't-SNE')
tsne.plot

# visualizing
colors = rainbow(length(unique(mydata.tsne$Species)))
names(colors) = unique(mydata.tsne$Species)
par(mgp=c(2.5,1,0))
plot(tsne$Y, t='n', main="tSNE", xlab="tSNE dimension 1", ylab="tSNE dimension 2", "cex.main"=2, "cex.lab"=1.5)
text(tsne$Y, labels=mydata.tsne$Species, col=colors[mydata.tsne$Species])

##REF https://rpubs.com/marwahsi/tnse
##https://rpubs.com/TX-YXL/660194
##https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/

compare_PCA_PCoA_tSNE.R

liyix

2020-09-24