set.seed(42)
N = 500
p = 100
r.unif = matrix(runif(N * p, min = -1, max = 1), nrow = N)
r.norm = matrix(rnorm(N * p), nrow = N)
r.pois = matrix(rpois(N * p, lambda = 10) + rnorm(N * p, sd = 0.01), nrow = N)
r.zinb = matrix(rnbinom(N * p, size = 30, mu = 10) * ifelse(runif(N * p) < 0.4, 0, 1), nrow = N)Dimension Reduction
Introduction
This article is motivated by this tweet.
Note that the tweet is about “Reduce” the dimension from 2 to 2.
What about reduce from higher dimension? What about other distribution?
Let’s play with PCA, t-SNE, and UMAP for randomly generated numbers in different distributions.
Generate random number
We will consider three cases
uniform distribution: from -1 to 1
Gaussian (Normal) distribution: mean = 0, sd = 1
Poisson distribution: mean = 10
Zero-inflated negative binomial distribution: size = 30, mu = 10, p = 30/(30+10) = 3/4, and inflated zero proportion is 0.4.
We will simulate 500 samples with 100 dimensions.
Codes for visualization
Default parameters for Rtsne and umap is used. When perform t-SNW on 2-dimensional discrete distributes, I frequently encounter duplicated samples, and I added a small noise (Gaussian distribution with mean = 0, sd = 0.01).
library(ggplot2)
library(Rtsne)
library(umap)
library(patchwork)
run.pca <- function(X) {
ret <- prcomp(X)
d <- data.frame(x = ret$x[, 1], y = ret$x[,2])
g <- ggplot(d, aes(x = x, y = y)) + geom_point(alpha = .5) + ggtitle(sprintf("PCA on [%d x %d]", nrow(X), ncol(X)))
return(g)
}
run.tsne <- function(X) {
while (any(duplicated(X))) {
X <- X + matrix(rnorm(prod(dim(X)), sd = 0.01), nrow = nrow(X))
}
ret <- Rtsne(X)
d <- data.frame(x = ret$Y[, 1], y = ret$Y[,2])
g <- ggplot(d, aes(x = x, y = y)) + geom_point(alpha = .5) + ggtitle(sprintf("tSNE on [%d x %d]", nrow(X), ncol(X)))
return(g)
}
run.umap <- function(X) {
ret <- umap(X)
d <- data.frame(x = ret$layout[, 1], y = ret$layout[,2])
g <- ggplot(d, aes(x = x, y = y)) + geom_point(alpha = .5) + ggtitle(sprintf("UMAP on [%d x %d]", nrow(X), ncol(X)))
return(g)
}
dr.plot <- function(X) {
( run.pca(X[,1:2]) | run.pca(X[,1:5]) | run.pca(X[,1:10]) ) /
( run.tsne(X[,1:2]) | run.tsne(X[,1:5]) | run.tsne(X[,1:10]) ) /
( run.umap(X[,1:2]) | run.umap(X[,1:5]) | run.umap(X[,1:10]) )
}Uniform distribution
# plot matrix
#. row: PCA, t-SNE, UMAP
# columns: dim = 2, 50, 100
dr.plot(r.unif)Gaussian distribution
dr.plot(r.norm)Poisson distribution
dr.plot(r.pois)ZINB distribution
dr.plot(r.zinb)Session information
sessionInfo()R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.4.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] patchwork_1.1.2 umap_0.2.10.0 Rtsne_0.16 ggplot2_3.4.2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.10 RSpectra_0.16-1 pillar_1.9.0 compiler_4.2.3
[5] tools_4.2.3 digest_0.6.31 lattice_0.21-8 jsonlite_1.8.5
[9] evaluate_0.21 lifecycle_1.0.3 tibble_3.2.1 gtable_0.3.3
[13] png_0.1-8 pkgconfig_2.0.3 rlang_1.1.1 Matrix_1.5-4.1
[17] cli_3.6.1 rstudioapi_0.14 yaml_2.3.7 xfun_0.39
[21] fastmap_1.1.1 withr_2.5.0 dplyr_1.1.2 knitr_1.43
[25] askpass_1.1 generics_0.1.3 vctrs_0.6.2 htmlwidgets_1.6.2
[29] grid_4.2.3 tidyselect_1.2.0 reticulate_1.29 glue_1.6.2
[33] R6_2.5.1 fansi_1.0.4 rmarkdown_2.22 farver_2.1.1
[37] magrittr_2.0.3 scales_1.2.1 htmltools_0.5.5 colorspace_2.1-0
[41] labeling_0.4.2 utf8_1.2.3 openssl_2.0.6 munsell_0.5.0