What is Topological Data Analysis?
http://www.kdnuggets.com/2015/11/crazy-deep-learning-topological-data-analysis.html.
https://www.kaggle.com/triskelion/digit-recognizer/mapping-digits-with-a-t-sne-lens/notebook.
This is an exploratory homework by following
https://cran.r-project.org/web/packages/TDA/vignettes/article.pdf.
They present a short tutorial and introduction to using the R package TDA, which provides some tools for Topological Data Analysis.
In particular, it includes implementations of functions that, given some data, provide topological information about the underlying space, such as
The salient topological features of the sublevel sets (or superlevel sets) of these functions can be quantified with persistent homology
As a first toy example to using the TDA package, we show how to compute distance functions and density estimators over a grid of points. The setting is the typical one in TDA: a set of points X = {x1, . . . , xn} ⊂ R d has been sampled from some distribution P and we are interested in recovering the topological features of the underlying space by studying some functions of the data. The following code generates a sample of 400 points from the unit circle and constructs a grid of points over which we will evaluate the functions.
library("TDA")
## Warning: package 'TDA' was built under R version 3.3.3
X <- circleUnif(400)
Xlim <- c(-1.6, 1.6); Ylim <- c(-1.7, 1.7); by <- 0.065
Xseq <- seq(Xlim[1], Xlim[2], by = by)
Yseq <- seq(Ylim[1], Ylim[2], by = by)
Grid <- expand.grid(Xseq, Yseq)
plot(X,main="Simple X")
### 1. the distance function
distance <- distFct(X = X, Grid = Grid)
### 2. the distance to a measure,
m0 <- 0.1
DTM <- dtm(X = X, Grid = Grid, m0 = m0)
### 3. the kNN density estimator,
k <- 60
kNN <- knnDE(X = X, Grid = Grid, k = k)
### 4. the kernel density estimator,
h <- 0.3
KDE <- kde(X = X, Grid = Grid, h = h)
### 5. and the kernel distance.
h <- 0.3
Kdist <- kernelDist(X = X, Grid = Grid, h = h)
persp(Xseq, Yseq,matrix(distance, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "distance function", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)
persp(Xseq, Yseq,matrix(DTM, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "DTM", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)
persp(Xseq, Yseq,matrix(KDE, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "KDE", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)
persp(Xseq, Yseq,matrix(kNN, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "KNN", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)
persp(Xseq, Yseq,matrix(Kdist, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "Kdist", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)
band <- bootstrapBand(X = X, FUN = kde, Grid = Grid, B = 100,parallel = FALSE, alpha = 0.1, h = h)
Diag <- gridDiag(X = X, FUN = kde, h = 0.3, lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, library = "Dionysus",printProgress = FALSE)
plot(Diag[["diagram"]], band = 2 * band[["width"]],main = "KDE Diagram")
par(mfrow = c(1, 2), mai = c(0.8, 0.8, 0.3, 0.1))
plot(Diag[["diagram"]], rotated = TRUE, band = band[["width"]], main = "Rotated Diagram")
plot(Diag[["diagram"]], barcode = TRUE, main = "Barcode")
Circle1 <- circleUnif(60)
Circle2 <- circleUnif(60, r = 2) + 3
Circles <- rbind(Circle1, Circle2)
maxscale <- 5 # limit of the filtration
maxdimension <- 1 # components and loops
Diag <- ripsDiag(X = Circles, maxdimension, maxscale,library = "GUDHI", printProgress = FALSE)
par(mfrow = c(1, 2), mai=c(0.8, 0.8, 0.3, 0.3))
plot(Circles, pch = 16, xlab = "",ylab = "")
plot(Diag[["diagram"]])
X <- circleUnif(n = 30)
DiagAlphaComplex <- alphaComplexDiag(X = X, printProgress = TRUE)
## # Generated complex of size: 115
## # Persistence timer: Elapsed time [ 0.000000 ] seconds
# Generated complex of size: 115
# Persistence timer: Elapsed time [ 0.000000 ] seconds
plot(DiagAlphaComplex[["diagram"]])
XX <- sphereUnif(n = 500, d = 2)
DiagAlphaShape <- alphaShapeDiag(X = XX, printProgress = FALSE)
plot(DiagAlphaShape[["diagram"]])
Diag1 <- ripsDiag(Circle1, maxdimension = 1, maxscale = 5)
Diag2 <- ripsDiag(Circle2, maxdimension = 1, maxscale = 5)
print (bottleneck(Diag1[["diagram"]], Diag2[["diagram"]],dimension = 1))
## [1] 1.267598
print (wasserstein(Diag1[["diagram"]], Diag2[["diagram"]], p = 2, dimension = 1))
## [1] 2.009503
tseq <- seq(0, maxscale, length = 1000) #domain
Land <- landscape(Diag[["diagram"]], dimension = 1, KK = 1, tseq)
Sil <- silhouette(Diag[["diagram"]], p = 1, dimension = 1, tseq)
N <- 4000
XX1 <- circleUnif(N / 2)
XX2 <- circleUnif(N / 2, r = 2) + 3
X <- rbind(XX1, XX2)
m <- 80 # subsample size
n <- 10 # we will compute n landscapes using subsamples of size m
tseq <- seq(0, maxscale, length = 500) #domain of landscapes
#here we store n Rips diags
Diags <- list()
#here we store n landscapes
Lands <- matrix(0, nrow = n, ncol = length(tseq))
for (i in seq_len(n)) {
subX <- X[sample(seq_len(N), m), ]
Diags[[i]] <- ripsDiag(subX, maxdimension = 1, maxscale = 5)
Lands[i, ] <- landscape(Diags[[i]][["diagram"]], dimension = 1,
KK = 1, tseq)}
bootLand <- multipBootstrap(Lands, B = 100, alpha = 0.05, parallel = FALSE)
plot(tseq, bootLand[["mean"]], main = "Mean Landscape with 95% band")
polygon(c(tseq, rev(tseq)), c(bootLand[["band"]][, 1], rev(bootLand[["band"]][, 2])), col = "pink")
lines(tseq, bootLand[["mean"]], lwd = 2, col = 2)
XX1 <- circleUnif(600)
XX2 <- circleUnif(1000, r = 1.5) + 2.5
noise <- cbind(runif(80, -2, 5), runif(80, -2, 5))
X <- rbind(XX1, XX2, noise)
# Grid limits
Xlim <- c(-2, 5)
Ylim <- c(-2, 5)
by <- 0.2
parametersKDE <- seq(0.1, 0.6, by = 0.05)
B <- 50 # number of bootstrap iterations. Should be large.
alpha <- 0.1 # level of the confidence bands
maxKDE <- maxPersistence(kde, parametersKDE, X, lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, B = B, alpha = alpha, parallel = TRUE, printProgress = TRUE, bandFUN = "bootstrapBand")
## 0 10 20 30 40 50 60 70 80 90 100
## |----|----|----|----|----|----|----|----|----|----|
## ***************************************************
print(summary(maxKDE))
## Call:
## maxPersistence(FUN = kde, parameters = parametersKDE, X = X,
## lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, B = B,
## alpha = alpha, bandFUN = "bootstrapBand", parallel = TRUE,
## printProgress = TRUE)
##
## The number of significant features is maximized by
## [1] 0.25 0.30
##
## The total significant persistence is maximized by
## [1] 0.1
par(mfrow = c(1, 2), mai = c(0.8, 0.8, 0.35, 0.3))
plot(X, pch = 16, cex = 0.5, main = "Two Circles")
plot(maxKDE, main = "Max Persistence - KDE")
X1 <- cbind(rnorm(300, 1, .8), rnorm(300, 5, 0.8))
X2 <- cbind(rnorm(300, 3.5, .8), rnorm(300, 5, 0.8))
X3 <- cbind(rnorm(300, 6, 1), rnorm(300, 1, 1))
XX <- rbind(X1, X2, X3)
Tree <- clusterTree(XX, k = 100, density = "knn", printProgress = FALSE)
TreeKDE <- clusterTree(XX, k = 100, h = 0.3, density = "kde", printProgress = FALSE)
plot(Tree, type = "lambda", main = "lambda Tree (knn)")
plot(Tree, type = "kappa", main = "kappa Tree (knn)")
plot(TreeKDE, type = "lambda", main = "lambda Tree (kde)")
plot(TreeKDE, type = "kappa", main = "kappa Tree (kde)")