Introduction to the R package TDA

1. Introduction

What is Topological Data Analysis?

http://www.kdnuggets.com/2015/11/crazy-deep-learning-topological-data-analysis.html.

https://www.kaggle.com/triskelion/digit-recognizer/mapping-digits-with-a-t-sne-lens/notebook.

This is an exploratory homework by following

https://cran.r-project.org/web/packages/TDA/vignettes/article.pdf.

They present a short tutorial and introduction to using the R package TDA, which provides some tools for Topological Data Analysis.

In particular, it includes implementations of functions that, given some data, provide topological information about the underlying space, such as

1. the distance function,

2. the distance to a measure,

3. the kNN density estimator,

4. the kernel density estimator,

5. and the kernel distance.

The salient topological features of the sublevel sets (or superlevel sets) of these functions can be quantified with persistent homology

As a first toy example to using the TDA package, we show how to compute distance functions and density estimators over a grid of points. The setting is the typical one in TDA: a set of points X = {x1, . . . , xn} ⊂ R d has been sampled from some distribution P and we are interested in recovering the topological features of the underlying space by studying some functions of the data. The following code generates a sample of 400 points from the unit circle and constructs a grid of points over which we will evaluate the functions.

2. Distance Functions and Density Estimators

library("TDA")
## Warning: package 'TDA' was built under R version 3.3.3
X <- circleUnif(400)
Xlim <- c(-1.6, 1.6); Ylim <- c(-1.7, 1.7); by <- 0.065
Xseq <- seq(Xlim[1], Xlim[2], by = by)
Yseq <- seq(Ylim[1], Ylim[2], by = by)
Grid <- expand.grid(Xseq, Yseq)

plot(X,main="Simple X")

### 1. the distance function

distance <- distFct(X = X, Grid = Grid)

### 2. the distance to a measure,

m0 <- 0.1
DTM <- dtm(X = X, Grid = Grid, m0 = m0)

### 3. the kNN density estimator,

k <- 60
kNN <- knnDE(X = X, Grid = Grid, k = k)

### 4. the kernel density estimator, 

h <- 0.3
KDE <- kde(X = X, Grid = Grid, h = h)

### 5. and the kernel distance. 

h <- 0.3
Kdist <- kernelDist(X = X, Grid = Grid, h = h)

persp(Xseq, Yseq,matrix(distance, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "distance function", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)

persp(Xseq, Yseq,matrix(DTM, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "DTM", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)

persp(Xseq, Yseq,matrix(KDE, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "KDE", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)

persp(Xseq, Yseq,matrix(kNN, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "KNN", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)

persp(Xseq, Yseq,matrix(Kdist, ncol = length(Yseq), nrow = length(Xseq)), xlab = "",ylab = "", zlab = "", theta = -20, phi = 35, ltheta = 50,col = 2, border = NA, main = "Kdist", d = 0.5, scale = FALSE,expand = 3, shade = 0.9)

2.1. Bootstrap Confidence Bands

band <- bootstrapBand(X = X, FUN = kde, Grid = Grid, B = 100,parallel = FALSE, alpha = 0.1, h = h)

3. Persistent Homology

3.1. Persistent Homology Over a Grid

Diag <- gridDiag(X = X, FUN = kde, h = 0.3, lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, library = "Dionysus",printProgress = FALSE)


plot(Diag[["diagram"]], band = 2 * band[["width"]],main = "KDE Diagram")

par(mfrow = c(1, 2), mai = c(0.8, 0.8, 0.3, 0.1))
plot(Diag[["diagram"]], rotated = TRUE, band = band[["width"]], main = "Rotated Diagram")
plot(Diag[["diagram"]], barcode = TRUE, main = "Barcode")

3.2. Rips Diagrams

Circle1 <- circleUnif(60)
Circle2 <- circleUnif(60, r = 2) + 3
Circles <- rbind(Circle1, Circle2)


maxscale <- 5 # limit of the filtration
maxdimension <- 1 # components and loops

Diag <- ripsDiag(X = Circles, maxdimension, maxscale,library = "GUDHI", printProgress = FALSE)


par(mfrow = c(1, 2), mai=c(0.8, 0.8, 0.3, 0.3))
plot(Circles, pch = 16, xlab = "",ylab = "")
plot(Diag[["diagram"]])

3.3. Alpha Complex Persistence Diagram

X <- circleUnif(n = 30)
DiagAlphaComplex <- alphaComplexDiag(X = X, printProgress = TRUE)
## # Generated complex of size: 115 
## # Persistence timer: Elapsed time [ 0.000000 ] seconds
# Generated complex of size: 115
# Persistence timer: Elapsed time [ 0.000000 ] seconds
plot(DiagAlphaComplex[["diagram"]])

3.4. Persistence Diagram of Alpha Shape

XX <- sphereUnif(n = 500, d = 2)
DiagAlphaShape <- alphaShapeDiag(X = XX, printProgress = FALSE)
plot(DiagAlphaShape[["diagram"]])

3.5. Bottleneck and Wasserstein Distances

Diag1 <- ripsDiag(Circle1, maxdimension = 1, maxscale = 5)
Diag2 <- ripsDiag(Circle2, maxdimension = 1, maxscale = 5)

print (bottleneck(Diag1[["diagram"]], Diag2[["diagram"]],dimension = 1))
## [1] 1.267598
print (wasserstein(Diag1[["diagram"]], Diag2[["diagram"]], p = 2, dimension = 1))
## [1] 2.009503

3.6. Landscapes and Silhouettes

tseq <- seq(0, maxscale, length = 1000) #domain
Land <- landscape(Diag[["diagram"]], dimension = 1, KK = 1, tseq)
Sil <- silhouette(Diag[["diagram"]], p = 1, dimension = 1, tseq)

3.7. Confidence Bands for Landscapes and Silhouettes

N <- 4000
XX1 <- circleUnif(N / 2)
XX2 <- circleUnif(N / 2, r = 2) + 3
X <- rbind(XX1, XX2)


m <- 80 # subsample size
n <- 10 # we will compute n landscapes using subsamples of size m
tseq <- seq(0, maxscale, length = 500) #domain of landscapes
#here we store n Rips diags
Diags <- list()
#here we store n landscapes
Lands <- matrix(0, nrow = n, ncol = length(tseq))

for (i in seq_len(n)) {
 subX <- X[sample(seq_len(N), m), ]
 Diags[[i]] <- ripsDiag(subX, maxdimension = 1, maxscale = 5)
 Lands[i, ] <- landscape(Diags[[i]][["diagram"]], dimension = 1,
 KK = 1, tseq)}

bootLand <- multipBootstrap(Lands, B = 100, alpha = 0.05, parallel = FALSE)

plot(tseq, bootLand[["mean"]], main = "Mean Landscape with 95% band")
polygon(c(tseq, rev(tseq)), c(bootLand[["band"]][, 1], rev(bootLand[["band"]][, 2])), col = "pink")
lines(tseq, bootLand[["mean"]], lwd = 2, col = 2)

3.8. Selection of Smoothing Parameters

XX1 <- circleUnif(600)
XX2 <- circleUnif(1000, r = 1.5) + 2.5
noise <- cbind(runif(80, -2, 5), runif(80, -2, 5))
X <- rbind(XX1, XX2, noise)
# Grid limits
Xlim <- c(-2, 5)
Ylim <- c(-2, 5)
by <- 0.2

parametersKDE <- seq(0.1, 0.6, by = 0.05)
B <- 50 # number of bootstrap iterations. Should be large.
alpha <- 0.1 # level of the confidence bands

maxKDE <- maxPersistence(kde, parametersKDE, X, lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, B = B, alpha = alpha, parallel = TRUE, printProgress = TRUE, bandFUN = "bootstrapBand")
## 0   10   20   30   40   50   60   70   80   90   100
## |----|----|----|----|----|----|----|----|----|----|
## ***************************************************
print(summary(maxKDE))
## Call: 
## maxPersistence(FUN = kde, parameters = parametersKDE, X = X, 
##     lim = cbind(Xlim, Ylim), by = by, sublevel = FALSE, B = B, 
##     alpha = alpha, bandFUN = "bootstrapBand", parallel = TRUE, 
##     printProgress = TRUE)
## 
## The number of significant features is maximized by 
## [1] 0.25 0.30
## 
## The total significant persistence is maximized by 
## [1] 0.1
par(mfrow = c(1, 2), mai = c(0.8, 0.8, 0.35, 0.3))
plot(X, pch = 16, cex = 0.5, main = "Two Circles")
plot(maxKDE, main = "Max Persistence - KDE")

4. Density Clustering

X1 <- cbind(rnorm(300, 1, .8), rnorm(300, 5, 0.8))
X2 <- cbind(rnorm(300, 3.5, .8), rnorm(300, 5, 0.8))
X3 <- cbind(rnorm(300, 6, 1), rnorm(300, 1, 1))
XX <- rbind(X1, X2, X3)

Tree <- clusterTree(XX, k = 100, density = "knn", printProgress = FALSE)
TreeKDE <- clusterTree(XX, k = 100, h = 0.3, density = "kde", printProgress = FALSE)

plot(Tree, type = "lambda", main = "lambda Tree (knn)")

plot(Tree, type = "kappa", main = "kappa Tree (knn)")

plot(TreeKDE, type = "lambda", main = "lambda Tree (kde)")

plot(TreeKDE, type = "kappa", main = "kappa Tree (kde)")