CLUSTER SUBSPACES

nestedMeansClass <- function(x, nsplit) {
    xR <- range(x)
    # include min(x)in the lowest category using cut
    xMin <- xR[1] - 1e-06 * diff(xR)
    xMax <- xR[2]
    xMean <- mean(x)
    xBrks <- c(xMin, xMean, xMax)
    xClass <- cut(x, xBrks, labels = F)

    if (nsplit == 1) 
        return(xClass)
    for (i in 2:nsplit) {
        xMeanNew <- tapply(x, xClass, mean)
        xMean <- sort(c(xMeanNew, xMean))
        xBrks <- c(xMin, xMean, xMax)
        xClass <- cut(x, xBrks, labels = F)
    }
    return(as.vector(xClass))
}

# test x
x <- rnorm(10000)
n <- length(x)
nSplit <- floor(logb(sqrt(n/35), base = 2))
xClass <- nestedMeansClass(x, nSplit)
n
## [1] 10000
nSplit
## [1] 4
table(xClass)
## xClass
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
## 329 533 601 659 703 706 714 747 751 711 722 684 674 604 515 347

equalClass <- function(x, nRow) {
    n <- length(x)
    subs <- round(c(1, 1:nRow * n/nRow))
    xBrks <- sort(x)[subs]
    return(as.vector(cut(x, xBrks, include.lowest = T, labels = F)))
}

x <- rnorm(10000)
n <- length(x)
nRow <- floor(sqrt(n/35))
eClass <- equalClass(x, nRow)
table(eClass)
## eClass
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
## 625 625 625 625 625 625 625 625 625 625 625 625 625 625 625 625
entropySc <- function(p) return(-sum(p * log2(p), na.rm = T)/log2(length(p)))

conditionalEntropy <- function(tableXY) {
    yMarg <- apply(tableXY, 1, sum)  # row margin index by y
    n <- sum(yMarg)  # grand total
    xGyProbs <- sweep(tableXY, 1, yMarg, "/")  # rows are probs
    xGyCE <- apply(xGyProbs, 1, entropySc)  # scaled univariate entropies     
    xGyCEwa <- sum(xGyCE * yMarg)/n  # weighted average

    xMarg <- apply(tableXY, 2, sum)  # column margin index by x
    yGxProbs <- sweep(tableXY, 2, xMarg, "/")  # columns are probs
    yGxCE <- apply(yGxProbs, 2, entropySc)  # scaled univariate entropies 
    yGxCEwa <- sum(yGxCE * xMarg)/n  # weighted average

    return(max(xGyCEwa, yGxCEwa))
}

# test
paperMat <- matrix(c(0, 1, 3, 0, 0, 0, 1, 9, 1, 0, 1, 2, 7, 14, 3, 7, 6, 0, 
    7, 6, 13, 19, 12, 5, 0, 4, 14, 5, 1, 1, 1, 2, 3, 2, 0, 0), ncol = 6, byrow = T)

conditionalEntropy(paperMat)
## [1] 0.8121
scaledEntropyView <- function(ceMat, title = "", colorPalette = NULL, ncolors = 5, 
    colorNA = "red", svdOrd = T, ord = NULL, whiteV = 0.5, whiteH = 0.8, cex = 0.84, 
    labCex = 0.84, tCex = 1.2) {
    if (is.null(colorPalette)) {
        Blues <- c("#000088", "#D0D0FF")
        colorPalette <- colorRampPalette(Blues, space = "Lab")
    }
    colors = c(colorNA, colorPalette(ncolors))

    varNames = colnames(ceMat)
    if (is.null(varNames)) 
        varNames <- paste("V", 1:nc, sep = "")

    nc <- ncol(ceMat)
    ceMat[nrow(ceMat) < ncol(ceMat)] <- 0
    diag(ceMat) <- 0
    ceMat <- ceMat + t(ceMat)

    # get order vector
    if (svdOrd) {
        ord <- rev(order(svd(ceMat)$u[, 1]))
    } else {
        if (is.NULL(ord)) {
            # get spanning tree order)
            fakeValues <- cmdscale(ceMat, k = nc, eig = T)
            nvar <- sum(fakeValues$eig > 0)
            ord <- mstree(fakeValues$points[, 1:nvar])$order[, 1]
        }
    }

    ceMat <- ceMat[ord, ord]
    varNames <- varNames[ord]
    diag(ceMat) <- NA
    panels <- panelLayout(nrow = 1, ncol = 1, leftMar = 0, topMar = 0.8, bottomMar = 0)

    entropy <- ceMat[row(ceMat) > col(ceMat)]
    entropyR <- range(entropy)
    brksEntropy <- seq(entropyR[1], entropyR[2], length = ncolors + 1)

    # shift for NA
    polycol <- cut(ceMat, brksEntropy, include.lowest = T, labels = F) + 1
    polycol <- ifelse(is.na(polycol), 1, polycol)  # NA color 1

    k <- 0.5
    dx <- c(-k, k, k, -k, NA)
    dy <- c(-k, -k, k, k, NA)

    mat <- expand.grid(list(x = 1:nc, y = nc:1))
    newx <- rep(mat$x, rep(5, length(mat$x)))
    newy <- rep(mat$y, rep(5, length(mat$y)))

    rx <- c(0.5, nc + 0.5)
    ry <- c(0.5, nc + 0.5)
    panelSelect(panels, 1, 1)
    panelScale(rx, ry)
    polygon(newx + dx, newy + dy, density = -1, col = colors[polycol])
    polygon(newx + whiteH * dx, newy + whiteV * dy, density = -1, border = NA, 
        col = "white")
    xdiag <- rep(1:nc, rep(5, nc))
    polygon(xdiag + dx, rev(xdiag) + dy, density = -1, col = "#C0C0C0")

    diag(ceMat) <- 0
    valsText <- format(round(ceMat, 2))
    diag(valsText) <- ""
    text(mat[, 1], mat[, 2], valsText, cex = cex, adj = 0.5)
    text(1:nc, nc:1, varNames, adj = 0.6, cex = labCex)
    panelOutline()
    panelSelect(panels, mar = "top")
    panelScale()
    if (length(title) == 1) 
        text(0.5, 0.8, title, cex = tCex, adj = 0.5) else text(c(0.5, 0.5), c(0.9, 0.5), title, cex = tCex, adj = 0.5)
}

# Generating clusters in subspaces to test the approach
library(MASS)
source("panelFunctions.r")
myMVdat <- function(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2, 
    ntot, dtot) {
    d1 <- length(cen1)
    d2 <- length(cen2)
    cluster1 <- mvrnorm(n = n1, mu = cen1, Sigma = cov1)
    cluster2 <- mvrnorm(n = n2, mu = cen2, Sigma = cov2)
    full <- matrix(rnorm(ntot * dtot), ncol = dtot)
    full[row1:(row1 + n1 - 1), col1:(col1 + d1 - 1)] <- cluster1
    full[row2:(row2 + n2 - 1), col2:(col2 + d2 - 1)] <- cluster2
    return(full)
}

# Describe components cluster 1
n1 <- 1000
d1 <- 3
cen1 <- rep(0, d1)
cor1 <- matrix(c(1, 0.5, -0.3, 0.5, 1, -0.4, -0.3, -0.4, 1), ncol = 3, byrow = T)
sd1 <- c(0.05, 0.02, 0.03)
cov1 <- diag(sd1) %*% cor1 %*% diag(sd1)
row1 <- 1001
col1 <- 3

# cluster 2
n2 <- 1500
d2 <- 5
cen2 <- rep(0.5, d2)
cov2 <- 4e-04 * diag(d2)
row2 <- 8001
col2 <- 8

# background
ntot <- 10000
dtot <- 15

# build test data set
testDat <- myMVdat(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2, ntot, 
    dtot)

# Data, entropy tables and views
testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nSplit <- floor(logb(sqrt(nr/35), base = 2))
classDat <- apply(testDat, 2, nestedMeansClass, nSplit)
ceTable1 <- matrix(0, nrow = nc, ncol = nc)

for (j in 1:(nc - 1)) {
    for (i in (j + 1):nc) {
        tableIJ <- table(classDat[, i], classDat[, j])
        ceTable1[i, j] <- conditionalEntropy(tableIJ)
    }
}

scaledEntropyView(ceTable1, title = c("Scaled Conditional Entropy", "Original Data"))

plot of chunk unnamed-chunk-1

# Moving cluster centers to lower density locations
cen1 <- rep(-2, 3)
cen2 <- rep(1.5, 5)
testDat <- myMVdat(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2, ntot, 
    dtot)

testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nSplit <- floor(logb(sqrt(nr/35), base = 2))
classDat <- apply(testDat, 2, nestedMeansClass, nSplit)
ceTable2 <- matrix(0, nrow = nc, ncol = nc)

for (j in 1:(nc - 1)) {
    for (i in (j + 1):nc) {
        tableIJ <- table(classDat[, i], classDat[, j])
        ceTable2[i, j] <- conditionalEntropy(tableIJ)
    }
}
scaledEntropyView(ceTable2, title = c("Scaled Conditional Entropy", "Clusters Moved To Lower Density Region"))

plot of chunk unnamed-chunk-2

# Trying other classifying function
testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nClass <- floor(sqrt(nr/35))
xClass <- equalClass(x, nClass)
classDat <- apply(testDat, 2, equalClass, nClass)
ceTable3 <- matrix(0, nrow = nc, ncol = nc)
for (j in 1:(nc - 1)) {
    for (i in (j + 1):nc) {
        tableIJ <- table(classDat[, i], classDat[, j])
        ceTable3[i, j] <- conditionalEntropy(tableIJ)
    }
}
scaledEntropyView(ceTable3, title = "Equal Count Classes")

plot of chunk unnamed-chunk-3