nestedMeansClass <- function(x, nsplit) {
xR <- range(x)
# include min(x)in the lowest category using cut
xMin <- xR[1] - 1e-06 * diff(xR)
xMax <- xR[2]
xMean <- mean(x)
xBrks <- c(xMin, xMean, xMax)
xClass <- cut(x, xBrks, labels = F)
if (nsplit == 1)
return(xClass)
for (i in 2:nsplit) {
xMeanNew <- tapply(x, xClass, mean)
xMean <- sort(c(xMeanNew, xMean))
xBrks <- c(xMin, xMean, xMax)
xClass <- cut(x, xBrks, labels = F)
}
return(as.vector(xClass))
}
# test x
x <- rnorm(10000)
n <- length(x)
nSplit <- floor(logb(sqrt(n/35), base = 2))
xClass <- nestedMeansClass(x, nSplit)
n
## [1] 10000
nSplit
## [1] 4
table(xClass)
## xClass
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 329 533 601 659 703 706 714 747 751 711 722 684 674 604 515 347
equalClass <- function(x, nRow) {
n <- length(x)
subs <- round(c(1, 1:nRow * n/nRow))
xBrks <- sort(x)[subs]
return(as.vector(cut(x, xBrks, include.lowest = T, labels = F)))
}
x <- rnorm(10000)
n <- length(x)
nRow <- floor(sqrt(n/35))
eClass <- equalClass(x, nRow)
table(eClass)
## eClass
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 625 625 625 625 625 625 625 625 625 625 625 625 625 625 625 625
entropySc <- function(p) return(-sum(p * log2(p), na.rm = T)/log2(length(p)))
conditionalEntropy <- function(tableXY) {
yMarg <- apply(tableXY, 1, sum) # row margin index by y
n <- sum(yMarg) # grand total
xGyProbs <- sweep(tableXY, 1, yMarg, "/") # rows are probs
xGyCE <- apply(xGyProbs, 1, entropySc) # scaled univariate entropies
xGyCEwa <- sum(xGyCE * yMarg)/n # weighted average
xMarg <- apply(tableXY, 2, sum) # column margin index by x
yGxProbs <- sweep(tableXY, 2, xMarg, "/") # columns are probs
yGxCE <- apply(yGxProbs, 2, entropySc) # scaled univariate entropies
yGxCEwa <- sum(yGxCE * xMarg)/n # weighted average
return(max(xGyCEwa, yGxCEwa))
}
# test
paperMat <- matrix(c(0, 1, 3, 0, 0, 0, 1, 9, 1, 0, 1, 2, 7, 14, 3, 7, 6, 0,
7, 6, 13, 19, 12, 5, 0, 4, 14, 5, 1, 1, 1, 2, 3, 2, 0, 0), ncol = 6, byrow = T)
conditionalEntropy(paperMat)
## [1] 0.8121
scaledEntropyView <- function(ceMat, title = "", colorPalette = NULL, ncolors = 5,
colorNA = "red", svdOrd = T, ord = NULL, whiteV = 0.5, whiteH = 0.8, cex = 0.84,
labCex = 0.84, tCex = 1.2) {
if (is.null(colorPalette)) {
Blues <- c("#000088", "#D0D0FF")
colorPalette <- colorRampPalette(Blues, space = "Lab")
}
colors = c(colorNA, colorPalette(ncolors))
varNames = colnames(ceMat)
if (is.null(varNames))
varNames <- paste("V", 1:nc, sep = "")
nc <- ncol(ceMat)
ceMat[nrow(ceMat) < ncol(ceMat)] <- 0
diag(ceMat) <- 0
ceMat <- ceMat + t(ceMat)
# get order vector
if (svdOrd) {
ord <- rev(order(svd(ceMat)$u[, 1]))
} else {
if (is.NULL(ord)) {
# get spanning tree order)
fakeValues <- cmdscale(ceMat, k = nc, eig = T)
nvar <- sum(fakeValues$eig > 0)
ord <- mstree(fakeValues$points[, 1:nvar])$order[, 1]
}
}
ceMat <- ceMat[ord, ord]
varNames <- varNames[ord]
diag(ceMat) <- NA
panels <- panelLayout(nrow = 1, ncol = 1, leftMar = 0, topMar = 0.8, bottomMar = 0)
entropy <- ceMat[row(ceMat) > col(ceMat)]
entropyR <- range(entropy)
brksEntropy <- seq(entropyR[1], entropyR[2], length = ncolors + 1)
# shift for NA
polycol <- cut(ceMat, brksEntropy, include.lowest = T, labels = F) + 1
polycol <- ifelse(is.na(polycol), 1, polycol) # NA color 1
k <- 0.5
dx <- c(-k, k, k, -k, NA)
dy <- c(-k, -k, k, k, NA)
mat <- expand.grid(list(x = 1:nc, y = nc:1))
newx <- rep(mat$x, rep(5, length(mat$x)))
newy <- rep(mat$y, rep(5, length(mat$y)))
rx <- c(0.5, nc + 0.5)
ry <- c(0.5, nc + 0.5)
panelSelect(panels, 1, 1)
panelScale(rx, ry)
polygon(newx + dx, newy + dy, density = -1, col = colors[polycol])
polygon(newx + whiteH * dx, newy + whiteV * dy, density = -1, border = NA,
col = "white")
xdiag <- rep(1:nc, rep(5, nc))
polygon(xdiag + dx, rev(xdiag) + dy, density = -1, col = "#C0C0C0")
diag(ceMat) <- 0
valsText <- format(round(ceMat, 2))
diag(valsText) <- ""
text(mat[, 1], mat[, 2], valsText, cex = cex, adj = 0.5)
text(1:nc, nc:1, varNames, adj = 0.6, cex = labCex)
panelOutline()
panelSelect(panels, mar = "top")
panelScale()
if (length(title) == 1)
text(0.5, 0.8, title, cex = tCex, adj = 0.5) else text(c(0.5, 0.5), c(0.9, 0.5), title, cex = tCex, adj = 0.5)
}
# Generating clusters in subspaces to test the approach
library(MASS)
source("panelFunctions.r")
myMVdat <- function(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2,
ntot, dtot) {
d1 <- length(cen1)
d2 <- length(cen2)
cluster1 <- mvrnorm(n = n1, mu = cen1, Sigma = cov1)
cluster2 <- mvrnorm(n = n2, mu = cen2, Sigma = cov2)
full <- matrix(rnorm(ntot * dtot), ncol = dtot)
full[row1:(row1 + n1 - 1), col1:(col1 + d1 - 1)] <- cluster1
full[row2:(row2 + n2 - 1), col2:(col2 + d2 - 1)] <- cluster2
return(full)
}
# Describe components cluster 1
n1 <- 1000
d1 <- 3
cen1 <- rep(0, d1)
cor1 <- matrix(c(1, 0.5, -0.3, 0.5, 1, -0.4, -0.3, -0.4, 1), ncol = 3, byrow = T)
sd1 <- c(0.05, 0.02, 0.03)
cov1 <- diag(sd1) %*% cor1 %*% diag(sd1)
row1 <- 1001
col1 <- 3
# cluster 2
n2 <- 1500
d2 <- 5
cen2 <- rep(0.5, d2)
cov2 <- 4e-04 * diag(d2)
row2 <- 8001
col2 <- 8
# background
ntot <- 10000
dtot <- 15
# build test data set
testDat <- myMVdat(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2, ntot,
dtot)
# Data, entropy tables and views
testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nSplit <- floor(logb(sqrt(nr/35), base = 2))
classDat <- apply(testDat, 2, nestedMeansClass, nSplit)
ceTable1 <- matrix(0, nrow = nc, ncol = nc)
for (j in 1:(nc - 1)) {
for (i in (j + 1):nc) {
tableIJ <- table(classDat[, i], classDat[, j])
ceTable1[i, j] <- conditionalEntropy(tableIJ)
}
}
scaledEntropyView(ceTable1, title = c("Scaled Conditional Entropy", "Original Data"))
# Moving cluster centers to lower density locations
cen1 <- rep(-2, 3)
cen2 <- rep(1.5, 5)
testDat <- myMVdat(n1, cen1, cov1, row1, col1, n2, cen2, cov2, row2, col2, ntot,
dtot)
testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nSplit <- floor(logb(sqrt(nr/35), base = 2))
classDat <- apply(testDat, 2, nestedMeansClass, nSplit)
ceTable2 <- matrix(0, nrow = nc, ncol = nc)
for (j in 1:(nc - 1)) {
for (i in (j + 1):nc) {
tableIJ <- table(classDat[, i], classDat[, j])
ceTable2[i, j] <- conditionalEntropy(tableIJ)
}
}
scaledEntropyView(ceTable2, title = c("Scaled Conditional Entropy", "Clusters Moved To Lower Density Region"))
# Trying other classifying function
testDim <- dim(testDat)
nr <- testDim[1]
nc <- testDim[2]
nClass <- floor(sqrt(nr/35))
xClass <- equalClass(x, nClass)
classDat <- apply(testDat, 2, equalClass, nClass)
ceTable3 <- matrix(0, nrow = nc, ncol = nc)
for (j in 1:(nc - 1)) {
for (i in (j + 1):nc) {
tableIJ <- table(classDat[, i], classDat[, j])
ceTable3[i, j] <- conditionalEntropy(tableIJ)
}
}
scaledEntropyView(ceTable3, title = "Equal Count Classes")