getMeans <- function(Cprev, X,K){
p = dim(X)[2]
# create a K x P matrix for means
M = matrix(data = 0, nrow = K, ncol = p)
# combine X and C so we don't have to carry indices
XC = cbind(X,Cprev)
Ccol = p + 1 # cluster column number
# form new means based on previous cluster assignments
for (k in 1:K) {
in_cluster_k = subset(XC,XC[,Ccol]==k)
M[k,] = colMeans(in_cluster_k[,-Ccol])
}
return(M)
}
reCluster <- function(X,M,K){
n = dim(X)[1]
C = matrix(data = 0, nrow = n, ncol = 1)
# for each sample calculate distance to each mean
# assign cluster to that of closest mean
for (i in 1:n) {
eucdist = matrix(data = NA, nrow = 1, ncol = K)
for (j in 1:K) {
eucdist[j] = stats::dist(rbind(X[i,],M[j,]))
}
C[i] = ramify::argmin(eucdist)
}
return(C)
}
KMCiter <- function(Clast, X, M, K) {
mnew = getMeans(Clast,X,K)
cnew = reCluster(X,mnew,K)
return(mnew,cnew)
}
# Use this to print digits from phi(x) transpose (one "row" of our feature map)
zimage <- function(vec) {
img <- matrix(vec[1:256],nrow=16,ncol=16);
img <- t(apply(-img, 1, rev))
image(img,col=grey(seq(0,1,length=256)))
}
# Load up training data
train <- as.data.frame(read.table(file="zip.train", header=FALSE))
test <- as.data.frame(read.table(file="zip.test", header=FALSE))
# Extract samples as a single digit; put into matrix form
# Examine the first column and only take 3's and 5's
X.train1 <- as.matrix(subset(train,V1 == 1)) # take only the #1s
X.train2 <- as.matrix(subset(train,V1 == 2)) # take only the #2s
X.train3 <- as.matrix(subset(train,V1 == 3)) # take only the #3s
X.train4 <- as.matrix(subset(train,V1 == 4)) # take only the #4s
X.train5 <- as.matrix(subset(train,V1 == 5)) # take only the #5s
X.train6 <- as.matrix(subset(train,V1 == 6)) # take only the #6s
X.train7 <- as.matrix(subset(train,V1 == 7)) # take only the #7s
X.train8 <- as.matrix(subset(train,V1 == 8)) # take only the #8s
X.train9 <- as.matrix(subset(train,V1 == 9)) # take only the #9s
X.train0 <- as.matrix(subset(train,V1 == 0)) # take only the #0s
X.train14 <- rbind(X.train1,X.train4) # combine the two numbers
# <digit, 0-9> <pixel value 1 (of 256)> <pixel 2> ... <pixel 256>
#*****************************************#
## START HERE ##
## select number of clusters (K)
# data range
# number cycles
#*****************************************#
K = 10
numIts = 10
# Great now get rid of the actual labels
X <- test[,-1]
n = dim(X)[1]
# Randomize cluster assignments
c0 = matrix(data = 0, nrow = n, ncol = 1)
for (i in 1:n) {
c0[i] = ceiling(runif(1,min = 0, max = K))
}
plot(c0,
main="Randomized cluster assignments",
xlab = "Indices of items",
ylab = "Cluster Number")
cluster_history = c0
clast = c0
# Do a step 1
means_last = getMeans(clast,X,K)
means_history = means_last
# Do a step 2
cnew = reCluster(X,means_last,K)
p = dim(X)[2]
tempMeans = matrix(data = NA, nrow = K, ncol = 256)
for (i in 2:numIts) {
mnew = getMeans(clast,X,K)
means_history = cbind(means_history,mnew)
tempMeans = mnew
cnew = reCluster(X,mnew, K)
cluster_history = cbind(cluster_history,cnew)
plot(cnew)
clast = cnew
}
# Look at our clusters
for (i in 1:K) {
zimage(tempMeans[i,])
}
b <- 1 # run all from here
```