Numerical constants for this exercise

# Number of factors, i.e. columns in the cluster and population matrices.
no_factors <- 7
# Number of dummy clusters to generate (one row per cluster).
no_clusters <- 9
# Number of individuals whose distance to every cluster is computed.
no_individuals <- 100

Create a data frame representing the clusters. In reality this would be the output of the clustering algorithm; here the clusters are random dummy values.

# Draw one random integer weight in 1..100 for every (cluster, factor) pair,
# laid out with one row per cluster and one column per factor.
m1 <- matrix(
  sample(1:100, size = no_factors * no_clusters, replace = TRUE),
  ncol = no_factors
)

# Scale each row by its own total so the factor weights of a cluster sum to 1,
# then wrap the result in a data frame (columns are auto-named X1, X2, ...).
clusters <- data.frame(m1 / rowSums(m1))

# Show the dummy cluster profiles.
clusters
##           X1         X2         X3          X4         X5         X6
## 1 0.14569536 0.02649007 0.25496689 0.009933775 0.29139073 0.24172185
## 2 0.11825193 0.03856041 0.14652956 0.172236504 0.05655527 0.22622108
## 3 0.18461538 0.21538462 0.07472527 0.125274725 0.01098901 0.20659341
## 4 0.02302632 0.02302632 0.29276316 0.144736842 0.17105263 0.26315789
## 5 0.17901235 0.19444444 0.17592593 0.061728395 0.21604938 0.15740741
## 6 0.06686047 0.14244186 0.28779070 0.215116279 0.11337209 0.04941860
## 7 0.07299270 0.23357664 0.13138686 0.018248175 0.19343066 0.09489051
## 8 0.30434783 0.13043478 0.07971014 0.217391304 0.06521739 0.17028986
## 9 0.01013514 0.17905405 0.19932432 0.084459459 0.25000000 0.16216216
##           X7
## 1 0.02980132
## 2 0.24164524
## 3 0.18241758
## 4 0.08223684
## 5 0.01543210
## 6 0.12500000
## 7 0.25547445
## 8 0.03260870
## 9 0.11486486

Create a data frame representing the remainder of the population which was not used to define the clusters

# Draw random integer weights in 1..100, one row per individual and one column
# per factor. The row count comes from `no_individuals` (previously a
# hard-coded 100) so it always matches the loop that walks over these rows.
m <- matrix(
  sample(1:100, size = no_factors * no_individuals, replace = TRUE),
  ncol = no_factors
)
remainder <- data.frame(m)

# Divide every row by its own total so each individual's weights sum to 1.
remainder <- remainder / rowSums(remainder)

# Preview the first few individuals.
head(remainder)
##          X1          X2         X3         X4         X5         X6
## 1 0.2037037 0.006172840 0.26234568 0.23148148 0.07716049 0.07098765
## 2 0.1768489 0.003215434 0.01286174 0.12540193 0.32154341 0.05144695
## 3 0.2912621 0.236245955 0.17152104 0.08090615 0.07766990 0.13592233
## 4 0.2197802 0.070329670 0.21978022 0.09010989 0.20659341 0.04615385
## 5 0.1974922 0.068965517 0.15673981 0.06583072 0.20376176 0.25391850
## 6 0.1298701 0.200000000 0.25974026 0.03636364 0.18441558 0.12467532
##            X7
## 1 0.148148148
## 2 0.308681672
## 3 0.006472492
## 4 0.147252747
## 5 0.053291536
## 6 0.064935065

The `dist` function calculates the Euclidean distance, in multi-dimensional factor space, between one individual and one particular cluster.

#' Euclidean distance between one individual and one cluster
#'
#' @param x Row index of the cluster in `centres`.
#' @param row Row index of the individual in `individuals`.
#' @param individuals Data frame with one row per individual. Defaults to the
#'   global `remainder`, which is what this function previously always read.
#' @param centres Data frame with one row per cluster and the same number of
#'   columns as `individuals`. Defaults to the global `clusters`.
#' @return A numeric vector with one element per selected row (length one for
#'   scalar indices), named after the selected row of `individuals`.
#'
#' NOTE(review): this name masks `stats::dist()`; it is kept unchanged so that
#' existing callers keep working.
dist <- function(x, row, individuals = remainder, centres = clusters) {
  # Element-wise difference between the individual's and the cluster's factors.
  difference <- individuals[row, ] - centres[x, ]
  sqrt(rowSums(difference^2))
}

In R, the `seq` function generates a sequence as a vector (not a list); called with a single whole number n, it returns the integers 1, 2, ..., n.

seq(no_clusters)
## [1] 1 2 3 4 5 6 7 8 9
# For a single positive whole number this is equivalent to the colon form
# below. (seq_len(no_clusters) is the safer spelling: it returns an empty
# vector, rather than c(1, 0), when the count is 0.)
1:no_clusters
## [1] 1 2 3 4 5 6 7 8 9
  1. for each individual:
    1. apply the distance function for each cluster
    2. convert results to data.frame and append the data.frame to the list
    3. have to rename the columns of the data.frame so they are consistent through the list
  2. at end collapse the list into a single data.frame
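  3. normalise each individual's cluster distances so that they sum to 1 (note that a larger share means the cluster is further away, so these values would need to be inverted before being read as probabilities of membership)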
# One slot per individual, allocated up front instead of growing the list
# inside the loop.
results <- vector("list", no_individuals)

# seq_len() yields an empty sequence for a count of 0, whereas seq(0) would
# iterate over c(1, 0).
for (x in seq_len(no_individuals)) {
  # Distance from individual `x` to every cluster, as a one-row data frame.
  d <- data.frame(lapply(seq_len(no_clusters), dist, x))
  # Give every one-row frame the same column names so rbind() can stack them.
  names(d) <- paste0("Cluster", seq_len(no_clusters))
  results[[x]] <- d
}

# Stack the per-individual rows into a single data frame.
cluster_dist <- do.call(rbind, results)

# Scale each row to sum to 1.
# NOTE(review): these are relative *distances*, so the closest cluster gets the
# smallest value; confirm whether an inverse transform is intended before
# treating them as membership probabilities.
cluster_dist <- cluster_dist / rowSums(cluster_dist)
head(cluster_dist)
##     Cluster1   Cluster2   Cluster3  Cluster4   Cluster5  Cluster6
## 1 0.13222059 0.08490346 0.11877297 0.1061305 0.11988502 0.0706410
## 2 0.11660761 0.09687717 0.11567905 0.1264595 0.11258247 0.1177923
## 3 0.12907321 0.13251866 0.08868462 0.1436998 0.06527542 0.1185207
## 4 0.10754449 0.11654886 0.13072409 0.1259010 0.08789766 0.0950394
## 5 0.06851449 0.11838132 0.12688732 0.1041709 0.07078905 0.1463948
## 6 0.10547524 0.14945714 0.13631003 0.1206893 0.05323609 0.1025768
##    Cluster7   Cluster8   Cluster9
## 1 0.1390566 0.10077994 0.12760993
## 2 0.0893212 0.12058456 0.10409614
## 3 0.1269800 0.07075179 0.12449573
## 4 0.1083555 0.12167851 0.10631043
## 5 0.1430562 0.11378611 0.10801982
## 6 0.1063741 0.15102650 0.07485482