Weighted Constrained K-Means

In this demo, the electricity consumption of 12 buildings is collected. The project aims to setup a number of microgrids powered by solar panel generators and provide electricity to these buildings. While minimization of wiring cost is considered, the optimum assignment has to satisfy the capacity of solar panel generators. In other words, the research question is to group the buildings into a number of clusters such that the aggregate inter-cluster distance is minimized within each cluster while keeping the aggregate electricity consumption (demand) of each cluster at a moderate level defined by upper/lower bound. I develop a weighted constrained k-means approach to find the optimum solution.

Inspired by Bradley et al. (2000), I formulate the constrained k-means clustering as a linear programming problem: \[min_{C,T} \quad \Sigma _{i=1}^m \Sigma _{h=1}^k T_{i,h} ||x^i-C^h||^2\] \[s.t. \quad \Sigma _{h=1}^k T_{i,h}=1, \quad i=1,...,m\] \[\quad \quad \quad \quad \quad \quad L_{h,t} \leq \Sigma _{i=1}^m \sigma _iT_{i,h}\leq U_{h,t}, \quad h=1,...,k \quad t=1,...,\tau\] \[\quad \quad \quad \quad \quad \quad T_{i,h} \geq 0, \quad i=1,...,m, h=1,...,k.\] Note that \(T_{i,h}=1\) if data point \(x^i\) is closest to centroid \(C^h\) and zero otherwise.\(\sigma _i\) is the demand for data point \(x^i\). The second constraint requires that the aggregate demand of each cluster should be bounded by the upper bound \(U_{h,t}\) and lower bound \(L_{h,t}\) during the period \(t\).

It’s worth mentioning that centroid \(C^{h,t+1}\) at iteration \(t+1\) is calculated using centroid at iteration \(t\). The algorithm contains two steps: 1) initiate \(C^h\) and run the LP problem to determine the cluster assignment \(T_{ih}^t\), and 2) update \(C^{h,t+1}=\frac{\Sigma _{i=1}^m T_{i,h}^t x^i}{\Sigma _{i=1}^m T_{i,h}^t}\) until convergence.

An Example

I deploy the model on a real-world dataset which consists the demand and location of 12 buildings.

coord <- matrix(c(40.555181921935095, -74.27603516627407,    # Create longitude/latitude matrix
                  40.55620901014459, -74.28006920857614,
                  40.555915557978196,-74.27901778265698,
                  40.560415016521716, -74.27509102871402,
                  40.560480223842745, -74.2754128937913,
                  40.55449718770875, -74.26743063987445,
                  40.55607858711832, -74.28137812655713,
                  40.560741052491544, -74.2889312270376,
                  40.559208669626244, -74.27987608952975,
                  40.55720347736548, -74.28328785934906,
                  40.55609489001049, -74.27689347314684,
                  40.55444827785318, -74.28251538316356),
                  ncol = 2,byrow = TRUE)


d1 = c(283, 9,  30, 67, 56, 530,    24, 37, 98,   2,    6,  48) #demand for month 1
d2 = c(310, 9,  34, 60, 65, 491,    26, 32, 118,    5,  7,  43)
d3 = c(336, 8,  31, 50, 57, 465,    22, 34, 100,    5,  7,  41)
d4 = c(323, 8,  26, 49, 31, 472,    23, 33, 130,    4,  8,  45)
d5 = c(329, 7,  30, 28, 26, 385,    18, 40, 112,    6,  7,  40)
d6 = c(351, 10, 41, 16, 27, 436,    19, 67, 325,    12, 11, 47)
d7 = c(363, 12, 88, 17, 31, 422,    18, 132,    31, 15, 13, 54)
d8 = c(351, 12, 61, 21, 23, 371,    7,  31, 174,    14, 13, 52)
d9 = c(329, 12, 36, 26, 27, 404,    11, 121,    311,    14, 12, 57)
d10 = c(309,    10, 32, 20, 21, 297,    24, 62, 168,    4,  9,  47)
d11 = c(301,    9,  33, 32, 33, 352,    17, 7,  36, 12, 8,  44)
d12 = c(322,    8,  34, 46, 47, 347,    21, 11, 48, 2,  6,  40)

d = cbind(d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12)
  
demand <- c(363,12,88,67,65,530,26,132,325,15,13,57)  # this is peak demand calculated as max for each building during the 12-month period.

data = cbind(coord,demand)
data = cbind(data,d)
colnames(data)[1]="x"
colnames(data)[2]="y"
colnames(data)[3]="demand"
data = as.data.frame(data)

A scatter plot is created to visualize the dataset.

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

graph <- ggplot(data, aes(x=x, y=y,label=demand)) +geom_point(size=5)
graph +  geom_text(hjust=-0.36, vjust=0)

Set the number of clusters \(k\) and upper/lower bound for aggregate demand in each cluster

num_cluster =3
bu = c(601,602,603)
bl = c(201,202,203)

Run the weighted constrained K-means algorithm

library(lpSolve)

## Warning: package 'lpSolve' was built under R version 4.2.2

library(geodist)

## Warning: package 'geodist' was built under R version 4.2.3

# constraints on demand capacity
time = dim(d)[2]
building = dim(d)[1]
W = matrix(t(d),nrow=time)
Au = kronecker(diag(num_cluster), W) 
Al = kronecker(diag(num_cluster), W) 
bu = c(rep(bu[1],time),rep(bu[2],time),rep(bu[3],time))
bl = c(rep(bl[1],time),rep(bl[2],time),rep(bl[3],time))

# set up constraints to make sure each data point is assigned to one cluster only
zero = rep(0,length(demand))
zero[1] = 1
B = kronecker(diag(num_cluster),matrix(zero,nrow=1))
one = matrix(rep(1,num_cluster),nrow=1)
B = one%*%B
for (x in 2:length(demand)){
  zero = rep(0,length(demand))
  zero[x] = 1
  temp = kronecker(diag(num_cluster),matrix(zero,nrow=1))
  temp = one%*%temp
  B = rbind(B,temp)
}

b= rep(1,length(demand))

################################################
left = rbind(Au,Al,B)
right =c(bu,bl,b)
direction = c(rep("<=",length(bu)),rep(">=",length(bl)),rep("=",length(demand)))

set.seed(1)
cx <- runif(n = num_cluster, min = min(coord[,1]), max = max(coord[,1]))
cy <- runif(n = num_cluster, min = min(coord[,2]), max = max(coord[,2]))
cluster = cbind(cx,cy)


adj = as.matrix(geodist(rbind(cluster[1,],coord)))
coef = adj[1,2:(length(demand)+1)]
for (x in 2:num_cluster){
  adj = as.matrix(geodist(rbind(cluster[x,],coord)))
  coef = c(coef,adj[1,2:(length(demand)+1)])
}

optimum <-  lp(direction="min",
               objective.in = coef,
               const.mat = left,
               const.dir = direction,
               const.rhs = right,
               all.bin = TRUE)

result = optimum$solution

for (i in 1:80){
  #print(i)
  
  xnew = coord[,1]*result
  ynew = coord[,2]*result
  cnewx = c()
  cnewy = c()
  for (x in 1:num_cluster){
    tmpx = sum(xnew[((x-1)*length(demand)+1):((x)*length(demand))])/sum(xnew[((x-1)*length(demand)+1):((x)*length(demand))] != 0)
    tmpy = sum(ynew[((x-1)*length(demand)+1):((x)*length(demand))])/sum(ynew[((x-1)*length(demand)+1):((x)*length(demand))] != 0)
    cnewx = c(cnewx,tmpx)
    cnewy = c(cnewy,tmpy)
  }
  clusternew = cbind(cnewx,cnewy)
  
  adj = as.matrix(geodist(rbind(clusternew[1,],coord)))
  coef = unname(adj[1,2:(length(demand)+1)])
  
  for (j in 2:num_cluster){
    adj = as.matrix(geodist(rbind(clusternew[j,],coord)))
    coef = c(coef,unname(adj[1,2:(length(demand)+1)]))
  }
  
  optimum <-  lp(direction="min",
                 objective.in = coef,
                 const.mat = left,
                 const.dir = direction,
                 const.rhs = right,
                 all.bin = TRUE)
  result = optimum$solution
  
}

group =rep(0,length(demand))
for (i in 1:num_cluster){
  group = group + result[((i-1)*length(demand)+1):((i)*length(demand))]*i
}

data$k <- kmeans(data[,1:2],3)$cluster
data$group <- group

We create the time series of electricity consumption for each cluster as follows.It can be shown that the variation of electricity consumption in each cluster is constrained by the pre-defined upper and lower bound during the 12-month period.

library(doBy)

## Warning: package 'doBy' was built under R version 4.2.3

ts = t(summaryBy(d ~ group, data=data,FUN=sum)[,2:(time+1)])
ts = cbind(ts,as.integer(seq(1,time,1)))
colnames(ts)[num_cluster+1] = "month"
colnames(ts)[1]="g1"
colnames(ts)[2]="g2"
colnames(ts)[3]="g3"
ts = as.data.frame(ts)

ggplot(ts, aes(x=month)) + 
  geom_line(aes(y = g1), color = "green") + 
  geom_line(aes(y = g2), color="black", linetype="twodash") +
  geom_line(aes(y = g3), color="red", linetype="longdash") + scale_x_continuous(breaks =seq(1,time,1)) + ylab("demand")

We also visualize the clustering result using the peak electricity consumption of each building. It can be shown that the capacity constraints are satisfied for each of the cluster,i.e., \(demand_{\ group1}=530\), \(demand_{\ group2}=567\), and \(demand_{\ group3}=596\). The distribution of electricity supply has achieved a balance across mocrogrids.

graph <- ggplot(data, aes(x=x, y=y,color=group,label=demand)) +geom_point(size=5)
graph +  geom_text(hjust=-0.33, vjust=0)

We can also compare the result with original K-means clustering. It can be shown that the capacity constraint is not satisfied for cluster 2 (dark blue) where aggregate demand exceeds 600.

graph <- ggplot(data, aes(x=x, y=y,color=k,label=demand)) +geom_point(size=5)
graph +  geom_text(hjust=-0.33, vjust=0)

Weighted Constrained K-Means

2023-07-26

Weighted Constrained K-Means

An Example

Extensions

References