Goal: design points in \([0,1]^m\) space so that the distances are uniformly distributed

Algorithm:

The realization is as follows.

library(plgp)
## Loading required package: mvtnorm
## Loading required package: tgp
## generate unifdist design with n samples
# m: dimention of the space
# k: number of points changed in one iteration
# T: number of iterations
ud <- function(x){
  n <- x[1]
  m <- x[2]
  k <- x[3]
  T <- x[4]
  dmax <- x[5]
  ## initial points  
  X1 <- matrix(runif(n*m), ncol=m)
  dX1 <- dist(X1)
  dX1 <- as.numeric(dX1[upper.tri(dX1)])  ## initial distances
  du <- seq(from = .Machine$double.eps , to = dmax*sqrt(m), length.out = 1000)  ## true uniform distances
  kst <- ks.test(dX1, du) ## k-s test
  ks1 <- kst$statistic ## k-s distance of two distributions
  kspval1 <- kst$p.value ## k-s test p-val
  # vector ks records the progress
  ks <- rep(NA, T)
  for(t in 1:T) {
    row <- sample(1:n, k)
    xold <- X1[row,]       ## random row selection
    X1[row,] <- matrix(runif(m*k), ncol = m)   ## random new row
    dprime <- dist(X1)
    kstprime <- ks.test(dprime, du)
    ksprime <- kstprime$statistic
    kspvalprime <- kstprime$p.value
    if(ksprime < ks1) { ks1 <- ksprime; ks[t] <- ksprime; kspval1 <- kspvalprime  ## accept
    } else { X1[row,] <- xold; ks[t] <- ks1   }         ## reject
  }
  return(list(X = X1, ksprg = ks, ksopt = ks1, dmax = sqrt(m)*dmax, ks.pval = kspval1))
}
## function to plot empirical distances cdf and pdf of a design
library(EnvStats)
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## The following object is masked from 'package:base':
## 
##     print.default
edfPlot <- function(ud.obj)
{
  X <- ud.obj$X
  dmax <- ud.obj$dmax
  d <- dist(X)
  par(mfcol = c(1,2))
  plot(ecdf(d), xlab="distances", main="ecdf")
  segments(0,0,dmax,1)  ## add cdf of target uniform distribution
  breaks <- seq(0, max(d), length.out = 10)
  hist(d, breaks = breaks, freq = FALSE, xlab="distances", main="distance histogram")
  segments(0,1/dmax,dmax,1/dmax)
  lines(c(dmax,dmax), c(1/dmax, 0), lty = 2)
}

Let’s test the function with two arbitrary settings:

  1. n=10, m=1, k=1, T=10^4, dmin=.Machine$double.eps, dmax=\(0.9\sqrt{m}\).

  2. n=10, m=2, k=1, T=10^5, dmin=Machine$double.eps, dmax=\(0.9\sqrt{m}\).

x <- c(10,1,1,10^4, 0.9)
test01 <- ud(x)
x <- c(10,2,1,10^5, 0.9)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

From the initial results,

  1. How to represent the true uniform distribution

For now Uniform(0, 0.9\(\sqrt{m}\)) is used, but when m increases, 0.1\(\sqrt{m}\) can be a big gap between simulated uniform dmax and real uniform dmax, \(\sqrt{m}\). I think there is a tradeoff between dmax and smallest K-S distance we can reach. Think about the extreme case, if set dmax = \(\sqrt{m}\), to get realization of this length, we have limited options like 1,2,4 in 1,2,3 dimention space respectively. But what’s the best choice for dmax?

The simulation results are much better than I expected, when dim >1, we can safely use dmax=1 with very high k-s test p-val. But when dim is 1, even we take dmax = 0.6, the p-val is still around 0.05.

## n = 10, m = 1, k = 1, T = 10^5
load('~/repos/unifdist/design/test1.RData')
pval <- rep(NA,20)
for(i in 1:20){
  pval[i] <- test11[[i]]$lpval
}
plot(seq(0.6, 0.9, length.out = 20), pval, xlab="dmax")
lines(seq(0.6, 0.9, length.out = 20), pval)

for(i in 1:20){
  pval[i] <- test12[[i]]$lpval
}
plot(seq(0.9, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.9, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test13[[i]]$lpval
}
plot(seq(0.95, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.95, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test14[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test15[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")

redo the initial testing with dmax = 1:

x <- c(10,1,1,10^5, 1)
test01 <- ud(x)
x <- c(10,2,1,10^5, 1)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

  1. What factors can affect the convergence?
load('~/repos/unifdist/design/test2.RData')
# 1-d
par(mfcol = c(2,5))
# n = 10,m = 1, dmax = 0.6
for(i in 1:10){
  plot(test21[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}

# 2-d
# n = 10,m = 2, dmax = 1
for(i in 1:10){
  plot(test22[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}

# 3-d
# n = 10,m = 3, dmax = 1
for(i in 1:10){
  plot(test23[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}
  1. what happens when n gets larger?
load('~/repos/unifdist/design/test3.RData')
pval <- rep(NA, 18)
for(i in 1:18){
  pval[i] <- test3[[i]]$ks.pval
}
plot(kk[,1], pval, xlab = "n", main = "m = 2, k = 1, T = 10^6, dmax = 1")
lines(kk[,1], pval)
abline(h=0.05, col="red", lty = 2)

edfPlot(test3[[17]])

edfPlot(test3[[18]])

  1. See the design on 1-d, 2-d and 3-d
x <- c(9,1,1,10^5, 1)
test01 <- ud(x)
x <- c(9,2,1,10^5, 1)
test02 <- ud(x)
x <- c(9,3,1,10^5, 1)
test03 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=1, k=1, dmax=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=2, k=1, dmax=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

plot(test03[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=3, k=1, dmax=1")
plot(test03[[1]], col = "red")

edfPlot(test03)