Uniform Distance design generation

Goal: design points in $[0,1]^m$ space so that the distances are uniformly distributed

Algorithm:

The design starts with n m-dim uniform ditribution X1
calculate k-s distance of empirical distance distribution and true uniform distribution Uniform[0, dmax]
randomly update k points if new k-s distance is smaller
repeat T times

The realization is as follows.

library(plgp)

## Loading required package: mvtnorm

## Loading required package: tgp

## generate unifdist design with n samples
# m: dimention of the space
# k: number of points changed in one iteration
# T: number of iterations
ud <- function(x){
  n <- x[1]
  m <- x[2]
  k <- x[3]
  T <- x[4]
  dmax <- x[5]
  ## initial points  
  X1 <- matrix(runif(n*m), ncol=m)
  dX1 <- dist(X1)
  dX1 <- as.numeric(dX1[upper.tri(dX1)])  ## initial distances
  du <- seq(from = .Machine$double.eps , to = dmax*sqrt(m), length.out = 1000)  ## true uniform distances
  kst <- ks.test(dX1, du) ## k-s test
  ks1 <- kst$statistic ## k-s distance of two distributions
  kspval1 <- kst$p.value ## k-s test p-val
  # vector ks records the progress
  ks <- rep(NA, T)
  for(t in 1:T) {
    row <- sample(1:n, k)
    xold <- X1[row,]       ## random row selection
    X1[row,] <- matrix(runif(m*k), ncol = m)   ## random new row
    dprime <- dist(X1)
    kstprime <- ks.test(dprime, du)
    ksprime <- kstprime$statistic
    kspvalprime <- kstprime$p.value
    if(ksprime < ks1) { ks1 <- ksprime; ks[t] <- ksprime; kspval1 <- kspvalprime  ## accept
    } else { X1[row,] <- xold; ks[t] <- ks1   }         ## reject
  }
  return(list(X = X1, ksprg = ks, ksopt = ks1, dmax = sqrt(m)*dmax, ks.pval = kspval1))
}

## function to plot empirical distances cdf and pdf of a design
library(EnvStats)

## 
## Attaching package: 'EnvStats'

## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm

## The following object is masked from 'package:base':
## 
##     print.default

edfPlot <- function(ud.obj)
{
  X <- ud.obj$X
  dmax <- ud.obj$dmax
  d <- dist(X)
  par(mfcol = c(1,2))
  plot(ecdf(d), xlab="distances", main="ecdf")
  segments(0,0,dmax,1)  ## add cdf of target uniform distribution
  breaks <- seq(0, max(d), length.out = 10)
  hist(d, breaks = breaks, freq = FALSE, xlab="distances", main="distance histogram")
  segments(0,1/dmax,dmax,1/dmax)
  lines(c(dmax,dmax), c(1/dmax, 0), lty = 2)
}

Let’s test the function with two arbitrary settings:

n=10, m=1, k=1, T=10^4, dmin=.Machine$double.eps, dmax=$0.9\sqrt{m}$.
n=10, m=2, k=1, T=10^5, dmin=Machine$double.eps, dmax=$0.9\sqrt{m}$.

x <- c(10,1,1,10^4, 0.9)
test01 <- ud(x)
x <- c(10,2,1,10^5, 0.9)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

From the initial results,

How to represent the true uniform distribution

For now Uniform(0, 0.9$\sqrt{m}$) is used, but when m increases, 0.1$\sqrt{m}$ can be a big gap between simulated uniform dmax and real uniform dmax, $\sqrt{m}$. I think there is a tradeoff between dmax and smallest K-S distance we can reach. Think about the extreme case, if set dmax = $\sqrt{m}$, to get realization of this length, we have limited options like 1,2,4 in 1,2,3 dimention space respectively. But what’s the best choice for dmax?

The larger the better if we can have final p-val of k-s test larger than 0.05?
what if the dmax we find from this way is too small?
uniform or having various distances is more important?
Any GP theory can support the conclusion?

The simulation results are much better than I expected, when dim >1, we can safely use dmax=1 with very high k-s test p-val. But when dim is 1, even we take dmax = 0.6, the p-val is still around 0.05.

## n = 10, m = 1, k = 1, T = 10^5
load('~/repos/unifdist/design/test1.RData')
pval <- rep(NA,20)
for(i in 1:20){
  pval[i] <- test11[[i]]$lpval
}
plot(seq(0.6, 0.9, length.out = 20), pval, xlab="dmax")
lines(seq(0.6, 0.9, length.out = 20), pval)

for(i in 1:20){
  pval[i] <- test12[[i]]$lpval
}
plot(seq(0.9, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.9, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test13[[i]]$lpval
}
plot(seq(0.95, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.95, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test14[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")

for(i in 1:20){
  pval[i] <- test15[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")

redo the initial testing with dmax = 1:

x <- c(10,1,1,10^5, 1)
test01 <- ud(x)
x <- c(10,2,1,10^5, 1)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

What factors can affect the convergence?

With no theory support, it’s hard to say the algorithm has converged to global optimal or local optimal value.
number of points changed in each iteration may affect this.

load('~/repos/unifdist/design/test2.RData')
# 1-d
par(mfcol = c(2,5))
# n = 10,m = 1, dmax = 0.6
for(i in 1:10){
  plot(test21[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}

# 2-d
# n = 10,m = 2, dmax = 1
for(i in 1:10){
  plot(test22[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}

# 3-d
# n = 10,m = 3, dmax = 1
for(i in 1:10){
  plot(test23[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}

what happens when n gets larger?

load('~/repos/unifdist/design/test3.RData')
pval <- rep(NA, 18)
for(i in 1:18){
  pval[i] <- test3[[i]]$ks.pval
}
plot(kk[,1], pval, xlab = "n", main = "m = 2, k = 1, T = 10^6, dmax = 1")
lines(kk[,1], pval)
abline(h=0.05, col="red", lty = 2)

edfPlot(test3[[17]])

edfPlot(test3[[18]])

See the design on 1-d, 2-d and 3-d

x <- c(9,1,1,10^5, 1)
test01 <- ud(x)
x <- c(9,2,1,10^5, 1)
test02 <- ud(x)
x <- c(9,3,1,10^5, 1)
test03 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=1, k=1, dmax=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")

edfPlot(test01)

plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=2, k=1, dmax=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")

edfPlot(test02)

plot(test03[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=3, k=1, dmax=1")
plot(test03[[1]], col = "red")

edfPlot(test03)

Uniform Distance design generation

Boya Zhang

January 28, 2018