Goal: design points in \([0,1]^m\) space so that the distances are uniformly distributed
Algorithm:
The realization is as follows.
library(plgp)
## Loading required package: mvtnorm
## Loading required package: tgp
## generate unifdist design with n samples
# m: dimention of the space
# k: number of points changed in one iteration
# T: number of iterations
ud <- function(x){
n <- x[1]
m <- x[2]
k <- x[3]
T <- x[4]
dmax <- x[5]
## initial points
X1 <- matrix(runif(n*m), ncol=m)
dX1 <- dist(X1)
dX1 <- as.numeric(dX1[upper.tri(dX1)]) ## initial distances
du <- seq(from = .Machine$double.eps , to = dmax*sqrt(m), length.out = 1000) ## true uniform distances
kst <- ks.test(dX1, du) ## k-s test
ks1 <- kst$statistic ## k-s distance of two distributions
kspval1 <- kst$p.value ## k-s test p-val
# vector ks records the progress
ks <- rep(NA, T)
for(t in 1:T) {
row <- sample(1:n, k)
xold <- X1[row,] ## random row selection
X1[row,] <- matrix(runif(m*k), ncol = m) ## random new row
dprime <- dist(X1)
kstprime <- ks.test(dprime, du)
ksprime <- kstprime$statistic
kspvalprime <- kstprime$p.value
if(ksprime < ks1) { ks1 <- ksprime; ks[t] <- ksprime; kspval1 <- kspvalprime ## accept
} else { X1[row,] <- xold; ks[t] <- ks1 } ## reject
}
return(list(X = X1, ksprg = ks, ksopt = ks1, dmax = sqrt(m)*dmax, ks.pval = kspval1))
}
## function to plot empirical distances cdf and pdf of a design
library(EnvStats)
##
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
##
## predict, predict.lm
## The following object is masked from 'package:base':
##
## print.default
edfPlot <- function(ud.obj)
{
X <- ud.obj$X
dmax <- ud.obj$dmax
d <- dist(X)
par(mfcol = c(1,2))
plot(ecdf(d), xlab="distances", main="ecdf")
segments(0,0,dmax,1) ## add cdf of target uniform distribution
breaks <- seq(0, max(d), length.out = 10)
hist(d, breaks = breaks, freq = FALSE, xlab="distances", main="distance histogram")
segments(0,1/dmax,dmax,1/dmax)
lines(c(dmax,dmax), c(1/dmax, 0), lty = 2)
}
Let’s test the function with two arbitrary settings:
n=10, m=1, k=1, T=10^4, dmin=.Machine$double.eps, dmax=\(0.9\sqrt{m}\).
n=10, m=2, k=1, T=10^5, dmin=Machine$double.eps, dmax=\(0.9\sqrt{m}\).
x <- c(10,1,1,10^4, 0.9)
test01 <- ud(x)
x <- c(10,2,1,10^5, 0.9)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")
edfPlot(test01)
plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")
edfPlot(test02)
From the initial results,
For now Uniform(0, 0.9\(\sqrt{m}\)) is used, but when m increases, 0.1\(\sqrt{m}\) can be a big gap between simulated uniform dmax and real uniform dmax, \(\sqrt{m}\). I think there is a tradeoff between dmax and smallest K-S distance we can reach. Think about the extreme case, if set dmax = \(\sqrt{m}\), to get realization of this length, we have limited options like 1,2,4 in 1,2,3 dimention space respectively. But what’s the best choice for dmax?
The simulation results are much better than I expected, when dim >1, we can safely use dmax=1 with very high k-s test p-val. But when dim is 1, even we take dmax = 0.6, the p-val is still around 0.05.
## n = 10, m = 1, k = 1, T = 10^5
load('~/repos/unifdist/design/test1.RData')
pval <- rep(NA,20)
for(i in 1:20){
pval[i] <- test11[[i]]$lpval
}
plot(seq(0.6, 0.9, length.out = 20), pval, xlab="dmax")
lines(seq(0.6, 0.9, length.out = 20), pval)
for(i in 1:20){
pval[i] <- test12[[i]]$lpval
}
plot(seq(0.9, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.9, 1, length.out = 20), pval, xlab="dmax")
for(i in 1:20){
pval[i] <- test13[[i]]$lpval
}
plot(seq(0.95, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.95, 1, length.out = 20), pval, xlab="dmax")
for(i in 1:20){
pval[i] <- test14[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
for(i in 1:20){
pval[i] <- test15[[i]]$lpval
}
plot(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
lines(seq(0.98, 1, length.out = 20), pval, xlab="dmax")
redo the initial testing with dmax = 1:
x <- c(10,1,1,10^5, 1)
test01 <- ud(x)
x <- c(10,2,1,10^5, 1)
test02 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=1, k=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")
edfPlot(test01)
plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=10, m=2, k=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")
edfPlot(test02)
load('~/repos/unifdist/design/test2.RData')
# 1-d
par(mfcol = c(2,5))
# n = 10,m = 1, dmax = 0.6
for(i in 1:10){
plot(test21[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}
# 2-d
# n = 10,m = 2, dmax = 1
for(i in 1:10){
plot(test22[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}
# 3-d
# n = 10,m = 3, dmax = 1
for(i in 1:10){
plot(test23[[i]]$ksprg, xlab="time", ylab= "ks dist", main=paste("k=", i))
}
load('~/repos/unifdist/design/test3.RData')
pval <- rep(NA, 18)
for(i in 1:18){
pval[i] <- test3[[i]]$ks.pval
}
plot(kk[,1], pval, xlab = "n", main = "m = 2, k = 1, T = 10^6, dmax = 1")
lines(kk[,1], pval)
abline(h=0.05, col="red", lty = 2)
edfPlot(test3[[17]])
edfPlot(test3[[18]])
x <- c(9,1,1,10^5, 1)
test01 <- ud(x)
x <- c(9,2,1,10^5, 1)
test02 <- ud(x)
x <- c(9,3,1,10^5, 1)
test03 <- ud(x)
par(mfcol = c(1,2))
plot(test01[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=1, k=1, dmax=1")
plot(cbind(test01[[1]],0), col = "red", xlab = "x", ylab = "")
edfPlot(test01)
plot(test02[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=2, k=1, dmax=1")
plot(test02[[1]], col = "red", xlab = "x1", ylab = "x2")
edfPlot(test02)
plot(test03[[2]], ylab = "K-S distance", xlab = "time", main = "n=9, m=3, k=1, dmax=1")
plot(test03[[1]], col = "red")
edfPlot(test03)