week3

Is it easier to have it uniform when n is larger?

library(ggplot2)
library(plgp)

## Loading required package: mvtnorm

## Loading required package: tgp

load("testn.RData")
row.names(out2) <- NULL
out <- as.data.frame(out2)
out <- transform(out2, n = as.factor(out$n), m = out$m, k = out$k, ksopt = out$ksopt )
ggplot(out[out$m==1,], aes(x=n, y=ksopt)) +
  geom_boxplot() + labs(title="dim = 1")

ggplot(out[out$m==2,], aes(x=n, y=ksopt)) +
  geom_boxplot() + labs(title="dim = 2")

ggplot(out[out$m==3,], aes(x=n, y=ksopt)) +
  geom_boxplot() + labs(title="dim = 3")

When dim = 1, as sample size increases, it’s more and more difficult to make it uniform.

But when dim = 2 and 3, the ksopt, optimized ks-distance decreases first then increases.

The pattern looks very interesting. Probably when n is large like 100, changing only one point at each time is not efficient enough to make it converge to the minimum after 10^5 iterations. But for 1-d I’m still concerned about whether a uniform(0,1) is possible or not.

Redo experiment with more replicates to see the effect of k: number of points changed in each iteration

load("testk1.RData")
tesk <- as.data.frame(testk)
par(mfcol=c(1,3))

for(i in 1:3){
  temp <- 10*i
  testk1 <- testk[testk[,1]==temp,]
  boxplot(ksopt~k, testk1, xlab = "k", ylab  = "ks-dist", main = paste0("n=",temp,",dim=1,dmax=0.9"))
}

load('testk.RData')
testk <- as.data.frame(testk)

for(i in 1:3){
  temp <- 10*i
  testk1 <- testk[testk$n==temp,]
  boxplot(ksopt~k, testk1[testk1$m==1,], xlab = "k", ylab  = "ks-dist", main = paste0("n=",temp,",dim=1,dmax=1"))
}

for(i in 1:3){
  temp <- 10*i
  testk1 <- testk[testk$n==temp,]
  boxplot(ksopt~k, testk1[testk1$m==2,], xlab = "k", ylab  = "ks-dist", main = paste0("n=",temp,",dim=2,dmax=1"))
}

From those boxplots, xlab is k, number of points changed in each iteration; ylab is ksopt, optimized ks-distance after 10^5 iterations.

Maximin initial is used here.

when n=10,20,30, dim=1, dmax=0.9 or 1, with k increases, final ksopt is increasing.

When n=10,20,30, dim=2, dmax=1, with k increases final ksopt is still increasing.

It seems in all the cases, k=1 is better.

compare running time of different k

library(rbenchmark)
benchmark("k=1" = {t1 <- ud.1(30,1,1,10^5,1)},
          "k=2" = {t2 <- ud.1(30,1,2,10^5,1)},
          "k=3" = {t3 <- ud.1(30,1,3,10^5,1)},
          "k=4" = {t4 <- ud.1(30,1,10,10^5,1)},
          "k=5" = {t5 <- ud.1(30,1,15,10^5,1)},
          replications = 1,
          columns = c("test", "replications", "elapsed",
                      "relative", "user.self", "sys.self"))

##   test replications elapsed relative user.self sys.self
## 1  k=1            1  54.988    1.017    54.948    0.000
## 2  k=2            1  54.303    1.004    54.272    0.004
## 3  k=3            1  56.626    1.047    56.516    0.000
## 4  k=4            1  54.066    1.000    53.312    0.012
## 5  k=5            1  54.187    1.002    53.876    0.004

Just run 1 replication here, I have run more replicates, the result indicate that when n=30, dim=1, k=1,2,3,10,15, the time used to finish 10^5 runs are almost same.

For one time run with 10^5 interations, it takes about 30 seconds.

Compare the two designs as follows:

load('test2ds.RData')
row.names(out1) <- NULL
row.names(out2) <- NULL
out1 <- as.data.frame(out1)
out2 <- as.data.frame(out2)
out1 <- cbind(out1, method = "random initial")# random initial
out2 <- cbind(out2, method = "maximin initial")# maximin initial
out <- rbind(out1,out2)
out <- transform(out, n = as.factor(n), method = as.factor(method))

ggplot(out[out$m==1,], aes(x=n, y=ksopt, fill=method)) +
  geom_boxplot() + labs(title="dim = 1, k = 1, 30 replicates")

ggplot(out[out$m==2,], aes(x=n, y=ksopt, fill=method)) +
  geom_boxplot()+ labs(title="dim = 2, k = 1, 30 replicates")

I compared unifdist with random initial and maximin initial:

It turns out that when n = 10,20,30, dim=1,2, k=1, final ks distance is not significantly reduced after using maximin initial.

It seems using maximin initial is not very helpful to make the design more uniform.

Test the design

1-d in GP rmse

Although the design is not as uniform as we expect, I started to run some GP to check whether this design helps in prediction.

I know the goal of the project is to show that unifdist works better in estimating the “d”.

So, I fixed the nugget paramter as 0.2. I guess the more exact “d” is, the better prediction it will gives.

load("GP-1d.RData")
boxplot(maximin.rmse, ud.rmse, ud.rmse1, main = "rmse with 10 replications", col = c(2,3,4))
legend("topleft", c("maximin", "unifdist dmax=0.9", "unifdist dmax=1"), col=c(2,3,4), pch = c(1,1,1))

It looks like maximin design is better than unifdist in rmse.

The code I used for “GP-1d.RData” is attached below. Not pretty confident about it.

library(laGP)
source('ud.R')
## true surface with input x from 0 to 1
f <- function(x) (sin(pi*10*x/5) + 0.2*cos(4*pi*10*x/5)) + rnorm(1, sd=0.2)

## return the rmse, with input design x and y(x)
gpi.rmse <- function(x, f){
  if(class(x)!="matrix"){ stop("x must be a matrix")}
  y <- do.call('f', list(x))
  gpi <- newGP(x, y, d = 0.1, g = 0.2, dK = T) 
  mle <- mleGP(gpi, param = "d",  tmax = max(dist(x)))
  xx <- matrix(seq(0, 1, length=40), ncol = 1)
  yytrue <- do.call('f', list(xx))
  p <- predGP(gpi, xx)
  deleteGP(gpi)
  rmse <- sqrt(mean((yytrue - p$mean)^2))
  return(rmse)
}

# 1-d with 10 replicates 
R <- 10
ud.rmse <- rep(NA, R)
ud.rmse1 <- rep(NA, R)
maximin.rmse <- rep(NA, R)
for( i in 1:R){
  x <- ud.11(10,1,1,0.9)$X ## function with stopping condition: ksopt <=0.07 or T >=10^6
  ud.rmse[i] <- gpi.rmse(x,f)
  x <- ud.11(10,1,1,1)$X
  ud.rmse1[i] <- gpi.rmse(x,f)
  x <- maximin(10,1)
  maximin.rmse[i] <- gpi.rmse(x,f)
}

1-d in generating data by GP definition

Probably this is the correct approach to do the simulation. I generated gp with nugget = 0 and specifying nugget “g” = 0 in function newGP.

I haven’t run this with more iterations and under more settings.

My plan is to run this at d = seq(0,0.1,length.out=10) and seq(0.8,1,length.out=10), at least 30 replicates for each. It’s running right now.

library(plgp)
library(mvtnorm)
library(laGP)

## 
## Attaching package: 'laGP'

## The following object is masked from 'package:plgp':
## 
##     distance

## returns estimated d parameter
gpi.d <- function(x, d){
  n <- length(x)
  D <- distance(x)
  eps <- sqrt(.Machine$double.eps)
  sigma <- exp(-D/d + diag(eps,n))
  y <- rmvnorm(1, sigma=sigma)
  gpi <- newGP(x, y, d = 0.1, g = 0, dK = T) 
  mle <- mleGP(gpi, param = "d",  tmax = max(dist(x)))
  deleteGP(gpi)
  return(c(dtrue=d, dhat = mle$d))
}

d <- 0.8

ud1 <- ud.1(10,1,1,10^5, 0.9)
edfPlot(ud1)

designPlot(ud1)

x <- ud1$X
gpi.d(x, d)

##     dtrue      dhat 
## 0.8000000 0.3120734

ud2 <- ud.1(10,1,1)
edfPlot(ud2)

designPlot(ud2)

x <- ud2$X
gpi.d(x, d)

##     dtrue      dhat 
## 0.8000000 0.2695604

x <- maximin(10,1)
plot(x, rep(0, length(x)))

gpi.d(x, d)

##    dtrue     dhat 
## 0.800000 0.237609

d <- 0.05

ud1 <- ud.1(10,1,1,10^5, 0.9)
edfPlot(ud1)

designPlot(ud1)

x <- ud1$X
gpi.d(x, d)

##      dtrue       dhat 
## 0.05000000 0.05942989

ud2 <- ud.1(10,1,1)
edfPlot(ud2)

designPlot(ud2)

x <- ud2$X
gpi.d(x, d)

##      dtrue       dhat 
## 0.05000000 0.03430093

x <- maximin(10,1)
plot(x, rep(0, length(x)))

gpi.d(x, d)

##      dtrue       dhat 
## 0.05000000 0.07584913