### This experiment compares the integrated squared error (ISE) of the estimated lengthscale $\hat{d}$ for designs in $(0,1)^{dim}$ drawn from beta distributions, across 200 different (shape1, shape2) pairs (100 replicate seeds each), with dim = 2 and n = 8
library(plgp)
library(mvtnorm)
library(laGP)

library(foreach)
library(doParallel)
source("/home/boyazhang/repos/unifdist/code/ud.R")
Sys.time()
## Estimate the ISE of the lengthscale MLE for an n-point, dim-dimensional
## beta-distributed design with the given shape parameters, via I Monte Carlo draws
gpi.mse <- function(n, dim, shape1, shape2, I = 1000, seed = 0){
  set.seed(seed)
  ## beta-distributed design in (0,1)^dim (bd() is defined in ud.R)
  x <- bd(n = n, m = dim, shape1 = shape1, shape2 = shape2)$X
  D <- distance(x)
  eps <- sqrt(.Machine$double.eps)
  ## true lengthscales sampled uniformly, so the average squared error
  ## below approximates the ISE over (eps, sqrt(dim))
  dtrue <- runif(I, min = eps, max = sqrt(dim))
  dhat <- rep(NA, I)
  for(i in 1:I){
    ## isotropic Gaussian kernel with jitter added to the diagonal
    sigma <- exp(-D/dtrue[i]) + diag(eps, n)
    y <- rmvnorm(1, sigma = sigma)
    gpi <- newGP(x, y, d = 0.1, g = eps, dK = TRUE)
    dhat[i] <- mleGP(gpi, param = "d")$d
    deleteGP(gpi)
  }
  mse <- mean((dtrue - dhat)^2)
  return(c(n, dim, shape1, shape2, mse, seed))
}
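# Quick sanity check (illustrative only: these shape values and the small
# Monte Carlo budget I = 10 are arbitrary choices, not part of the study);
# the call should return a single (n, dim, shape1, shape2, mse, seed) row.
gpi.mse(n = 8, dim = 2, shape1 = 2, shape2 = 5, I = 10, seed = 1)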


## 200 maximin (shape1, shape2) pairs scaled to (0, 15)^2, each replicated
## 100 times, with a unique seed per row
A <- cbind(n = 8, dim = 2, maximin(200, 2)*15)
set <- matrix(rep(t(A), 100), ncol = ncol(A), byrow = TRUE)
set <- cbind(set, 1:nrow(set))
colnames(set) <- c("n", "dim", "shape1", "shape2", "seed")
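# Optionally inspect the space-filling design of shape parameters before the
# long run (this plot is for inspection only and is not required):
plot(A[,3], A[,4], xlab = "shape1", ylab = "shape2",
     main = "maximin design of beta shape parameters")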
cl <- makeCluster(8)
registerDoParallel(cl)
## run in parallel; failed fits are dropped (.errorhandling = "remove")
out <- foreach(i = 1:nrow(set), .errorhandling = "remove", .combine = 'rbind',
               .packages = c('plgp', 'mvtnorm', 'laGP')) %dopar% {
  gpi.mse(n = set[i,1], dim = set[i,2], shape1 = set[i,3], shape2 = set[i,4], seed = set[i,5])
}

stopCluster(cl)
save.image('beta_para_map_8_2.RData') 
Sys.time()
## analysis stage: reload the saved results (a no-op when run in one session)
load('/home/boyazhang/repos/unifdist/code/beta_para_map_8_2.RData')
out <- as.data.frame(out)
## gpi.mse() returns six values per run, so six column names are needed
colnames(out) <- c("n", "dim", "shape1", "shape2", "ise", "seed")
mse <- aggregate(out$ise, by = list(shape1 = out$shape1, shape2 = out$shape2), FUN = mean)
sds <- aggregate(out$ise, by = list(shape1 = out$shape1, shape2 = out$shape2), FUN = sd)  # "sds" avoids masking stats::sd
mse <- cbind(mse, sd = sds$x)
colnames(mse) <- c("shape1", "shape2", "mse_mean", "mse_sd")
# output table sorted by mse_mean (lowest first)
head(mse[order(mse$mse_mean),], 20)
##       shape1    shape2  mse_mean     mse_sd
## 176 3.651256 13.397477 0.2304892 0.03166256
## 196 4.034145 14.794726 0.2306478 0.02866238
## 158 3.428805 11.995707 0.2319428 0.02559908
## 79  1.981682  5.695770 0.2320065 0.03339808
## 63  1.879035  4.562303 0.2329406 0.03052574
## 134 2.641873 10.047732 0.2340100 0.03428219
## 148 3.124934 11.299004 0.2349306 0.02919165
## 93  2.574173  6.808900 0.2352787 0.02745491
## 185 4.106180 13.994361 0.2359472 0.03220258
## 128 3.599588  9.635423 0.2378964 0.02985340
## 108 3.033318  7.918321 0.2380762 0.02829393
## 139 4.050874 10.682492 0.2396768 0.02908662
## 74  2.596938  4.988586 0.2409075 0.03090302
## 121 2.662776  9.124940 0.2422933 0.03192830
## 82  2.747699  5.829846 0.2423237 0.02979726
## 188 3.331305 14.383827 0.2428657 0.07270420
## 197 4.995079 14.823944 0.2430010 0.02797787
## 175 5.074926 13.271385 0.2432985 0.03434203
## 147 4.629517 11.285873 0.2454061 0.03549417
## 50  2.044703  3.567676 0.2477159 0.03095388
#----------------------------------mse surface from plotly-------------------------
library(plotly)
p <- plot_ly(data = mse) %>%
  add_trace(x=~shape1, y=~shape2, z=~mse_mean, type="contour") %>%
  add_trace(x=~shape1, y=~shape2, type="scatter", mode="markers")
p
#---------------------------------- GP surface-----------------------------------------
library(hetGP)
X <- as.matrix(out[,3:4])
Z <- out$ise
nvar <- 2  # input dimension

## Model fitting
# settings <- list(return.hom = TRUE) # To keep homoskedastic model used for training
# model <- mleHetGP(X = X, Z = Z, lower = rep(10^(-4), nvar), upper = rep(50, nvar),
#                   covtype = "Gaussian", settings = settings)
## heteroskedastic GP fit to log(ISE); the log transform stabilizes the
## variance of the replicate errors
model <- mleHetGP(X = X, Z = log(Z), lower = rep(10^(-4), nvar), upper = rep(50, nvar),
                  covtype = "Matern5_2", maxit = 1999)

## A quick view of the fit                  
summary(model)
## N =  20000  n =  200  d =  2 
## Matern5_2  covariance lengthscale values of the main process:  3.263923 4.795559 
## Variance/scale hyperparameter:  1.792466 
## Summary of Lambda values: 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005637 0.007889 0.009211 0.016622 0.012138 0.089368 
## Estimated constant trend value:  -0.162234 
## Matern5_2  covariance lengthscale values of the log-noise process:  3.263923 4.795559 
## Nugget of the log-noise process:  1e-06 
## Estimated constant trend value of the log-noise process:  -2.727921 
## MLE optimization: 
##  Log-likelihood =  9521.79 ; Nb of evaluations (obj, gradient) by L-BFGS-B:  194 194 ; message:  CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH
xx <- seq(0, 15, length.out = 501)
xgrid <- as.matrix(expand.grid(xx, xx))
## note: predictions remain on the log(ISE) scale used for fitting
predictions <- predict(x = xgrid, object = model)

par(mfrow=c(1,1)); cols <- heat.colors(10)
## Display mean predictive surface
image(x = xx, y = xx, z = matrix(predictions$mean, ncol = length(xx)), xlab="shape1",ylab="shape2", col=cols, 
      main = "mean surface with raw data ranks and lowest 20 prediction locs")
text(mse$shape1, mse$shape2, labels = rank(mse$mse_mean))
dat_pred <- data.frame(shape1 = xgrid[,1], shape2 = xgrid[,2], pred_mean = predictions$mean, pred_sd2 = predictions$sd2)
dat_pred_top <- head(dat_pred[order(dat_pred$pred_mean),], 20)
points(dat_pred_top[,1], dat_pred_top[,2])

head(dat_pred_top, 1)
##       shape1 shape2 pred_mean     pred_sd2
## 86748   2.22   5.19 -1.501433 0.0002534557
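# Since the model was fit to log(Z), pred_mean is on the log(ISE) scale; a
# naive back-transform (ignoring any lognormal bias correction) recovers the
# MSE scale: exp(-1.5014) is about 0.223, consistent with the raw-data table above.
exp(dat_pred_top$pred_mean[1])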
# mse <- cbind(shape1 = xgrid[,1],shape2 = xgrid[,2], p_mean = predictions$mean, p_sd = predictions$sd2)
# p <- plot_ly(data = as.data.frame(mse)) %>%
#   add_trace(x=~shape1, y=~shape2, z=~p_mean, type="contour")
# p
#------------------------------------linear regression-------------------------------
## augment with the mean and variance of each Beta(shape1, shape2) distribution
mse <- transform(mse, beta_m = shape1/(shape1 + shape2),
                 beta_var = shape1*shape2/((shape1 + shape2 + 1)*(shape1 + shape2)^2),
                 y = log(mse_mean))
## plot beta mean vs. log(mse_mean)
# with raw data
plot(mse$beta_m, mse$y, xlab = "shape1/(shape1 + shape2)", ylab = "log(mse_mean)")

# with GP predictions (the eps in the denominator guards against 0/0 at the grid origin)
plot(xgrid[,1]/(xgrid[,1]+xgrid[,2]+.Machine$double.eps),predictions$mean, 
     xlab = "shape1/(shape1 + shape2)", ylab = "log(mse_mean)")

plot(mse$y, mse$mse_sd, xlab = "log(mse_mean)", ylab = "mse_sd")
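# A minimal regression sketch to match this section's header; the additive
# model form below is an assumption for illustration, not from the original:
fit <- lm(y ~ beta_m + beta_var, data = mse)
summary(fit)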