NORTA Algorithm

A brief example of the NORmal-To-Anything (NORTA) algorithm.

First show a scatterplot of two multivariate random normal variables with a correlation of 0.7.

library(MASS)
library(ggplot2)

npoints <- 1000
mu <- rep(0,4) # Support up to 4 variables at once
Sigma <- matrix(.7, nrow=4, ncol=4) + diag(4)*.3
rawvars <- mvrnorm(n=npoints, mu=mu, Sigma=Sigma)
normXY <- data.frame(x=rawvars[,1], y=rawvars[,2])

g <- ggplot(normXY, aes(x=x,y=y))
g <- g + geom_point(size=1) + stat_smooth(method='lm',se=FALSE,color='red')
plot(g)

A similar plot using a uniform distribution created using NORTA.

unifvars <- pnorm(rawvars)
#unifvars <- qunif(pnorm(rawvars)) # qunif not needed, but shows how to convert to other distributions
cor(unifvars)
##           [,1]      [,2]      [,3]      [,4]
## [1,] 1.0000000 0.6933571 0.6596281 0.6598211
## [2,] 0.6933571 1.0000000 0.6678996 0.6634066
## [3,] 0.6596281 0.6678996 1.0000000 0.6520740
## [4,] 0.6598211 0.6634066 0.6520740 1.0000000
unifXY <- data.frame(x=unifvars[,1], y=unifvars[,2])

g <- ggplot(unifXY, aes(x=x,y=y))
g <- g + geom_point(size=1) + stat_smooth(method='lm',se=FALSE,color='red')
plot(g)

Now a Poisson distribution. Notice the additional lambda argument needed to specify the distribution. Because the variables are discrete this should really have point size proportional to count.

# Note qpois requires an additional lambda argument
poisvars <- qpois(pnorm(rawvars), 2)
cor(poisvars)
##           [,1]      [,2]      [,3]      [,4]
## [1,] 1.0000000 0.6595986 0.6223229 0.6195183
## [2,] 0.6595986 1.0000000 0.6397520 0.6492565
## [3,] 0.6223229 0.6397520 1.0000000 0.6180326
## [4,] 0.6195183 0.6492565 0.6180326 1.0000000
poisXY <- data.frame(x=poisvars[,1], y=poisvars[,2])

# Better to make point size proportional to count here
g <- ggplot(poisXY, aes(x=x,y=y))
g <- g + geom_point(size=1) + stat_smooth(method='lm',se=FALSE,color='red')
plot(g)

Now use NORTA to extend http://blog.ouseful.info/2014/12/17/sketching-scatterplots-to-demonstrate-different-correlations/
to support different distributions.

First define the utility functions we will use.

# Create a multi-variate <dist> distribution with <samples> samples and correlation <r>
# Distribution names as in help("Distributions")
# The distribution names are used to form the quantile function name q<dist>
# Currently additional arguments to the quantile function are not supported
corrdata <- function(samples=200, r=0, dist="norm"){
  data = mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(1, r, r, 1), nrow=2), empirical=TRUE)
  distfunname <- paste0("q", dist)
  if (!exists(distfunname)) { error(paste("Function", distfunname, "does not exist")) }
  
  # Convert the multi-variate normal variables to the desired distribution using NORTA
  data <- eval(parse(text=paste0(distfunname, "(pnorm(data))")))
  
  # Check that the correlation has not changed
  writeLines(paste("Desired correlation for dist", dist, "was", r,
              "  Actual correlation was", round(cor(data)[1,2],3)))
  
  data.frame(x=data[,1], y=data[,2])
}

plotcorrdata <- function(samples=200, dist="norm") {
  df <- data.frame()
  for (i in c(1,0.8,0.5,0.2,0,-0.2,-0.5,-0.8,-1)) {
    tmp <- corrdata(samples, i, dist)
    tmp['corr'] <- i
    df <- rbind(df, tmp)
  }
  
  g <- ggplot(df, aes(x=x,y=y)) + geom_point(size=1)
  g <- g + facet_wrap(~corr) + stat_smooth(method='lm',se=FALSE,color='red')
  g <- g + coord_fixed() +  ggtitle(paste("Scatterplots for Correlated", dist, "Random Variables"))
  plot(g)
}

Correlation Plots

Now show the plots for different distributions.

Notice that the consistency of the correlations of the post-NORTA variables varies for different distributions. Behaviour of the NORTA Method for Correlated Random Vector Generation as the Dimension Increases has some relevant commentary (search for exponential).

dists <- c("norm", "unif", "exp")

for (dist in dists) {
  plotcorrdata(samples=200, dist=dist)
}
## Desired correlation for dist norm was 1   Actual correlation was 1
## Desired correlation for dist norm was 0.8   Actual correlation was 0.8
## Desired correlation for dist norm was 0.5   Actual correlation was 0.5
## Desired correlation for dist norm was 0.2   Actual correlation was 0.2
## Desired correlation for dist norm was 0   Actual correlation was 0
## Desired correlation for dist norm was -0.2   Actual correlation was -0.2
## Desired correlation for dist norm was -0.5   Actual correlation was -0.5
## Desired correlation for dist norm was -0.8   Actual correlation was -0.8
## Desired correlation for dist norm was -1   Actual correlation was -1

## Desired correlation for dist unif was 1   Actual correlation was 1
## Desired correlation for dist unif was 0.8   Actual correlation was 0.78
## Desired correlation for dist unif was 0.5   Actual correlation was 0.527
## Desired correlation for dist unif was 0.2   Actual correlation was 0.176
## Desired correlation for dist unif was 0   Actual correlation was -0.014
## Desired correlation for dist unif was -0.2   Actual correlation was -0.157
## Desired correlation for dist unif was -0.5   Actual correlation was -0.471
## Desired correlation for dist unif was -0.8   Actual correlation was -0.786
## Desired correlation for dist unif was -1   Actual correlation was -1

## Desired correlation for dist exp was 1   Actual correlation was 1
## Desired correlation for dist exp was 0.8   Actual correlation was 0.801
## Desired correlation for dist exp was 0.5   Actual correlation was 0.422
## Desired correlation for dist exp was 0.2   Actual correlation was 0.141
## Desired correlation for dist exp was 0   Actual correlation was 0.068
## Desired correlation for dist exp was -0.2   Actual correlation was -0.195
## Desired correlation for dist exp was -0.5   Actual correlation was -0.376
## Desired correlation for dist exp was -0.8   Actual correlation was -0.542
## Desired correlation for dist exp was -1   Actual correlation was -0.646