Simulating Correlated Data with Different Types of Variables

This report presents a small performance test of two data simulation methods. Both methods are expected to produce correlated continuous, categorical and binary variables from a single multivariate normal distribution.

To test the performance, a fake data set, which I will call kaf, was used. Note that in an actual simulation study a population covariance matrix and the descriptives of the variables (means, SDs and category probabilities) would be enough.
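Purely as an illustration, such inputs could be specified directly instead of being estimated from a data set; a minimal sketch with made-up values (the names pop.mu, pop.sigma and cat.probs are not used elsewhere in this report):

# illustrative population parameters (made up, not taken from kaf)
pop.mu    <- c(0, 0, 0)                       # means of the continuous variables
pop.sigma <- matrix(c(1.0, 0.4, 0.3,
                      0.4, 1.0, 0.5,
                      0.3, 0.5, 1.0), nrow=3) # population covariance matrix
cat.probs <- c(0.2, 0.5, 0.3)                 # category probabilities for one ordinal variable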

This report will proceed as follows:

  1. Using rmvnorm and categorizing based on probabilities.

  2. Using rmvnorm and categorizing based on Tannenbaum et al. [1]

  3. Comparison

  4. Different types of correlations

1. The ordinary approach

The task was to create different types of predictors for use in a Monte Carlo study. One possible alternative is to use the mvtnorm package to create correlated continuous variables and then categorize them.

Let me start by introducing kaf, our fake data set, with 9 variables and 1000 observations; here is head(kaf):

require(knitr)
## Loading required package: knitr
kable(head(kaf))
| bin1 | bin2 | bin3 | cat1 | cat2 | cat3 | con1       | con2       | con3       |
|------|------|------|------|------|------|------------|------------|------------|
| 2    | 2    | 2    | 3    | 3    | 3    | -0.1290336 | 0.2080767  | 0.7340849  |
| 1    | 1    | 1    | 5    | 3    | 2    | -0.6969824 | 0.8908636  | 0.3099231  |
| 2    | 2    | 2    | 3    | 3    | 3    | 0.1310828  | -0.1660467 | 0.6380111  |
| 1    | 1    | 1    | 4    | 3    | 1    | -0.9223883 | 0.1643777  | -0.0336724 |
| 2    | 2    | 2    | 2    | 3    | 1    | 0.1979196  | -0.5187818 | -0.5243418 |
| 1    | 1    | 1    | 4    | 3    | 1    | -0.4614307 | 0.3624084  | 0.1371253  |

As you can see, in order, there are 3 binary, 3 categorical and 3 continuous variables. Further, the continuous variables are standard normal. Let's look at the covariance matrix:

kafcov=round(cov(kaf),3)
kafcov[upper.tri(kafcov)] <- ""
kable(kafcov)
|      | bin1  | bin2   | bin3   | cat1   | cat2   | cat3  | con1  | con2  | con3 |
|------|-------|--------|--------|--------|--------|-------|-------|-------|------|
| bin1 | 0.247 |        |        |        |        |       |       |       |      |
| bin2 | 0.133 | 0.23   |        |        |        |       |       |       |      |
| bin3 | 0.214 | 0.135  | 0.248  |        |        |       |       |       |      |
| cat1 | 0.18  | 0.197  | 0.194  | 1.378  |        |       |       |       |      |
| cat2 | 0.013 | -0.006 | -0.001 | -0.016 | 0.374  |       |       |       |      |
| cat3 | 0.277 | 0.232  | 0.274  | 0.288  | 0.129  | 1.064 |       |       |      |
| con1 | 0.339 | 0.291  | 0.313  | 0.492  | 0.101  | 0.717 | 1     |       |      |
| con2 | 0.113 | 0.151  | 0.124  | 0.711  | 0.127  | 0.313 | 0.48  | 1     |      |
| con3 | 0.148 | 0.109  | 0.144  | 0.361  | 0.327  | 0.344 | 0.395 | 0.335 | 1    |

More on kaf

# means for binary and continuous variables
kafmeans=data.frame(means=colMeans(kaf[,c(1:3,7:9)]))
kable(kafmeans)
|      | means |
|------|-------|
| bin1 | 1.558 |
| bin2 | 1.358 |
| bin3 | 1.547 |
| con1 | 0.000 |
| con2 | 0.000 |
| con3 | 0.000 |
# frequencies for categorical variables
require(Kmisc)
longkaf=melt_(kaf, measure.vars=c("cat1","cat2","cat3"))
require(ggplot2)
ggplot(longkaf, aes(x=factor(value), fill=variable)) +
  geom_bar(colour="black") +   # bar chart of the factor levels (geom_histogram needs a continuous x)
  facet_grid(variable ~ .) +
  ggtitle("Frequency of Factor Levels in Kaf") +
  theme(legend.position="none")

Percentages of the categorical variables:

apply(kaf[,4:6],2,function(x){table(x)/nrow(kaf)}) 
## $cat1
## x
##     1     2     3     4     5 
## 0.032 0.121 0.249 0.205 0.393 
## 
## $cat2
## x
##     1     2     3 
## 0.081 0.142 0.777 
## 
## $cat3
## x
##     1     2     3     4 
## 0.285 0.304 0.264 0.147

Let's try to mimic kaf using rmvnorm:

require(mvtnorm)
n=nrow(kaf)
set.seed(2358)
simdata=rmvnorm(n,mean=as.vector(colMeans(kaf)),sigma=cov(kaf))

Let's categorize the simulated data and evaluate:

# categorize a simulated variable s so that it reproduces the category proportions of the observed variable o
categorizer = function(o, s){
  prob = cumsum(table(o))/length(o)               # cumulative category proportions of o
  ncp  = as.vector(c(min(s), quantile(s, prob)))  # cut points: sample quantiles of s at those proportions
  return(cut(s, right=T, include.lowest=T, breaks=ncp, labels=names(table(o))))
}
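As a quick check (illustrative, not from the original report), applying the function to a single column should reproduce kaf's category proportions almost exactly:

table(categorizer(kaf[,4], simdata[,4])) / n  # compare with table(kaf[,4]) / n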

simkaf = simdata
for (i in 1:6) {
  simkaf[,i] = categorizer(kaf[,i], simkaf[,i])
}
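The bias tables in 1-a) and 1-b) below could be produced along these lines (a sketch of an assumed evaluation step, since the original code is not shown; it relies on the categorized columns of simkaf being stored as numeric codes on the same 1-to-k scale as kaf):

# assumed evaluation step, not shown in the original report
bias.means <- colMeans(simkaf) - colMeans(kaf)  # bias of the simulated means
bias.cov   <- round(cov(simkaf) - cov(kaf), 3)  # bias of the simulated covariance matrix

The same two lines, with simtan in place of simkaf, would give the corresponding tables for the Tannenbaum approach in section 2.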
1-a) Bias of the simulated means (\(x_{sim}-x\))

|      | Bias       |
|------|------------|
| bin1 | 0.0000000  |
| bin2 | 0.0000000  |
| bin3 | 0.0000000  |
| cat1 | 0.0000000  |
| cat2 | 0.0000000  |
| cat3 | 0.0000000  |
| con1 | -0.0252327 |
| con2 | -0.0256821 |
| con3 | -0.0267577 |
1-b) Bias of the simulated covariance matrix

|      | bin1   | bin2   | bin3   | cat1   | cat2   | cat3   | con1  | con2   | con3  |
|------|--------|--------|--------|--------|--------|--------|-------|--------|-------|
| bin1 | 0      |        |        |        |        |        |       |        |       |
| bin2 | -0.035 | 0      |        |        |        |        |       |        |       |
| bin3 | -0.045 | -0.035 | 0      |        |        |        |       |        |       |
| cat1 | -0.041 | -0.016 | -0.04  | 0      |        |        |       |        |       |
| cat2 | -0.019 | 0.016  | -0.005 | 0.031  | 0      |        |       |        |       |
| cat3 | -0.056 | -0.045 | -0.051 | -0.001 | -0.032 | 0      |       |        |       |
| con1 | -0.069 | -0.063 | -0.049 | 0      | -0.021 | -0.006 | 0.029 |        |       |
| con2 | -0.011 | -0.019 | -0.007 | -0.093 | -0.031 | 0.021  | 0.024 | -0.027 |       |
| con3 | -0.021 | -0.02  | -0.014 | 0.004  | -0.097 | 0.029  | 0.053 | 0.016  | 0.002 |
1-c) Frequencies of the simulated categorical variables

Percentages of the simulated categorical variables:

apply(simkaf[,4:6],2,function(x){table(x)/nrow(simkaf)}) 
## $cat1
## x
##     1     2     3     4     5 
## 0.032 0.121 0.249 0.205 0.393 
## 
## $cat2
## x
##     1     2     3 
## 0.081 0.142 0.777 
## 
## $cat3
## x
##     1     2     3     4 
## 0.285 0.304 0.264 0.147

As you can see, the frequencies are identical to kaf's. This is by construction: the cut points are the sample quantiles of each simulated variable taken at kaf's cumulative category proportions, so the categorized data reproduce those proportions exactly.

2. Tannenbaum et al.

The simulation and evaluation steps will be the same, but the categorizer function will be different. Categorization will be based on the formula: \[ CrV(\mu, \sigma, P_i) = e^{\mu + \sigma \cdot \mathrm{NORMINV}(P_i)} \]
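NORMINV is the Excel name for the inverse of the standard normal CDF; its R counterpart is qnorm, so the formula translates directly (a one-line illustration, not the author's code):

crv <- function(mu, sigma, p) exp(mu + sigma * qnorm(p))  # cut point for cumulative probability p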

kaf2=kaf
categorizer2=function(var.s,var.o) {

  le=length(table(var.o))                            # number of categories
  mu=mean(log(var.o))                                # mean of the log of the observed variable
  sd=sd(log(var.o))                                  # sd of the log of the observed variable
  prob=as.vector(cumsum(table(var.o))/length(var.o)) # cumulative probabilities of the categories

  if (le==2){
    crv=exp(mu+(sd*qnorm(prob[1])))                  # single cut point for a binary variable
    return(ifelse(var.s>crv,2,1))
  }

  if (le>2){
    crv=exp(mu+(sd*qnorm(prob[-le])))                # cut points from the normal quantiles
    breaks=c(min(var.s),crv,max(var.s))
    return(cut(var.s,right=T,include.lowest=T,breaks=breaks,
               labels=names(table(var.o))))
  } # close if
} # close function

simtan=simdata
for (i in 1:6){
  simtan[,i]=categorizer2(simdata[,i],kaf2[,i])
}

The simulation is complete; let's evaluate.

2-a) Bias of the simulated means with Tannenbaum approach

|      | Bias       |
|------|------------|
| bin1 | 0.0580000  |
| bin2 | 0.0680000  |
| bin3 | 0.0450000  |
| cat1 | 0.1190000  |
| cat2 | 0.1270000  |
| cat3 | 0.1190000  |
| con1 | -0.0252327 |
| con2 | -0.0256821 |
| con3 | -0.0267577 |
2-b) Bias of the simulated covariance matrix with Tannenbaum approach

|      | bin1   | bin2   | bin3   | cat1   | cat2   | cat3   | con1  | con2   | con3  |
|------|--------|--------|--------|--------|--------|--------|-------|--------|-------|
| bin1 | -0.01  |        |        |        |        |        |       |        |       |
| bin2 | -0.028 | 0.015  |        |        |        |        |       |        |       |
| bin3 | -0.046 | -0.023 | -0.006 |        |        |        |       |        |       |
| cat1 | -0.035 | -0.031 | -0.042 | -0.069 |        |        |       |        |       |
| cat2 | -0.014 | 0.021  | 0.007  | 0.023  | -0.144 |        |       |        |       |
| cat3 | -0.074 | -0.041 | -0.061 | -0.027 | -0.049 | -0.09  |       |        |       |
| con1 | -0.079 | -0.052 | -0.052 | -0.023 | -0.038 | -0.026 | 0.029 |        |       |
| con2 | -0.012 | -0.02  | -0.005 | -0.105 | -0.058 | -0.002 | 0.024 | -0.027 |       |
| con3 | -0.02  | -0.013 | -0.01  | -0.009 | -0.171 | 0.016  | 0.053 | 0.016  | 0.002 |
2-c) Frequencies of the simulated categoricals with Tannenbaum approach

Percentages of the simulated categoricals with the Tannenbaum approach:

apply(simtan[,4:6],2,function(x){table(x)/nrow(simtan)}) 
## $cat1
## x
##     1     2     3     4     5 
## 0.035 0.085 0.229 0.222 0.429 
## 
## $cat2
## x
##     1     2     3 
## 0.042 0.093 0.865 
## 
## $cat3
## x
##     1     2     3     4 
## 0.238 0.261 0.372 0.129

3. Comparison

In terms of replicating the means, the ordinary approach seems to outperform Tannenbaum's approach.

In terms of replicating the covariance matrix, the two methods perform roughly equally, with one exception. The third categorical variable seems to be distributed more normally than the other categorical variables, and there Tannenbaum's approach slightly outperforms the ordinary approach. Further evaluation is needed.

4. Different correlations

When there are different types of variables, the Pearson correlation matrix might be misleading. Let's use the polycor package, whose hetcor() function returns a heterogeneous correlation matrix (Pearson, polyserial and polychoric correlations, as appropriate for each pair of variables).

library(polycor)
## Loading required package: sfsmisc
round(hetcor(kaf)$correlations,2) 
##      bin1  bin2  bin3  cat1  cat2 cat3 con1 con2 con3
## bin1 1.00  0.82  0.99  0.43  0.06 0.69 0.93 0.29 0.38
## bin2 0.82  1.00  0.83  0.50 -0.05 0.62 0.72 0.38 0.29
## bin3 0.99  0.83  1.00  0.46 -0.03 0.68 0.88 0.31 0.37
## cat1 0.43  0.50  0.46  1.00 -0.07 0.31 0.52 0.67 0.34
## cat2 0.06 -0.05 -0.03 -0.07  1.00 0.31 0.26 0.26 0.72
## cat3 0.69  0.62  0.68  0.31  0.31 1.00 0.74 0.33 0.35
## con1 0.93  0.72  0.88  0.52  0.26 0.74 1.00 0.48 0.41
## con2 0.29  0.38  0.31  0.67  0.26 0.33 0.48 1.00 0.34
## con3 0.38  0.29  0.37  0.34  0.72 0.35 0.41 0.34 1.00
round(hetcor(simkaf)$correlations,2) 
##       bin1 bin2  bin3 cat1  cat2 cat3 con1 con2 con3
## bin1  1.00 0.64  0.88 0.32 -0.03 0.57 0.67 0.26 0.32
## bin2  0.64 1.00  0.64 0.45  0.05 0.50 0.61 0.36 0.24
## bin3  0.88 0.64  1.00 0.35 -0.03 0.57 0.66 0.30 0.33
## cat1  0.32 0.45  0.35 1.00  0.02 0.27 0.44 0.57 0.33
## cat2 -0.03 0.05 -0.03 0.02  1.00 0.22 0.17 0.22 0.50
## cat3  0.57 0.50  0.57 0.27  0.22 1.00 0.73 0.35 0.39
## con1  0.67 0.61  0.66 0.44  0.17 0.73 1.00 0.50 0.44
## con2  0.26 0.36  0.30 0.57  0.22 0.35 0.50 1.00 0.36
## con3  0.32 0.24  0.33 0.33  0.50 0.39 0.44 0.36 1.00
round(hetcor(simtan)$correlations,2) 
##       bin1 bin2 bin3 cat1  cat2 cat3 con1 con2 con3
## bin1  1.00 0.67 0.89 0.35 -0.01 0.56 0.67 0.27 0.33
## bin2  0.67 1.00 0.69 0.41  0.10 0.51 0.60 0.34 0.24
## bin3  0.89 0.69 1.00 0.37  0.04 0.58 0.66 0.31 0.34
## cat1  0.35 0.41 0.37 1.00  0.02 0.27 0.44 0.59 0.33
## cat2 -0.01 0.10 0.04 0.02  1.00 0.28 0.20 0.22 0.48
## cat3  0.56 0.51 0.58 0.27  0.28 1.00 0.74 0.34 0.39
## con1  0.67 0.60 0.66 0.44  0.20 0.74 1.00 0.50 0.44
## con2  0.27 0.34 0.31 0.59  0.22 0.34 0.50 1.00 0.36
## con3  0.33 0.24 0.34 0.33  0.48 0.39 0.44 0.36 1.00
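To push the comparison a bit further, one could summarize how far each simulated heterogeneous correlation matrix is from kaf's, for example with the mean absolute difference over the lower triangle; a rough sketch (not part of the original report):

# sketch: mean absolute deviation from kaf's heterogeneous correlation matrix
hc.kaf    <- hetcor(kaf)$correlations
hc.simkaf <- hetcor(simkaf)$correlations
hc.simtan <- hetcor(simtan)$correlations
mean(abs(hc.simkaf - hc.kaf)[lower.tri(hc.kaf)])  # ordinary approach
mean(abs(hc.simtan - hc.kaf)[lower.tri(hc.kaf)])  # Tannenbaum approach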

Reference [1]: Tannenbaum et al., http://link.springer.com/article/10.1007%2Fs10928-006-9033-1