About

Compositional data are tricky to handle as predictors. Here I explore this issue with simulations and suggest solutions.

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, poliscidata, broom)

#helper: transform a matrix of values into row-wise proportions
make_compositional = function(x) {
  #proportions don't make sense for negative values, so take absolute values
  abs_x = abs(x)
  
  #divide each row by its sum, so rows sum to 1
  abs_x / rowSums(abs_x)
}
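
A quick demo of the helper on made-up values (mine, not part of the simulations below):

#a 2x2 matrix with a negative entry; each row comes out as proportions summing to 1
make_compositional(matrix(c(1, -2, 3, 4), ncol = 2))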

Example 1

#simulate uncorrelated normal data, then convert to row proportions (100k rows, 5 parts)
set.seed(1)
ex1 = rnorm(500e3) %>% matrix(ncol = 5) %>% set_colnames(LETTERS[1:5]) %>% make_compositional() %>% as_tibble()

#looks right?
ex1 %>% head() %>% {
  .$sum = rowSums(.)
  .
}
#cors of the proportions are negative due to the sum-to-1 constraint
ex1 %>% wtd.cors()
##        A      B      C      D      E
## A  1.000 -0.247 -0.252 -0.251 -0.251
## B -0.247  1.000 -0.249 -0.251 -0.255
## C -0.252 -0.249  1.000 -0.251 -0.247
## D -0.251 -0.251 -0.251  1.000 -0.245
## E -0.251 -0.255 -0.247 -0.245  1.000
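
Why about -.25? With k exchangeable parts forced to sum to 1, the variance of the row sum is zero, which pins the average pairwise correlation at exactly -1/(k - 1). A one-line check for our k = 5:

#expected pairwise correlation under the sum-to-1 constraint, k = 5
-1 / (5 - 1)
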
#simulate an outcome based on parts
#note the betas here
ex1 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + E * .1 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once does not work: the parts sum to 1, so they are perfectly collinear with the intercept
ols(Y ~ A + B + C + D + E, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D + E, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              5    R2 adj   0.348    
##  d.f.  99994    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept  0.0996 6.93e+10 0.00 1.0000  
##  A          0.0990 6.93e+10 0.00 1.0000  
##  B          0.3014 6.93e+10 0.00 1.0000  
##  C         -0.1950 6.93e+10 0.00 1.0000  
##  D         -0.3037 6.93e+10 0.00 1.0000  
##  E                 6.93e+10              
## 
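
The failure is perfect multicollinearity: the parts sum to 1 in every row, so the intercept column is an exact linear combination of the five parts. Base lm() should make this visible by aliasing one term (a quick check of my own, using only base R):

#lm() detects the rank deficiency and returns NA for one aliased part
lm(Y ~ A + B + C + D + E, data = ex1) %>% coef()
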
#one part at a time gives biased slopes because the parts are (negatively) correlated
#these estimates are noticeably off, despite the tiny SEs
map_df(LETTERS[1:5], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex1)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
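
The bias has a simple source: the univariate slope for A is cov(Y, A)/var(A), which expands to A's true beta plus the other betas weighted by their (negative) covariances with A. A numeric check (betas is my shorthand for the true coefficients above):

#both lines should agree up to the noise term's sampling covariance
betas = c(A = .2, B = .4, C = -.1, D = -.2, E = .1)
with(ex1, cov(Y, A) / var(A))
sum(betas * cov(as.matrix(ex1[LETTERS[1:5]]))[, "A"]) / var(ex1$A)
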
#a simple solution is to leave one part out, which then serves as the reference
#the slopes become contrasts against the excluded part, so adding the intercept to each slope recovers the true betas
#the intercept itself is the beta for the excluded part
#here we exclude A, whose true beta is .2
ols(Y ~ B + C + D + E, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D + E, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              4    R2 adj   0.348    
##  d.f.  99995    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.1986 0.0019  105.64 <0.0001 
##  B          0.2024 0.0029   69.00 <0.0001 
##  C         -0.2940 0.0029 -100.33 <0.0001 
##  D         -0.4027 0.0029 -137.33 <0.0001 
##  E         -0.0990 0.0029  -33.73 <0.0001 
## 
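
To make the recovery explicit, here is a minimal sketch (fit_loo and b are my own names) that folds the intercept back into the slopes:

#recover the true betas: intercept = excluded part's beta,
#intercept + slope = each included part's beta
fit_loo = ols(Y ~ B + C + D + E, data = ex1)
b = coef(fit_loo)
c(A = unname(b["Intercept"]), b["Intercept"] + b[-1])
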
#let's exclude another part instead: E, whose true beta is .1
#note also that the R2 fits are identical across exclusions
ols(Y ~ A + B + C + D, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              4    R2 adj   0.348    
##  d.f.  99995    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.0996 0.0019   52.83 <0.0001 
##  A          0.0990 0.0029   33.73 <0.0001 
##  B          0.3014 0.0029  102.92 <0.0001 
##  C         -0.1950 0.0029  -66.34 <0.0001 
##  D         -0.3037 0.0029 -103.17 <0.0001 
## 

Example 2

The above method works in most cases, but it can get into trouble if the proportions have certain structures. It should work best when there is a large group that is present everywhere and accounts for most of the variance in Y. Let's try violating that.

#simulate data without a single group being present everywhere
ex2 = c(rep(0, 50e3), rnorm(50e3),
  rep(0, 50e3), rnorm(50e3),
  rnorm(50e3), rep(0, 50e3),
  rnorm(50e3), rep(0, 50e3)) %>% 
  matrix(ncol = 4) %>% 
  make_compositional() %>% 
  set_colnames(LETTERS[1:4]) %>% 
  as_tibble()

#looks right? the first 50k rows vary only in C and D, the last 50k only in A and B
ex2 %>% head()
ex2 %>% tail()
#simulate an outcome based on parts
#note the betas here
ex2 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once fails again due to the same collinearity
ols(Y ~ A + B + C + D, data = ex2)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               4    R2 adj   0.839    
##  d.f.  99995    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept -0.1981 4.34e+09 0.00 1.0000  
##  A          0.4002 4.34e+09 0.00 1.0000  
##  B          0.5964 4.34e+09 0.00 1.0000  
##  C          0.0964 4.34e+09 0.00 1.0000  
##  D                 4.34e+09              
## 
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex2)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
#leave one out?
ols(Y ~ B + C + D, data = ex2) #still works
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               3    R2 adj   0.839    
##  d.f.  99996    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.2021 0.0010  209.55 <0.0001 
##  B          0.1963 0.0017  114.69 <0.0001 
##  C         -0.3037 0.0014 -222.64 <0.0001 
##  D         -0.4002 0.0014 -293.10 <0.0001 
## 
ols(Y ~ A + B + C, data = ex2) #yep
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               3    R2 adj   0.839    
##  d.f.  99996    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept -0.1981 0.0010 -204.97 <0.0001 
##  A          0.4002 0.0014  293.10 <0.0001 
##  B          0.5964 0.0014  436.47 <0.0001 
##  C          0.0964 0.0017   56.34 <0.0001 
## 
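
And the recovered betas agree whichever part is excluded. A quick check (b_noA and b_noD are my own names; the true betas are A = .2, B = .4, C = -.1, D = -.2):

#fold the intercept back in for both exclusions and compare
b_noA = coef(ols(Y ~ B + C + D, data = ex2))
b_noD = coef(ols(Y ~ A + B + C, data = ex2))
rbind(
  excl_A = c(A = unname(b_noA["Intercept"]), b_noA["Intercept"] + b_noA[-1]),
  excl_D = c(b_noD["Intercept"] + b_noD[-1], D = unname(b_noD["Intercept"]))
)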

Example 3

OK, what if we have groups of unequal size? Can we leave out a tiny group, or does the approach work best when the large one is excluded? Or does it make no difference?

#simulate data with groups of very unequal size
ex3 = tibble(
  A = rnorm(100e3, mean = 100),
  B = rnorm(100e3, mean = 10),
  C = rnorm(100e3, mean = 1),
  D = rnorm(100e3, mean = .1)
) %>% 
  make_compositional() %>% 
  as_tibble()

#looks right? A should dominate every row, with D contributing almost nothing
ex3 %>% head()
#simulate an outcome based on parts
#note the betas here
ex3 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once fails yet again
ols(Y ~ A + B + C + D, data = ex3)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            4    R2 adj   0.001    
##  d.f.  99995    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept -0.2278 6.88e+10 0.00 1.0000  
##  A          0.4218 6.88e+10 0.00 1.0000  
##  B          0.6818 6.88e+10 0.00 1.0000  
##  C          0.1585 6.88e+10 0.00 1.0000  
##  D                 6.88e+10              
## 
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex3)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
#leave one out?
ols(Y ~ B + C + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.1940 0.0036 53.81 <0.0001 
##  B          0.2600 0.0388  6.70 <0.0001 
##  C         -0.2632 0.0450 -5.84 <0.0001 
##  D         -0.4218 0.0591 -7.14 <0.0001 
## 
ols(Y ~ A + C + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.4540 0.0353 12.87 <0.0001 
##  A         -0.2600 0.0388 -6.70 <0.0001 
##  C         -0.5232 0.0568 -9.22 <0.0001 
##  D         -0.6818 0.0683 -9.98 <0.0001 
## 
ols(Y ~ A + B + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -0.0693 0.0444 -1.56 0.1188  
##  A          0.2632 0.0450  5.84 <0.0001 
##  B          0.5232 0.0568  9.22 <0.0001 
##  D         -0.1585 0.0734 -2.16 0.0307  
## 
ols(Y ~ A + B + C, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -0.2278 0.0585 -3.89 <0.0001 
##  A          0.4218 0.0591  7.14 <0.0001 
##  B          0.6818 0.0683  9.98 <0.0001 
##  C          0.1585 0.0734  2.16 0.0307  
## 

So the nature of the compositional data does not appear to matter much: one can seemingly always use the leave-one-out-as-reference approach.
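
The recipe is easy to wrap in a small helper. This is my own sketch, not a function from any of the loaded packages:

#fit a leave-one-out model and return betas for all parts,
#with the intercept folded back into the slopes
loo_betas = function(data, parts, outcome = "Y", leave_out = parts[1]) {
  rhs = setdiff(parts, leave_out)
  fit = ols(reformulate(rhs, response = outcome), data = data)
  b = coef(fit)
  set_names(c(b["Intercept"], b["Intercept"] + b[rhs]), c(leave_out, rhs))
}

#the choice of excluded part shouldn't matter
loo_betas(ex1, LETTERS[1:5])
loo_betas(ex3, LETTERS[1:4], leave_out = "D")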