About

Compositional data are tricky to handle as predictors. Here I explore this issue with simulations and suggest solutions.

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, poliscidata, broom)

#helper: transform a matrix of values into row-wise proportions
make_compositional = function(x) {
  #proportions don't make sense for negative values, so take absolute values
  abs_x = abs(x)
  
  #divide each row by its sum, so rows sum to 1
  abs_x / rowSums(abs_x)
}
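
A quick demo of the helper on made-up values (mine, not part of the simulations below):

#a 2x2 matrix with a negative entry; each row comes out as proportions summing to 1
make_compositional(matrix(c(1, -2, 3, 4), ncol = 2))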

Example 1

#simulate uncorrelated normal data, then convert to row proportions (100k rows, 5 parts)
set.seed(1)
ex1 = rnorm(500e3) %>% matrix(ncol = 5) %>% set_colnames(LETTERS[1:5]) %>% make_compositional() %>% as_tibble()

#looks right?
ex1 %>% head() %>% {
  .$sum = rowSums(.)
  .
}
#cors of the proportions are negative due to the sum-to-1 constraint
ex1 %>% wtd.cors()
##        A      B      C      D      E
## A  1.000 -0.247 -0.252 -0.251 -0.251
## B -0.247  1.000 -0.249 -0.251 -0.255
## C -0.252 -0.249  1.000 -0.251 -0.247
## D -0.251 -0.251 -0.251  1.000 -0.245
## E -0.251 -0.255 -0.247 -0.245  1.000
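
Why about -.25? With k exchangeable parts forced to sum to 1, the variance of the row sum is zero, which pins the average pairwise correlation at exactly -1/(k - 1). A one-line check for our k = 5:

#expected pairwise correlation under the sum-to-1 constraint, k = 5
-1 / (5 - 1)
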
#simulate an outcome based on parts
#note the betas here
ex1 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + E * .1 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once does not work: the parts sum to 1, so they are perfectly collinear with the intercept
ols(Y ~ A + B + C + D + E, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D + E, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              5    R2 adj   0.348    
##  d.f.  99994    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept  0.0996 6.93e+10 0.00 1.0000  
##  A          0.0990 6.93e+10 0.00 1.0000  
##  B          0.3014 6.93e+10 0.00 1.0000  
##  C         -0.1950 6.93e+10 0.00 1.0000  
##  D         -0.3037 6.93e+10 0.00 1.0000  
##  E                 6.93e+10              
## 
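
The failure is perfect multicollinearity: the parts sum to 1 in every row, so the intercept column is an exact linear combination of the five parts. Base lm() should make this visible by aliasing one term (a quick check of my own, using only base R):

#lm() detects the rank deficiency and returns NA for one aliased part
lm(Y ~ A + B + C + D + E, data = ex1) %>% coef()
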
#one part at a time gives biased slopes because the parts are (negatively) correlated
#these estimates are noticeably off, despite the tiny SEs
map_df(LETTERS[1:5], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex1)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
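
The bias has a simple source: the univariate slope for A is cov(Y, A)/var(A), which expands to A's true beta plus the other betas weighted by their (negative) covariances with A. A numeric check (betas is my shorthand for the true coefficients above):

#both lines should agree up to the noise term's sampling covariance
betas = c(A = .2, B = .4, C = -.1, D = -.2, E = .1)
with(ex1, cov(Y, A) / var(A))
sum(betas * cov(as.matrix(ex1[LETTERS[1:5]]))[, "A"]) / var(ex1$A)
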
#a simple solution is to leave one part out, which then serves as the reference
#the slopes become contrasts against the excluded part, so adding the intercept to each slope recovers the true betas
#the intercept itself is the beta for the excluded part
#here we exclude A, whose true beta is .2
ols(Y ~ B + C + D + E, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D + E, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              4    R2 adj   0.348    
##  d.f.  99995    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.1986 0.0019  105.64 <0.0001 
##  B          0.2024 0.0029   69.00 <0.0001 
##  C         -0.2940 0.0029 -100.33 <0.0001 
##  D         -0.4027 0.0029 -137.33 <0.0001 
##  E         -0.0990 0.0029  -33.73 <0.0001 
## 
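
To make the recovery explicit, here is a minimal sketch (fit_loo and b are my own names) that folds the intercept back into the slopes:

#recover the true betas: intercept = excluded part's beta,
#intercept + slope = each included part's beta
fit_loo = ols(Y ~ B + C + D + E, data = ex1)
b = coef(fit_loo)
c(A = unname(b["Intercept"]), b["Intercept"] + b[-1])
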
#let's exclude another part instead: E, whose true beta is .1
#note also that the R2 fits are identical across exclusions
ols(Y ~ A + B + C + D, data = ex1)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex1)
##  
##                  Model Likelihood      Discrimination    
##                     Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    42707.38    R2       0.348    
##  sigma0.1000    d.f.              4    R2 adj   0.348    
##  d.f.  99995    Pr(> chi2)   0.0000    g        0.083    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.111e-01 -6.743e-02 -9.678e-05  6.744e-02  4.553e-01 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.0996 0.0019   52.83 <0.0001 
##  A          0.0990 0.0029   33.73 <0.0001 
##  B          0.3014 0.0029  102.92 <0.0001 
##  C         -0.1950 0.0029  -66.34 <0.0001 
##  D         -0.3037 0.0029 -103.17 <0.0001 
## 

Example 2

The above method works in most cases, but it can get into trouble if the proportions have certain structures. It should work best when there is a large group that is present everywhere and accounts for most of the variance in Y. Let's try violating that.

#simulate data without a single group being present everywhere
ex2 = c(rep(0, 50e3), rnorm(50e3),
  rep(0, 50e3), rnorm(50e3),
  rnorm(50e3), rep(0, 50e3),
  rnorm(50e3), rep(0, 50e3)) %>% 
  matrix(ncol = 4) %>% 
  make_compositional() %>% 
  set_colnames(LETTERS[1:4]) %>% 
  as_tibble()

#looks right? the first 50k rows vary only in C and D, the last 50k only in A and B
ex2 %>% head()
ex2 %>% tail()
#simulate an outcome based on parts
#note the betas here
ex2 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once fails again due to the same collinearity
ols(Y ~ A + B + C + D, data = ex2)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               4    R2 adj   0.839    
##  d.f.  99995    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept -0.1981 4.34e+09 0.00 1.0000  
##  A          0.4002 4.34e+09 0.00 1.0000  
##  B          0.5964 4.34e+09 0.00 1.0000  
##  C          0.0964 4.34e+09 0.00 1.0000  
##  D                 4.34e+09              
## 
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex2)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
#leave one out?
ols(Y ~ B + C + D, data = ex2) #still works
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               3    R2 adj   0.839    
##  d.f.  99996    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept  0.2021 0.0010  209.55 <0.0001 
##  B          0.1963 0.0017  114.69 <0.0001 
##  C         -0.3037 0.0014 -222.64 <0.0001 
##  D         -0.4002 0.0014 -293.10 <0.0001 
## 
ols(Y ~ A + B + C, data = ex2) #yep
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C, data = ex2)
##  
##                   Model Likelihood      Discrimination    
##                      Ratio Test            Indexes        
##  Obs   1e+05    LR chi2    182903.45    R2       0.839    
##  sigma0.1000    d.f.               3    R2 adj   0.839    
##  d.f.  99996    Pr(> chi2)    0.0000    g        0.247    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.4637077 -0.0671900 -0.0001714  0.0673360  0.4057392 
##  
##  
##            Coef    S.E.   t       Pr(>|t|)
##  Intercept -0.1981 0.0010 -204.97 <0.0001 
##  A          0.4002 0.0014  293.10 <0.0001 
##  B          0.5964 0.0014  436.47 <0.0001 
##  C          0.0964 0.0017   56.34 <0.0001 
## 
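
And the recovered betas agree whichever part is excluded. A quick check (b_noA and b_noD are my own names; the true betas are A = .2, B = .4, C = -.1, D = -.2):

#fold the intercept back in for both exclusions and compare
b_noA = coef(ols(Y ~ B + C + D, data = ex2))
b_noD = coef(ols(Y ~ A + B + C, data = ex2))
rbind(
  excl_A = c(A = unname(b_noA["Intercept"]), b_noA["Intercept"] + b_noA[-1]),
  excl_D = c(b_noD["Intercept"] + b_noD[-1], D = unname(b_noD["Intercept"]))
)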

Example 3

OK, what if we have groups of unequal size? Can we leave out a tiny group, or does the approach work best when the large one is excluded? Or does it make no difference?

#simulate data with groups of very unequal size
ex3 = tibble(
  A = rnorm(100e3, mean = 100),
  B = rnorm(100e3, mean = 10),
  C = rnorm(100e3, mean = 1),
  D = rnorm(100e3, mean = .1)
) %>% 
  make_compositional() %>% 
  as_tibble()

#looks right? A should dominate every row, with D contributing almost nothing
ex3 %>% head()
#simulate an outcome based on parts
#note the betas here
ex3 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)

#regressing on all parts at once fails yet again
ols(Y ~ A + B + C + D, data = ex3)
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            4    R2 adj   0.001    
##  d.f.  99995    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.     t    Pr(>|t|)
##  Intercept -0.2278 6.88e+10 0.00 1.0000  
##  A          0.4218 6.88e+10 0.00 1.0000  
##  B          0.6818 6.88e+10 0.00 1.0000  
##  C          0.1585 6.88e+10 0.00 1.0000  
##  D                 6.88e+10              
## 
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex3)
  #broom hack
  tidy(summary.lm(fit))[2, ]
})
#leave one out?
ols(Y ~ B + C + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ B + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.1940 0.0036 53.81 <0.0001 
##  B          0.2600 0.0388  6.70 <0.0001 
##  C         -0.2632 0.0450 -5.84 <0.0001 
##  D         -0.4218 0.0591 -7.14 <0.0001 
## 
ols(Y ~ A + C + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + C + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.4540 0.0353 12.87 <0.0001 
##  A         -0.2600 0.0388 -6.70 <0.0001 
##  C         -0.5232 0.0568 -9.22 <0.0001 
##  D         -0.6818 0.0683 -9.98 <0.0001 
## 
ols(Y ~ A + B + D, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + D, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -0.0693 0.0444 -1.56 0.1188  
##  A          0.2632 0.0450  5.84 <0.0001 
##  B          0.5232 0.0568  9.22 <0.0001 
##  D         -0.1585 0.0734 -2.16 0.0307  
## 
ols(Y ~ A + B + C, data = ex3) #right
## Linear Regression Model
##  
##  ols(formula = Y ~ A + B + C, data = ex3)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   1e+05    LR chi2    143.12    R2       0.001    
##  sigma0.1000    d.f.            3    R2 adj   0.001    
##  d.f.  99996    Pr(> chi2) 0.0000    g        0.004    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.5026101 -0.0672748 -0.0001804  0.0673603  0.5376472 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -0.2278 0.0585 -3.89 <0.0001 
##  A          0.4218 0.0591  7.14 <0.0001 
##  B          0.6818 0.0683  9.98 <0.0001 
##  C          0.1585 0.0734  2.16 0.0307  
## 

So the nature of the compositional data does not appear to matter much: one can seemingly always use the leave-one-out-as-reference approach.
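
The recipe is easy to wrap in a small helper. This is my own sketch, not a function from any of the loaded packages:

#fit a leave-one-out model and return betas for all parts,
#with the intercept folded back into the slopes
loo_betas = function(data, parts, outcome = "Y", leave_out = parts[1]) {
  rhs = setdiff(parts, leave_out)
  fit = ols(reformulate(rhs, response = outcome), data = data)
  b = coef(fit)
  set_names(c(b["Intercept"], b["Intercept"] + b[rhs]), c(leave_out, rhs))
}

#the choice of excluded part shouldn't matter
loo_betas(ex1, LETTERS[1:5])
loo_betas(ex3, LETTERS[1:4], leave_out = "D")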