Compositional data (parts that sum to 1 within each unit) are tricky to handle as predictors. Here I explore the issue with simulations and suggest solutions.
options(digits = 3)
library(pacman)
p_load(kirkegaard, rms, poliscidata, broom)
#helper: turn arbitrary numeric data into compositional data,
#i.e. nonnegative parts that sum to 1 within each row
make_compositional = function(x) {
  #take absolute values first, since negative parts are not valid proportions
  abs_x = abs(x)
  abs_x / rowSums(abs_x)
}
#simulate random, uncorrelated data: 100k rows of 5 parts
set.seed(1)
ex1 = rnorm(500e3) %>% matrix(ncol = 5) %>% set_colnames(LETTERS[1:5]) %>% make_compositional() %>% as_tibble()
#looks right? each row should sum to 1
ex1 %>% head() %>% {
  .$sum = rowSums(.)
  .
}
#cors of the parts should be slightly negative due to the sum constraint:
#about -1/(k-1) = -.25 for k = 5 exchangeable parts
ex1 %>% wtd.cors()
## A B C D E
## A 1.000 -0.247 -0.252 -0.251 -0.251
## B -0.247 1.000 -0.249 -0.251 -0.255
## C -0.252 -0.249 1.000 -0.251 -0.247
## D -0.251 -0.251 -0.251 1.000 -0.245
## E -0.251 -0.255 -0.247 -0.245 1.000
#simulate an outcome based on parts
#note the betas here
ex1 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + E * .1 + rnorm(100e3, sd = .1)
)
#regressing on all parts at once does not work: the parts sum to 1, so
#together with the intercept the design is perfectly collinear (note the absurd SEs)
ols(Y ~ A + B + C + D + E, data = ex1)
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C + D + E, data = ex1)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 42707.38 R2 0.348
## sigma0.1000 d.f. 5 R2 adj 0.348
## d.f. 99994 Pr(> chi2) 0.0000 g 0.083
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.111e-01 -6.743e-02 -9.678e-05 6.744e-02 4.553e-01
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0996 6.93e+10 0.00 1.0000
## A 0.0990 6.93e+10 0.00 1.0000
## B 0.3014 6.93e+10 0.00 1.0000
## C -0.1950 6.93e+10 0.00 1.0000
## D -0.3037 6.93e+10 0.00 1.0000
## E 6.93e+10
##
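The failure is exact collinearity, not estimation noise: every row's parts sum to 1, so the intercept column is a linear combination of the part columns. A quick check along these lines (a sketch, using base R's qr()):
#every row sums to 1, so intercept + 5 parts should give a design of rank 5, not 6
ex1[LETTERS[1:5]] %>% rowSums() %>% range()
qr(cbind(1, as.matrix(ex1[LETTERS[1:5]])))$rank #should report 5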
#fitting one part at a time produces biased estimates because the parts are correlated by construction
#these marginal estimates are somewhat off, despite their tiny SEs
map_df(LETTERS[1:5], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex1)
  #broom hack: tidy the lm-style summary and keep the slope row
  tidy(summary.lm(fit))[2, ]
})
#a simple solution is to leave one part out, which then serves as the reference
#the betas are then relative to the reference, but adding the intercept to each recovers the originals
#the intercept itself is the beta of the excluded part
#the beta for A is truly .2
ols(Y ~ B + C + D + E, data = ex1)
## Linear Regression Model
##
## ols(formula = Y ~ B + C + D + E, data = ex1)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 42707.38 R2 0.348
## sigma0.1000 d.f. 4 R2 adj 0.348
## d.f. 99995 Pr(> chi2) 0.0000 g 0.083
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.111e-01 -6.743e-02 -9.678e-05 6.744e-02 4.553e-01
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1986 0.0019 105.64 <0.0001
## B 0.2024 0.0029 69.00 <0.0001
## C -0.2940 0.0029 -100.33 <0.0001
## D -0.4027 0.0029 -137.33 <0.0001
## E -0.0990 0.0029 -33.73 <0.0001
##
#let's exclude another part: E, whose true beta is .1
#note also that the model fits (R2) are identical across choices of excluded part
ols(Y ~ A + B + C + D, data = ex1)
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C + D, data = ex1)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 42707.38 R2 0.348
## sigma0.1000 d.f. 4 R2 adj 0.348
## d.f. 99995 Pr(> chi2) 0.0000 g 0.083
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.111e-01 -6.743e-02 -9.678e-05 6.744e-02 4.553e-01
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0996 0.0019 52.83 <0.0001
## A 0.0990 0.0029 33.73 <0.0001
## B 0.3014 0.0029 102.92 <0.0001
## C -0.1950 0.0029 -66.34 <0.0001
## D -0.3037 0.0029 -103.17 <0.0001
##
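To make the bookkeeping explicit: the intercept is the excluded part's beta, and intercept plus coefficient gives each included part's beta. A minimal sketch of the recovery (assuming rms names the intercept "Intercept", as in the printouts above):
#recover the true betas from the fit that excludes A
b = coef(ols(Y ~ B + C + D + E, data = ex1))
c(A = unname(b["Intercept"]), b[-1] + b["Intercept"])
#should be close to c(.2, .4, -.1, -.2, .1)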
The above method works in most cases, but it can get into trouble when the proportions have certain relationships. It works best when there is a large group that is present everywhere and accounts for most of the variance in Y. Let's try violating that.
#simulate data without a single group being present everywhere
ex2 = c(rep(0, 50e3), rnorm(50e3),
        rep(0, 50e3), rnorm(50e3),
        rnorm(50e3), rep(0, 50e3),
        rnorm(50e3), rep(0, 50e3)) %>%
  matrix(ncol = 4) %>%
  make_compositional() %>%
  set_colnames(LETTERS[1:4]) %>%
  as_tibble()
#looks right? the first 50k rows vary only in C and D, the last 50k only in A and B
ex2 %>% head()
ex2 %>% tail()
#simulate an outcome based on parts
#note the betas here
ex2 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)
#regressing on all parts at once fails as before
ols(Y ~ A + B + C + D, data = ex2)
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C + D, data = ex2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 182903.45 R2 0.839
## sigma0.1000 d.f. 4 R2 adj 0.839
## d.f. 99995 Pr(> chi2) 0.0000 g 0.247
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.4637077 -0.0671900 -0.0001714 0.0673360 0.4057392
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1981 4.34e+09 0.00 1.0000
## A 0.4002 4.34e+09 0.00 1.0000
## B 0.5964 4.34e+09 0.00 1.0000
## C 0.0964 4.34e+09 0.00 1.0000
## D 4.34e+09
##
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex2)
  #broom hack: tidy the lm-style summary and keep the slope row
  tidy(summary.lm(fit))[2, ]
})
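The marginal fits fail here because the parts have a block structure: A and B are nonzero only in half the rows, C and D only in the other half, so each part's zeros act as a hidden group indicator. A quick way to see the structure (a sketch using the same wtd.cors() as above):
ex2 %>% select(A:D) %>% wtd.cors()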
#leave one out?
ols(Y ~ B + C + D, data = ex2) #still works
## Linear Regression Model
##
## ols(formula = Y ~ B + C + D, data = ex2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 182903.45 R2 0.839
## sigma0.1000 d.f. 3 R2 adj 0.839
## d.f. 99996 Pr(> chi2) 0.0000 g 0.247
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.4637077 -0.0671900 -0.0001714 0.0673360 0.4057392
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.2021 0.0010 209.55 <0.0001
## B 0.1963 0.0017 114.69 <0.0001
## C -0.3037 0.0014 -222.64 <0.0001
## D -0.4002 0.0014 -293.10 <0.0001
##
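The intercept trick still recovers the true betas, e.g. from the fit excluding A (a minimal sketch):
b = coef(ols(Y ~ B + C + D, data = ex2))
c(A = unname(b["Intercept"]), b[-1] + b["Intercept"])
#should be close to c(.2, .4, -.1, -.2)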
ols(Y ~ A + B + C, data = ex2) #yep
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C, data = ex2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 182903.45 R2 0.839
## sigma0.1000 d.f. 3 R2 adj 0.839
## d.f. 99996 Pr(> chi2) 0.0000 g 0.247
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.4637077 -0.0671900 -0.0001714 0.0673360 0.4057392
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1981 0.0010 -204.97 <0.0001
## A 0.4002 0.0014 293.10 <0.0001
## B 0.5964 0.0014 436.47 <0.0001
## C 0.0964 0.0017 56.34 <0.0001
##
OK, what if the groups have very unequal sizes? Can we leave out a tiny group, or does the approach work best when we leave out the large one? Or does it make no difference?
#simulate data with groups of very unequal average size
ex3 = tibble(
  A = rnorm(100e3, mean = 100),
  B = rnorm(100e3, mean = 10),
  C = rnorm(100e3, mean = 1),
  D = rnorm(100e3, mean = .1)
) %>%
  make_compositional() %>%
  as_tibble()
#looks right? A should make up almost all of each row
ex3 %>% head()
#simulate an outcome based on parts
#note the betas here
ex3 %<>% mutate(
  Y = A * .2 + B * .4 + C * -.1 + D * -.2 + rnorm(100e3, sd = .1)
)
#regressing on all parts at once fails as before
ols(Y ~ A + B + C + D, data = ex3)
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C + D, data = ex3)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 143.12 R2 0.001
## sigma0.1000 d.f. 4 R2 adj 0.001
## d.f. 99995 Pr(> chi2) 0.0000 g 0.004
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.5026101 -0.0672748 -0.0001804 0.0673603 0.5376472
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2278 6.88e+10 0.00 1.0000
## A 0.4218 6.88e+10 0.00 1.0000
## B 0.6818 6.88e+10 0.00 1.0000
## C 0.1585 6.88e+10 0.00 1.0000
## D 6.88e+10
##
#one at a time gives wrong results
map_df(LETTERS[1:4], function(x) {
  fit = ols(str_glue("Y ~ {x}") %>% as.formula(), data = ex3)
  #broom hack: tidy the lm-style summary and keep the slope row
  tidy(summary.lm(fit))[2, ]
})
#leave one out?
ols(Y ~ B + C + D, data = ex3) #right
## Linear Regression Model
##
## ols(formula = Y ~ B + C + D, data = ex3)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 143.12 R2 0.001
## sigma0.1000 d.f. 3 R2 adj 0.001
## d.f. 99996 Pr(> chi2) 0.0000 g 0.004
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.5026101 -0.0672748 -0.0001804 0.0673603 0.5376472
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1940 0.0036 53.81 <0.0001
## B 0.2600 0.0388 6.70 <0.0001
## C -0.2632 0.0450 -5.84 <0.0001
## D -0.4218 0.0591 -7.14 <0.0001
##
ols(Y ~ A + C + D, data = ex3) #right
## Linear Regression Model
##
## ols(formula = Y ~ A + C + D, data = ex3)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 143.12 R2 0.001
## sigma0.1000 d.f. 3 R2 adj 0.001
## d.f. 99996 Pr(> chi2) 0.0000 g 0.004
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.5026101 -0.0672748 -0.0001804 0.0673603 0.5376472
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.4540 0.0353 12.87 <0.0001
## A -0.2600 0.0388 -6.70 <0.0001
## C -0.5232 0.0568 -9.22 <0.0001
## D -0.6818 0.0683 -9.98 <0.0001
##
ols(Y ~ A + B + D, data = ex3) #right
## Linear Regression Model
##
## ols(formula = Y ~ A + B + D, data = ex3)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 143.12 R2 0.001
## sigma0.1000 d.f. 3 R2 adj 0.001
## d.f. 99996 Pr(> chi2) 0.0000 g 0.004
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.5026101 -0.0672748 -0.0001804 0.0673603 0.5376472
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0693 0.0444 -1.56 0.1188
## A 0.2632 0.0450 5.84 <0.0001
## B 0.5232 0.0568 9.22 <0.0001
## D -0.1585 0.0734 -2.16 0.0307
##
ols(Y ~ A + B + C, data = ex3) #right
## Linear Regression Model
##
## ols(formula = Y ~ A + B + C, data = ex3)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1e+05 LR chi2 143.12 R2 0.001
## sigma0.1000 d.f. 3 R2 adj 0.001
## d.f. 99996 Pr(> chi2) 0.0000 g 0.004
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.5026101 -0.0672748 -0.0001804 0.0673603 0.5376472
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2278 0.0585 -3.89 <0.0001
## A 0.4218 0.0591 7.14 <0.0001
## B 0.6818 0.0683 9.98 <0.0001
## C 0.1585 0.0734 2.16 0.0307
##
So the nature of the compositional data doesn't appear to matter much: one can seemingly always use the leave-one-out-as-reference approach, recovering the excluded part's beta from the intercept.
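If one does this a lot, the whole recipe fits in a small helper function. A sketch (fit_compositional is a hypothetical name, not from any package):
#hypothetical helper: fit the outcome on all parts except `ref`, then add the
#intercept back to each coefficient; the intercept itself is the excluded part's beta
fit_compositional = function(data, parts, ref, outcome = "Y") {
  preds = setdiff(parts, ref)
  b = coef(ols(reformulate(preds, response = outcome), data = data))
  setNames(c(b["Intercept"], b[-1] + b["Intercept"]), c(ref, preds))
}
#e.g., excluding D; should be close to c(-.2, .2, .4, -.1)
fit_compositional(ex3, LETTERS[1:4], ref = "D")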