Initialization

Let’s build two post-asinh-transformed populations, using gaussian distribution. First population is off for the current channel, ie around zero. Second population is on, ie around 5. Dispersions are equal to 1.

set.seed(0)
s1 = rnorm(100000, sd = 1)
s2 = rnorm(100000, sd = 1, mean = 5)

Two populations of same abundance

The overall histogram.

len2 = round(100*length(s1)/100,0)
s.combined = c(s1, s2[1:len2])
hist(s.combined, breaks = 80)

Let’s center and standardize

s.combined.scaled = scale(s.combined)
hist(s.combined.scaled, breaks = 80)

Let’s describe the populations

cat("Combined population")
## Combined population
summary(s.combined.scaled)
##        V1          
##  Min.   :-2.59131  
##  1st Qu.:-0.92712  
##  Median : 0.00404  
##  Mean   : 0.00000  
##  3rd Qu.: 0.92924  
##  Max.   : 2.51679
sd(s.combined.scaled)
## [1] 1
cat("Population 1")
## Population 1
summary(s.combined.scaled[1:length(s1)])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.5910 -1.1810 -0.9271 -0.9284 -0.6766  0.6832
sd(s.combined.scaled[1:length(s1)])
## [1] 0.3721837
cat("Population 2")
## Population 2
summary(s.combined.scaled[(1:len2)+length(s1)])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.8376  0.6785  0.9292  0.9284  1.1770  2.5170
sd(s.combined.scaled[(1:len2)+length(s1)])
## [1] 0.3707199

Function

Let’s glue the process and the summarization in a function

ff = function(percentage, draw.histo = TRUE) {
  # Percentage to count
  len2 = round(percentage*length(s1)/100,0)
  print(len2)
  # Combine
  s.combined = c(s1, s2[1:len2])
  # Histo of unscaled
  if (draw.histo) {
    hist(s.combined, breaks = 80, 
         main = sprintf("Histogram of unscaled combined (pct=%5.2f%%)", percentage))
  }
  # Scaling
  s.combined.scaled = scale(s.combined)
  # Histo of scaled
  if (draw.histo) {
    hist(s.combined.scaled, breaks = 80, 
         main = sprintf("Histogram of scaled combined (pct=%5.2f%%)", percentage))
  }
  # Build a data.frame to summarize
  df = data.frame(combined = s.combined.scaled, pop1 = NA, pop2 = NA)
  df$pop1[1:length(s1)] = s.combined.scaled[1:length(s1)]
  df$pop2[(1:len2)+length(s1)] = s.combined.scaled[(1:len2)+length(s1)]
  # Summarize, igonoring NA
  summ = rbind(sapply(df, mean, na.rm = TRUE), 
               sapply(df, sd, na.rm = TRUE), 
               sapply(df, quantile, probs = c(0, .25, .5, .75, 1), na.rm = TRUE))
  row.names(summ)[1:2] = c("Mean", "Sd")
  # Return
  round(summ, 3)
}

Abundance pop2 = 100%

ff(100)
## [1] 1e+05

##      combined   pop1   pop2
## Mean    0.000 -0.928  0.928
## Sd      1.000  0.372  0.371
## 0%     -2.591 -2.591 -0.838
## 25%    -0.927 -1.181  0.678
## 50%     0.004 -0.927  0.929
## 75%     0.929 -0.677  1.177
## 100%    2.517  0.683  2.517

Abundance pop2 = 25%

ff(25)
## [1] 25000

##      combined   pop1   pop2
## Mean    0.000 -0.447  1.789
## Sd      1.000  0.448  0.445
## 0%     -2.449 -2.449 -0.338
## 25%    -0.667 -0.752  1.489
## 50%    -0.304 -0.446  1.790
## 75%     0.239 -0.144  2.087
## 100%    3.618  1.493  3.618

Abundance pop2 = 10%

ff(10)
## [1] 10000

##      combined   pop1   pop2
## Mean    0.000 -0.260  2.596
## Sd      1.000  0.571  0.569
## 0%     -2.813 -2.813 -0.120
## 25%    -0.604 -0.648  2.215
## 50%    -0.187 -0.258  2.597
## 75%     0.273  0.127  2.980
## 100%    4.815  2.215  4.815

Abundance pop2 = 1%

ff(1)
## [1] 1000

##      combined   pop1  pop2
## Mean    0.000 -0.044 4.429
## Sd      1.000  0.897 0.880
## 0%     -4.051 -4.051 1.897
## 25%    -0.646 -0.654 3.830
## 50%    -0.030 -0.041 4.407
## 75%     0.584  0.563 5.034
## 100%    7.919  3.839 7.919

Abundance pop2 = 0.1%

ff(0.1)
## [1] 100

##      combined   pop1  pop2
## Mean    0.000 -0.005 4.968
## Sd      1.000  0.988 0.922
## 0%     -4.418 -4.418 2.934
## 25%    -0.675 -0.676 4.298
## 50%     0.000 -0.001 5.010
## 75%     0.666  0.663 5.619
## 100%    7.082  4.272 7.082

Process over a range of percentages

Process

my.seq = c(100, 25, 10, 1, 0.1)
res = list()
for (i in my.seq) {
  cat(i)
  r = ff(i, draw.histo = FALSE)
  print(r)
  res = c(res, list(r))
}
## 100[1] 1e+05
##      combined   pop1   pop2
## Mean    0.000 -0.928  0.928
## Sd      1.000  0.372  0.371
## 0%     -2.591 -2.591 -0.838
## 25%    -0.927 -1.181  0.678
## 50%     0.004 -0.927  0.929
## 75%     0.929 -0.677  1.177
## 100%    2.517  0.683  2.517
## 25[1] 25000
##      combined   pop1   pop2
## Mean    0.000 -0.447  1.789
## Sd      1.000  0.448  0.445
## 0%     -2.449 -2.449 -0.338
## 25%    -0.667 -0.752  1.489
## 50%    -0.304 -0.446  1.790
## 75%     0.239 -0.144  2.087
## 100%    3.618  1.493  3.618
## 10[1] 10000
##      combined   pop1   pop2
## Mean    0.000 -0.260  2.596
## Sd      1.000  0.571  0.569
## 0%     -2.813 -2.813 -0.120
## 25%    -0.604 -0.648  2.215
## 50%    -0.187 -0.258  2.597
## 75%     0.273  0.127  2.980
## 100%    4.815  2.215  4.815
## 1[1] 1000
##      combined   pop1  pop2
## Mean    0.000 -0.044 4.429
## Sd      1.000  0.897 0.880
## 0%     -4.051 -4.051 1.897
## 25%    -0.646 -0.654 3.830
## 50%    -0.030 -0.041 4.407
## 75%     0.584  0.563 5.034
## 100%    7.919  3.839 7.919
## 0.1[1] 100
##      combined   pop1  pop2
## Mean    0.000 -0.005 4.968
## Sd      1.000  0.988 0.922
## 0%     -4.418 -4.418 2.934
## 25%    -0.675 -0.676 4.298
## 50%     0.000 -0.001 5.010
## 75%     0.666  0.663 5.619
## 100%    7.082  4.272 7.082

Mean as a function of percentage

plot(my.seq, sapply(res, function(r) r["Mean", "combined"]), col = "Blue3", ylim = c(-2, 6), xlab = "Percentage", ylab = "Mean", main = "Mean vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Mean", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Mean", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")

Dispersion as a function of percentage

plot(my.seq, sapply(res, function(r) r["Sd", "combined"]), col = "Blue3", ylim = c(-0.5, 2.5), xlab = "Percentage", ylab = "Sd", main = "Dispersion vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Sd", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Sd", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")

Distance as a function of percentage

plot(my.seq, sapply(res, function(r) r["Mean", "pop2"] - r["Mean", "pop1"]), col = "Orange3", ylim = c(-0.5, 7), xlab = "Percentage", ylab = "Diff.", main = "Difference of means vs percentage", pch = "o")
abline(h = 0, lty = 2)
abline(h = 5, lty = 2, lwd = 2, col = "dodgerblue")  # True position
legend("topright", c("Mean2 - Mean1", "True diff.") , col = c("Orange3", "dodgerblue"), pch = c("o", NA), lwd = c(NA, 2), lty = c(NA, 2))

Conclusions

Alternatives scalings: