Initialization
Two populations of same abundance
Function
Abundance pop2 = 100%
Abundance pop2 = 25%
Abundance pop2 = 10%
Abundance pop2 = 1%
Abundance pop2 = 0.1%
Process over a range of percentages
Conclusions

Initialization

Let’s build two post-asinh-transformed populations, using gaussian distribution. First population is off for the current channel, ie around zero. Second population is on, ie around 5. Dispersions are equal to 1.

set.seed(0)
s1 = rnorm(100000, sd = 1)
s2 = rnorm(100000, sd = 1, mean = 5)

Two populations of same abundance

The overall histogram.

len2 = round(100*length(s1)/100,0)
s.combined = c(s1, s2[1:len2])
hist(s.combined, breaks = 80)

Let’s center and standardize

s.combined.scaled = scale(s.combined)
hist(s.combined.scaled, breaks = 80)

Let’s describe the populations

cat("Combined population")

## Combined population

summary(s.combined.scaled)

##        V1          
##  Min.   :-2.59131  
##  1st Qu.:-0.92712  
##  Median : 0.00404  
##  Mean   : 0.00000  
##  3rd Qu.: 0.92924  
##  Max.   : 2.51679

sd(s.combined.scaled)

## [1] 1

cat("Population 1")

## Population 1

summary(s.combined.scaled[1:length(s1)])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.5910 -1.1810 -0.9271 -0.9284 -0.6766  0.6832

sd(s.combined.scaled[1:length(s1)])

## [1] 0.3721837

cat("Population 2")

## Population 2

summary(s.combined.scaled[(1:len2)+length(s1)])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.8376  0.6785  0.9292  0.9284  1.1770  2.5170

sd(s.combined.scaled[(1:len2)+length(s1)])

## [1] 0.3707199

Function

Let’s glue the process and the summarization in a function

ff = function(percentage, draw.histo = TRUE) {
  # Percentage to count
  len2 = round(percentage*length(s1)/100,0)
  print(len2)
  # Combine
  s.combined = c(s1, s2[1:len2])
  # Histo of unscaled
  if (draw.histo) {
    hist(s.combined, breaks = 80, 
         main = sprintf("Histogram of unscaled combined (pct=%5.2f%%)", percentage))
  }
  # Scaling
  s.combined.scaled = scale(s.combined)
  # Histo of scaled
  if (draw.histo) {
    hist(s.combined.scaled, breaks = 80, 
         main = sprintf("Histogram of scaled combined (pct=%5.2f%%)", percentage))
  }
  # Build a data.frame to summarize
  df = data.frame(combined = s.combined.scaled, pop1 = NA, pop2 = NA)
  df$pop1[1:length(s1)] = s.combined.scaled[1:length(s1)]
  df$pop2[(1:len2)+length(s1)] = s.combined.scaled[(1:len2)+length(s1)]
  # Summarize, igonoring NA
  summ = rbind(sapply(df, mean, na.rm = TRUE), 
               sapply(df, sd, na.rm = TRUE), 
               sapply(df, quantile, probs = c(0, .25, .5, .75, 1), na.rm = TRUE))
  row.names(summ)[1:2] = c("Mean", "Sd")
  # Return
  round(summ, 3)
}

Abundance pop2 = 100%

ff(100)

## [1] 1e+05

##      combined   pop1   pop2
## Mean    0.000 -0.928  0.928
## Sd      1.000  0.372  0.371
## 0%     -2.591 -2.591 -0.838
## 25%    -0.927 -1.181  0.678
## 50%     0.004 -0.927  0.929
## 75%     0.929 -0.677  1.177
## 100%    2.517  0.683  2.517

Abundance pop2 = 25%

ff(25)

## [1] 25000

##      combined   pop1   pop2
## Mean    0.000 -0.447  1.789
## Sd      1.000  0.448  0.445
## 0%     -2.449 -2.449 -0.338
## 25%    -0.667 -0.752  1.489
## 50%    -0.304 -0.446  1.790
## 75%     0.239 -0.144  2.087
## 100%    3.618  1.493  3.618

Abundance pop2 = 10%

ff(10)

## [1] 10000

##      combined   pop1   pop2
## Mean    0.000 -0.260  2.596
## Sd      1.000  0.571  0.569
## 0%     -2.813 -2.813 -0.120
## 25%    -0.604 -0.648  2.215
## 50%    -0.187 -0.258  2.597
## 75%     0.273  0.127  2.980
## 100%    4.815  2.215  4.815

Abundance pop2 = 1%

ff(1)

## [1] 1000

##      combined   pop1  pop2
## Mean    0.000 -0.044 4.429
## Sd      1.000  0.897 0.880
## 0%     -4.051 -4.051 1.897
## 25%    -0.646 -0.654 3.830
## 50%    -0.030 -0.041 4.407
## 75%     0.584  0.563 5.034
## 100%    7.919  3.839 7.919

Abundance pop2 = 0.1%

ff(0.1)

## [1] 100

##      combined   pop1  pop2
## Mean    0.000 -0.005 4.968
## Sd      1.000  0.988 0.922
## 0%     -4.418 -4.418 2.934
## 25%    -0.675 -0.676 4.298
## 50%     0.000 -0.001 5.010
## 75%     0.666  0.663 5.619
## 100%    7.082  4.272 7.082

Process over a range of percentages

Process

my.seq = c(100, 25, 10, 1, 0.1)
res = list()
for (i in my.seq) {
  cat(i)
  r = ff(i, draw.histo = FALSE)
  print(r)
  res = c(res, list(r))
}

## 100[1] 1e+05
##      combined   pop1   pop2
## Mean    0.000 -0.928  0.928
## Sd      1.000  0.372  0.371
## 0%     -2.591 -2.591 -0.838
## 25%    -0.927 -1.181  0.678
## 50%     0.004 -0.927  0.929
## 75%     0.929 -0.677  1.177
## 100%    2.517  0.683  2.517
## 25[1] 25000
##      combined   pop1   pop2
## Mean    0.000 -0.447  1.789
## Sd      1.000  0.448  0.445
## 0%     -2.449 -2.449 -0.338
## 25%    -0.667 -0.752  1.489
## 50%    -0.304 -0.446  1.790
## 75%     0.239 -0.144  2.087
## 100%    3.618  1.493  3.618
## 10[1] 10000
##      combined   pop1   pop2
## Mean    0.000 -0.260  2.596
## Sd      1.000  0.571  0.569
## 0%     -2.813 -2.813 -0.120
## 25%    -0.604 -0.648  2.215
## 50%    -0.187 -0.258  2.597
## 75%     0.273  0.127  2.980
## 100%    4.815  2.215  4.815
## 1[1] 1000
##      combined   pop1  pop2
## Mean    0.000 -0.044 4.429
## Sd      1.000  0.897 0.880
## 0%     -4.051 -4.051 1.897
## 25%    -0.646 -0.654 3.830
## 50%    -0.030 -0.041 4.407
## 75%     0.584  0.563 5.034
## 100%    7.919  3.839 7.919
## 0.1[1] 100
##      combined   pop1  pop2
## Mean    0.000 -0.005 4.968
## Sd      1.000  0.988 0.922
## 0%     -4.418 -4.418 2.934
## 25%    -0.675 -0.676 4.298
## 50%     0.000 -0.001 5.010
## 75%     0.666  0.663 5.619
## 100%    7.082  4.272 7.082

Mean as a function of percentage

plot(my.seq, sapply(res, function(r) r["Mean", "combined"]), col = "Blue3", ylim = c(-2, 6), xlab = "Percentage", ylab = "Mean", main = "Mean vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Mean", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Mean", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")

Dispersion as a function of percentage

plot(my.seq, sapply(res, function(r) r["Sd", "combined"]), col = "Blue3", ylim = c(-0.5, 2.5), xlab = "Percentage", ylab = "Sd", main = "Dispersion vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Sd", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Sd", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")

Distance as a function of percentage

plot(my.seq, sapply(res, function(r) r["Mean", "pop2"] - r["Mean", "pop1"]), col = "Orange3", ylim = c(-0.5, 7), xlab = "Percentage", ylab = "Diff.", main = "Difference of means vs percentage", pch = "o")
abline(h = 0, lty = 2)
abline(h = 5, lty = 2, lwd = 2, col = "dodgerblue")  # True position
legend("topright", c("Mean2 - Mean1", "True diff.") , col = c("Orange3", "dodgerblue"), pch = c("o", NA), lwd = c(NA, 2), lty = c(NA, 2))

Conclusions

When the observed population consists in two populations (ie from 100/100 down to 100/10), the dispersion of each is clearly under-estimated.
Because the dispersion acts as the scaling coefficent, the euclidean distance of the two populations is also changing accordingly.
Thus the distance between is changing according to the relative of the two populations.
It sounds like the clustering will be driven by channels that shows the highest difference in relative abundance. This could be interesting maybe, because it lowers un-informative channels.

Alternatives scalings:

None
User’s skills
Model mixture estimation
Density down-sampling a la SPADE

Centering and standardizing process

Cibi - CRCM - S. Granjeaud

16 November 2015