Let’s build two post-asinh-transformed populations, using gaussian distribution. First population is off for the current channel, ie around zero. Second population is on, ie around 5. Dispersions are equal to 1.
set.seed(0)
s1 = rnorm(100000, sd = 1)
s2 = rnorm(100000, sd = 1, mean = 5)
The overall histogram.
len2 = round(100*length(s1)/100,0)
s.combined = c(s1, s2[1:len2])
hist(s.combined, breaks = 80)
Let’s center and standardize
s.combined.scaled = scale(s.combined)
hist(s.combined.scaled, breaks = 80)
Let’s describe the populations
cat("Combined population")
## Combined population
summary(s.combined.scaled)
## V1
## Min. :-2.59131
## 1st Qu.:-0.92712
## Median : 0.00404
## Mean : 0.00000
## 3rd Qu.: 0.92924
## Max. : 2.51679
sd(s.combined.scaled)
## [1] 1
cat("Population 1")
## Population 1
summary(s.combined.scaled[1:length(s1)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.5910 -1.1810 -0.9271 -0.9284 -0.6766 0.6832
sd(s.combined.scaled[1:length(s1)])
## [1] 0.3721837
cat("Population 2")
## Population 2
summary(s.combined.scaled[(1:len2)+length(s1)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.8376 0.6785 0.9292 0.9284 1.1770 2.5170
sd(s.combined.scaled[(1:len2)+length(s1)])
## [1] 0.3707199
Let’s glue the process and the summarization in a function
ff = function(percentage, draw.histo = TRUE) {
# Percentage to count
len2 = round(percentage*length(s1)/100,0)
print(len2)
# Combine
s.combined = c(s1, s2[1:len2])
# Histo of unscaled
if (draw.histo) {
hist(s.combined, breaks = 80,
main = sprintf("Histogram of unscaled combined (pct=%5.2f%%)", percentage))
}
# Scaling
s.combined.scaled = scale(s.combined)
# Histo of scaled
if (draw.histo) {
hist(s.combined.scaled, breaks = 80,
main = sprintf("Histogram of scaled combined (pct=%5.2f%%)", percentage))
}
# Build a data.frame to summarize
df = data.frame(combined = s.combined.scaled, pop1 = NA, pop2 = NA)
df$pop1[1:length(s1)] = s.combined.scaled[1:length(s1)]
df$pop2[(1:len2)+length(s1)] = s.combined.scaled[(1:len2)+length(s1)]
# Summarize, igonoring NA
summ = rbind(sapply(df, mean, na.rm = TRUE),
sapply(df, sd, na.rm = TRUE),
sapply(df, quantile, probs = c(0, .25, .5, .75, 1), na.rm = TRUE))
row.names(summ)[1:2] = c("Mean", "Sd")
# Return
round(summ, 3)
}
ff(100)
## [1] 1e+05
## combined pop1 pop2
## Mean 0.000 -0.928 0.928
## Sd 1.000 0.372 0.371
## 0% -2.591 -2.591 -0.838
## 25% -0.927 -1.181 0.678
## 50% 0.004 -0.927 0.929
## 75% 0.929 -0.677 1.177
## 100% 2.517 0.683 2.517
ff(25)
## [1] 25000
## combined pop1 pop2
## Mean 0.000 -0.447 1.789
## Sd 1.000 0.448 0.445
## 0% -2.449 -2.449 -0.338
## 25% -0.667 -0.752 1.489
## 50% -0.304 -0.446 1.790
## 75% 0.239 -0.144 2.087
## 100% 3.618 1.493 3.618
ff(10)
## [1] 10000
## combined pop1 pop2
## Mean 0.000 -0.260 2.596
## Sd 1.000 0.571 0.569
## 0% -2.813 -2.813 -0.120
## 25% -0.604 -0.648 2.215
## 50% -0.187 -0.258 2.597
## 75% 0.273 0.127 2.980
## 100% 4.815 2.215 4.815
ff(1)
## [1] 1000
## combined pop1 pop2
## Mean 0.000 -0.044 4.429
## Sd 1.000 0.897 0.880
## 0% -4.051 -4.051 1.897
## 25% -0.646 -0.654 3.830
## 50% -0.030 -0.041 4.407
## 75% 0.584 0.563 5.034
## 100% 7.919 3.839 7.919
ff(0.1)
## [1] 100
## combined pop1 pop2
## Mean 0.000 -0.005 4.968
## Sd 1.000 0.988 0.922
## 0% -4.418 -4.418 2.934
## 25% -0.675 -0.676 4.298
## 50% 0.000 -0.001 5.010
## 75% 0.666 0.663 5.619
## 100% 7.082 4.272 7.082
my.seq = c(100, 25, 10, 1, 0.1)
res = list()
for (i in my.seq) {
cat(i)
r = ff(i, draw.histo = FALSE)
print(r)
res = c(res, list(r))
}
## 100[1] 1e+05
## combined pop1 pop2
## Mean 0.000 -0.928 0.928
## Sd 1.000 0.372 0.371
## 0% -2.591 -2.591 -0.838
## 25% -0.927 -1.181 0.678
## 50% 0.004 -0.927 0.929
## 75% 0.929 -0.677 1.177
## 100% 2.517 0.683 2.517
## 25[1] 25000
## combined pop1 pop2
## Mean 0.000 -0.447 1.789
## Sd 1.000 0.448 0.445
## 0% -2.449 -2.449 -0.338
## 25% -0.667 -0.752 1.489
## 50% -0.304 -0.446 1.790
## 75% 0.239 -0.144 2.087
## 100% 3.618 1.493 3.618
## 10[1] 10000
## combined pop1 pop2
## Mean 0.000 -0.260 2.596
## Sd 1.000 0.571 0.569
## 0% -2.813 -2.813 -0.120
## 25% -0.604 -0.648 2.215
## 50% -0.187 -0.258 2.597
## 75% 0.273 0.127 2.980
## 100% 4.815 2.215 4.815
## 1[1] 1000
## combined pop1 pop2
## Mean 0.000 -0.044 4.429
## Sd 1.000 0.897 0.880
## 0% -4.051 -4.051 1.897
## 25% -0.646 -0.654 3.830
## 50% -0.030 -0.041 4.407
## 75% 0.584 0.563 5.034
## 100% 7.919 3.839 7.919
## 0.1[1] 100
## combined pop1 pop2
## Mean 0.000 -0.005 4.968
## Sd 1.000 0.988 0.922
## 0% -4.418 -4.418 2.934
## 25% -0.675 -0.676 4.298
## 50% 0.000 -0.001 5.010
## 75% 0.666 0.663 5.619
## 100% 7.082 4.272 7.082
plot(my.seq, sapply(res, function(r) r["Mean", "combined"]), col = "Blue3", ylim = c(-2, 6), xlab = "Percentage", ylab = "Mean", main = "Mean vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Mean", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Mean", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")
plot(my.seq, sapply(res, function(r) r["Sd", "combined"]), col = "Blue3", ylim = c(-0.5, 2.5), xlab = "Percentage", ylab = "Sd", main = "Dispersion vs percentage", pch = "o")
abline(h = 0, lty = 2)
points(my.seq, sapply(res, function(r) r["Sd", "pop1"]), col = "Green3", pch = "o")
points(my.seq, sapply(res, function(r) r["Sd", "pop2"]), col = "Red3", pch = "o")
legend("topright", colnames(res[[1]]), col = c("Blue3", "Green3", "Red3"), pch = "o")
plot(my.seq, sapply(res, function(r) r["Mean", "pop2"] - r["Mean", "pop1"]), col = "Orange3", ylim = c(-0.5, 7), xlab = "Percentage", ylab = "Diff.", main = "Difference of means vs percentage", pch = "o")
abline(h = 0, lty = 2)
abline(h = 5, lty = 2, lwd = 2, col = "dodgerblue") # True position
legend("topright", c("Mean2 - Mean1", "True diff.") , col = c("Orange3", "dodgerblue"), pch = c("o", NA), lwd = c(NA, 2), lty = c(NA, 2))
Alternatives scalings: