A simple write-up to test normality of recorded phenotypes for new Pareto Front metrics - including the gravity (G) component. The measurements were calculated by Arjun Chandrasekhar, and analyzed by Magda Julkowska as a part of NSF IOS funded research.
pheno <- read.table("trait.optimal.alpha.C.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.alpha.C.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 0.3925000 0.7250000 0.2816667 0.4550000 0.1437500 0.8300000 0.3883333
## [8] 0.4750000 0.4587500 0.6166667 0.6350000 0.3587500 0.1250000 0.6000000
## [15] 0.7950000 0.3916667 0.6612500 0.5983333 0.5266667 0.6983333 0.2333333
## [22] 0.2150000 0.6083333 0.7316667 0.7600000 0.5975000 0.5750000 0.4550000
## [29] 0.6500000 0.5083333 0.1887500 0.6842857 0.6583333 0.1975000 0.7125000
## [36] 0.6500000 0.5975000 0.5625000 0.5180000 0.6775000 0.6000000 0.5612500
## [43] 0.4433333 0.3262500 0.6900000 0.6533333 0.7362500 0.4320000 0.6900000
## [50] 0.6562500 0.3687500 0.4687500 0.2683333 0.2212500 0.3162500 0.4375000
## [57] 0.6725000 0.7333333 0.5083333 0.6833333 0.3762500 0.6412500 0.8250000
## [64] 0.4187500 0.4750000 0.3400000 0.2890000 0.5390000 0.3710000 0.8140000
## [71] 0.4190000 0.6270000 0.5260000 0.7250000 0.1970000 0.6675000 0.6866667
## [78] 0.5866667 0.4287500 0.4050000 0.7012500 0.6250000 0.6566667 0.3833333
## [85] 0.5350000 0.4762500 0.3225000 0.4400000 0.8333333 0.5933333 0.7037500
## [92] 0.4216667 0.4425000 0.3750000 0.2750000 0.3750000 0.6800000 0.4487500
## [99] 0.5037500 0.3625000 0.4125000 0.8150000 0.5583333 0.4600000 0.4637500
## [106] 0.4983333 0.7183333 0.3850000 0.6025000 0.3550000 0.3150000 0.6933333
## [113] 0.6416667 0.5750000 0.5462500 0.2250000 0.1650000 0.3500000 0.6250000
## [120] 0.4375000 0.5650000 0.4883333 0.6250000 0.6100000 0.6237500 0.7312500
## [127] 0.3700000 0.6500000 0.4250000 0.7500000 0.5150000 0.2916667 0.1416667
## [134] 0.1166667 0.5816667 0.5583333 0.4764286 0.4233333 0.6025000 0.3883333
## [141] 0.6537500 0.7250000 0.1916667 0.6533333 0.4883333 0.6466667 0.5500000
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.97417, p-value = 0.007089
Anderson-Darling (good power for many departures from normality)
ad <- nortest::ad.test(y)
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 1.0484, p-value = 0.009076
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: -0.341
## Kurtosis: 2.389 (normal ~ 3)
## Jarque-Bera: 5.133, p=0.0768
pheno <- read.table("trait.optimal.alpha.S.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.alpha.S.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 0.2500000 0.7250000 0.1933333 0.7362500 0.0600000 0.5400000 0.3083333
## [8] 0.2500000 0.4666667 0.6166667 0.7483333 0.2875000 0.0000000 0.4883333
## [15] 0.5712500 0.2750000 0.4625000 0.5000000 0.5975000 0.3750000 0.3400000
## [22] 0.1790000 0.4833333 0.4950000 0.2600000 0.2375000 0.3875000 0.3730000
## [29] 0.6950000 0.1000000 0.3500000 0.6800000 0.6300000 0.8250000 0.5400000
## [36] 0.5733333 0.7925000 0.7375000 0.3850000 0.6250000 0.7150000 0.1250000
## [43] 0.7000000 0.0512500 0.8650000 0.5950000 0.6212500 0.1680000 0.5070000
## [50] 0.5150000 0.3516667 0.4312500 0.2816667 0.1125000 0.5125000 0.3425000
## [57] 0.6500000 0.5662500 0.4650000 0.4125000 0.2562500 0.5912500 0.6000000
## [64] 0.1250000 0.4125000 0.3400000 0.4280000 0.5130000 0.2890000 0.5150000
## [71] 0.3660000 0.3530000 0.3270000 0.4687500 0.2410000 0.4037500 0.7116667
## [78] 0.5833333 0.2475000 0.6487500 0.8350000 0.4675000 0.3750000 0.5350000
## [85] 0.5666667 0.4625000 0.6375000 0.1000000 0.6583333 0.2166667 0.5700000
## [92] 0.5150000 0.0537500 0.6750000 0.1200000 0.4033333 0.6650000 0.4625000
## [99] 0.4625000 0.5100000 0.6187500 0.5150000 0.1566667 0.6737500 0.2987500
## [106] 0.5416667 0.4716667 0.5383333 0.6050000 0.2800000 0.4475000 0.6250000
## [113] 0.5333333 0.5212500 0.3583333 0.2000000 0.5816667 0.5000000 0.5450000
## [120] 0.3266667 0.5250000 0.4820000 0.5716667 0.4337500 0.7166667 0.5816667
## [127] 0.2250000 0.3066667 0.0750000 0.5250000 0.3350000 0.3400000 0.3375000
## [134] 0.4500000 0.2250000 0.5083333 0.4000000 0.2316667 0.3366667 0.5350000
## [141] 0.3516667 0.2516667 0.4766667 0.2750000 0.6666667 0.5750000
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.98771, p-value = 0.2245
Anderson-Darling (good power for many departures from normality)
#ad <-
nortest::ad.test(y)
##
## Anderson-Darling normality test
##
## data: y
## A = 0.53801, p-value = 0.1653
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 1.0484, p-value = 0.009076
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: -0.195
## Kurtosis: 2.472 (normal ~ 3)
## Jarque-Bera: 2.623, p=0.269
pheno <- read.table("trait.optimal.G.C.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.G.C.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 7.500000e-02 1.000000e-01 3.333333e-02 -5.000000e-02 2.375000e-01
## [6] 1.125000e-01 0.000000e+00 2.500000e-01 1.375000e-01 8.333333e-02
## [11] 8.333333e-02 1.250000e-02 1.750000e-01 0.000000e+00 -2.500000e-02
## [16] 1.333333e-01 -1.250000e-01 1.666667e-02 -3.333333e-02 1.833333e-01
## [21] 6.666667e-02 0.000000e+00 -8.333333e-02 1.666667e-02 2.000000e-02
## [26] -2.500000e-01 3.000000e-01 8.000000e-02 -2.000000e-01 8.333333e-02
## [31] 2.125000e-01 2.142857e-01 1.000000e-01 2.250000e-01 1.250000e-01
## [36] 3.333333e-02 7.500000e-02 1.750000e-01 5.000000e-02 1.125000e-01
## [41] -2.000000e-01 3.750000e-02 -5.000000e-02 1.000000e-01 1.250000e-02
## [46] 0.000000e+00 1.875000e-01 1.500000e-01 1.100000e-01 1.250000e-01
## [51] -1.250000e-02 -7.500000e-02 1.166667e-01 -8.750000e-02 -5.000000e-02
## [56] 1.375000e-01 0.000000e+00 1.666667e-02 8.333333e-02 -1.666667e-02
## [61] 2.500000e-02 3.750000e-02 -2.000000e-01 1.250000e-01 4.000000e-02
## [66] 1.333333e-01 1.100000e-01 1.000000e-01 2.000000e-02 5.000000e-02
## [71] 2.400000e-01 1.000000e-01 -1.000000e-02 -7.500000e-02 -2.000000e-02
## [76] 5.000000e-02 1.000000e-01 -6.666667e-02 3.750000e-02 -3.750000e-02
## [81] -1.125000e-01 2.500000e-02 8.333333e-02 -3.333333e-02 1.333333e-01
## [86] 0.000000e+00 5.000000e-02 2.166667e-01 -1.666667e-02 -1.666667e-02
## [91] 8.750000e-02 1.833333e-01 1.250000e-01 -4.000000e-01 1.333333e-01
## [96] 2.166667e-01 1.166667e-01 -1.250000e-02 3.750000e-02 2.500000e-02
## [101] -1.000000e-01 2.666667e-01 1.833333e-01 -3.750000e-02 7.500000e-02
## [106] -6.666667e-02 0.000000e+00 0.000000e+00 -5.000000e-02 6.666667e-02
## [111] -1.250000e-02 5.000000e-02 3.333333e-02 -5.000000e-02 2.375000e-01
## [116] 1.166667e-01 2.000000e-01 -2.500000e-01 -1.000000e-01 5.000000e-02
## [121] -1.000000e-01 -1.000000e-01 1.000000e-01 7.000000e-02 -3.750000e-02
## [126] 5.000000e-02 8.333333e-02 -5.000000e-02 -5.000000e-02 -5.000000e-02
## [131] 1.333333e-01 2.000000e-01 6.666667e-02 1.166667e-01 6.666667e-02
## [136] 3.333333e-02 3.357143e-01 1.833333e-01 -2.500000e-02 8.333333e-02
## [141] -2.500000e-02 9.250000e-18 -2.833333e-01 -1.666667e-02 -3.000000e-01
## [146] 3.333333e-02 -6.250000e-02
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.9712, p-value = 0.003472
Anderson-Darling (good power for many departures from normality)
ad <- nortest::ad.test(y)
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 0.93301, p-value = 0.01752
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: -0.563
## Kurtosis: 4.337 (normal ~ 3)
## Jarque-Bera: 18.703, p=8.68e-05
pheno <- read.table("trait.optimal.G.S.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.G.S.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 0.03750000 0.35000000 -0.23333333 -0.17500000 0.06250000 -0.06250000
## [7] 0.06666667 0.10000000 0.03333333 0.26666667 0.13333333 0.16250000
## [13] 0.67500000 0.06666667 -0.05000000 0.33333333 0.01250000 0.18333333
## [19] 0.00000000 -1.10000000 0.06666667 -0.27000000 0.05000000 0.06000000
## [25] 0.18000000 0.87500000 0.12500000 0.35000000 -0.05000000 0.51250000
## [31] 0.20000000 0.16000000 0.38333333 1.10000000 0.18333333 0.31666667
## [37] 0.27500000 -0.55000000 0.08000000 -0.01250000 0.05000000 -0.08750000
## [43] 0.15000000 0.02500000 -0.03333333 0.13750000 -0.08750000 0.08000000
## [49] 0.25000000 -0.05000000 0.40000000 -0.03750000 -0.05000000 0.02500000
## [55] 0.05000000 0.13750000 -0.08333333 0.22500000 0.40000000 -0.05000000
## [61] 0.05000000 0.17500000 0.15000000 0.71250000 0.27500000 0.08000000
## [67] -0.02000000 0.22000000 0.02000000 0.02000000 -0.11000000 -0.01000000
## [73] 0.08000000 -0.11250000 -0.31000000 0.08750000 0.21666667 0.36666667
## [79] -0.11250000 0.18750000 0.20000000 0.08750000 0.30000000 0.00000000
## [85] 0.05000000 0.06250000 0.35000000 0.03333333 0.16666667 0.01666667
## [91] -0.06250000 0.63333333 0.23750000 0.05000000 0.60000000 0.05000000
## [97] -0.38333333 0.12500000 -0.23750000 0.17500000 -0.03750000 -0.08750000
## [103] 0.40000000 0.56250000 0.02500000 0.71666667 0.25000000 -0.05000000
## [109] 0.10000000 -0.01666667 -0.11250000 0.05000000 0.11666667 0.18750000
## [115] -0.33333333 0.06666667 -0.15000000 0.00000000 -0.10000000 0.20000000
## [121] -1.05000000 -0.30000000 0.00000000 0.07500000 -0.51666667 -0.28333333
## [127] 0.22500000 0.06666667 0.40000000 0.05000000 0.13333333 0.23333333
## [133] 0.07500000 0.00000000 0.01666667 0.42500000 -0.05000000 -0.16666667
## [139] 0.05000000 0.15000000 -0.01666667 0.06666667 0.41666667 0.45000000
## [145] -0.01666667 -0.12500000
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.90677, p-value = 4.53e-08
Anderson-Darling (good power for many departures from normality)
#ad <-
nortest::ad.test(y)
##
## Anderson-Darling normality test
##
## data: y
## A = 3.7265, p-value = 2.439e-09
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 0.93301, p-value = 0.01752
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: -0.373
## Kurtosis: 7.696 (normal ~ 3)
## Jarque-Bera: 137.519, p=0
pheno <- read.table("trait.optimal.pt.dist.C.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.pt.dist.C.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 108.138160 30.141398 66.372139 10.435085 1351.407706 12.345047
## [7] 35.431202 202.653210 19.696191 3.511861 13.078911 174.571749
## [13] 1746.081766 41.954239 10.513659 41.842746 12.609847 14.857870
## [19] 47.714177 1.252295 422.416490 1420.195928 17.390426 6.041013
## [25] 26.367183 2.291419 79.137011 21.597793 4.072800 458.809686
## [31] 27.783039 14.600634 35.330620 62.133809 25.773820 19.951493
## [37] 8.184837 65.566886 126.932776 103.110046 5.951883 13.261498
## [43] 22.424597 302.094433 13.276339 32.301668 21.050471 78.107556
## [49] 32.787928 6.739327 213.671071 33.846197 66.605535 35.065179
## [55] 214.375047 114.164536 25.411503 69.535953 870.663103 463.667339
## [61] 90.064599 45.265395 4.574525 27.591675 6.654747 143.806478
## [67] 133.260335 69.941606 71.175888 22.593823 177.733739 65.248137
## [73] 84.830800 28.096349 80.502093 74.041671 20.255109 122.890187
## [79] 15.229126 119.104105 18.918267 53.326051 663.756858 39.850541
## [85] 27.646989 58.156664 21.218437 1231.848504 11.931623 271.210669
## [91] 9.470194 43.388089 10.110317 3.861678 139.709180 231.697644
## [97] 26.736405 72.422540 16.018204 31.049073 18.850080 16.926488
## [103] 145.690269 35.222961 136.475813 18.035863 36.009279 199.313220
## [109] 10.530507 167.508743 10.585593 54.025920 27.038711 37.070434
## [115] 68.215859 169.825965 134.311859 244.150570 16.610092 52.215851
## [121] 2.418298 334.966561 29.275482 44.131412 43.487522 54.381604
## [127] 61.019376 38.627332 73.804338 16.832953 59.352052 72.976966
## [133] 18.949380 384.182359 10.923122 27.649444 40.092263 15.557702
## [139] 29.237500 9.664503 25.794224 103.925174 95.474830 20.340701
## [145] 7.878581 18.282450 16.664259
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.42921, p-value < 2.2e-16
Anderson-Darling (good power for many departures from normality)
ad <- nortest::ad.test(y)
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 28.298, p-value < 2.2e-16
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 4.392
## Kurtosis: 23.626 (normal ~ 3)
## Jarque-Bera: 3078.210, p=0
pheno <- read.table("trait.optimal.pt.dist.S.tfam_aligned.txt")
pheno$V3[pheno$V3 == -9] <- NA
write.table(pheno, "trait.optimal.pt.dist.S.tfam_aligned_corr.txt", row.names = F)
y <- pheno$V3
y <- y[is.finite(y)] # drop NA/NaN/Inf
y
## [1] 2.148712e+00 1.082508e+00 4.073213e+01 3.235350e+00 1.524937e+02
## [6] 2.421165e+00 6.657706e+01 1.247843e+01 2.401958e-01 1.903473e-01
## [11] 7.920330e-01 7.858670e+01 1.677544e+02 1.808478e+01 5.236054e-01
## [16] 3.747291e+01 2.781654e+00 7.638735e-01 3.268116e+00 1.520000e-04
## [21] 6.431960e+02 1.711250e+03 1.226688e+00 3.263415e+00 4.470739e+00
## [26] 6.639455e-01 2.204882e+00 7.521949e-01 7.291830e-01 1.870656e+01
## [31] 6.222517e-01 8.306098e-01 1.821510e-01 1.515020e-01 2.962203e-01
## [36] 8.912023e+00 5.028825e-02 2.024666e+01 6.099293e+00 1.015679e+01
## [41] 1.320005e-01 2.219239e+00 3.986750e-03 2.089759e+01 3.915172e-01
## [46] 1.025288e+00 1.224009e+02 1.644496e+01 1.121709e+01 1.557262e+00
## [51] 2.038223e+01 1.014905e+01 9.322021e+01 3.410802e+01 9.924297e+00
## [56] 4.869872e+01 9.815235e-01 8.520413e+00 1.882276e+01 8.842347e+01
## [61] 1.231976e+01 1.298385e+01 5.222550e-02 1.329832e+01 2.755623e-01
## [66] 1.405821e+01 1.884315e+00 1.884438e+00 1.272932e+02 3.091614e+00
## [71] 8.654340e+00 1.415860e+02 1.570090e+01 1.668601e+00 1.254543e+02
## [76] 2.152426e+01 3.390944e+00 1.096712e+02 1.937047e+01 3.643801e+00
## [81] 9.131500e-03 2.138184e+00 1.450974e+01 6.490710e-01 3.343057e-01
## [86] 6.933774e+00 1.449108e-01 1.233285e+01 4.950908e-01 6.262271e+01
## [91] 9.318416e+00 2.654571e+00 3.379163e+01 2.499640e+00 2.126328e-01
## [96] 1.223865e+02 4.003100e-02 1.726352e+01 7.392477e-01 3.449403e-01
## [101] 1.985507e+00 2.420048e+00 4.742411e+01 6.599089e-01 1.721630e+01
## [106] 4.689806e+00 2.582590e+00 6.705422e+01 1.480298e+00 3.191254e+01
## [111] 1.988619e+00 1.226426e+01 4.221593e+00 1.280620e+01 3.286986e+02
## [116] 2.432353e+00 3.462559e+00 2.110248e+01 1.393420e+00 1.483306e+01
## [121] 2.830000e-04 1.827898e+00 1.261423e+00 3.642247e+00 7.994826e+01
## [126] 1.024490e+00 1.468993e+01 1.101423e-01 4.546213e+01 7.983970e-01
## [131] 3.779305e+00 5.771817e+00 1.179780e+01 1.325290e+00 1.748261e+00
## [136] 4.050743e-01 2.842478e+00 1.861722e+01 4.107853e-01 2.004206e+00
## [141] 6.708759e+00 3.062644e+00 2.282958e-01 1.543885e-01 2.937988e+00
## [146] 1.810371e+00
First - let’s do some visuals:
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
Now - let’s get some numbers:
Shapiro-Wilk (recommended for n <= 5000)
sh <- shapiro.test(y)
print(sh)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.20557, p-value < 2.2e-16
Anderson-Darling (good power for many departures from normality)
#ad <-
nortest::ad.test(y)
##
## Anderson-Darling normality test
##
## data: y
## A = 38.396, p-value < 2.2e-16
print(ad)
##
## Anderson-Darling normality test
##
## data: y
## A = 28.298, p-value < 2.2e-16
D’Agostino-Pearson / Jarque-Bera style (skewness + kurtosis)
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 9.265
## Kurtosis: 97.271 (normal ~ 3)
## Jarque-Bera: 56151.104, p=0
OK now we have established which traits need normalization - let’s do that.
First - let’s set up some helper functions:
is_num <- function(x) is.numeric(x) || is.integer(x)
## Rank-based inverse normal transformation (Blom’s formula by default)
INT <- function(x, ties.method = "average") {
x <- as.numeric(x)
ok <- is.finite(x)
out <- rep(NA_real_, length(x))
r <- rank(x[ok], ties.method = ties.method)
n <- length(r)
out[ok] <- qnorm((r - 3/8) / (n + 1/4))
out
}
## Shift to make values strictly positive (needed for log/sqrt/BoxCox)
shift_to_positive <- function(x, eps = 1e-6) {
x <- as.numeric(x)
m <- min(x, na.rm = TRUE)
if (!is.finite(m)) return(x)
if (m <= 0) x <- x + (-m) + eps
x
}
stopifnot(is_num(y))
pheno <- read.table("trait.optimal.alpha.C.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: -0.000
## Kurtosis: 2.819 (normal ~ 3)
## Jarque-Bera: 0.201, p=0.905
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.alpha.C.al.cor.transformed.txt")
pheno <- read.table("trait.optimal.alpha.S.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 0.000
## Kurtosis: 2.818 (normal ~ 3)
## Jarque-Bera: 0.201, p=0.904
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.alpha.S.al.cor.transformed.txt")
pheno <- read.table("trait.optimal.G.C.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 0.002
## Kurtosis: 2.820 (normal ~ 3)
## Jarque-Bera: 0.199, p=0.905
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.G.C.al.cor.transformed.txt")
pheno <- read.table("trait.optimal.G.S.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 0.000
## Kurtosis: 2.820 (normal ~ 3)
## Jarque-Bera: 0.197, p=0.906
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.G.S.al.cor.transformed.txt")
pheno <- read.table("trait.optimal.pt.dist.C.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 0.000
## Kurtosis: 2.819 (normal ~ 3)
## Jarque-Bera: 0.201, p=0.904
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.pt.dist.C.al.cor.transformed.txt")
pheno <- read.table("trait.optimal.pt.dist.S.tfam_aligned_corr.txt", header = T)
pheno
pheno$transformed <- INT(pheno$V3)
pheno
y <- pheno$transformed
y <- y[is.finite(y)] # drop NA/NaN/Inf
n <- length(y)
sk <- moments::skewness(y)
ku <- moments::kurtosis(y) # by default, "raw" kurtosis (normal ~ 3)
jb <- n/6 * (sk^2 + ((ku - 3)^2)/4) # Jarque–Bera statistic approximation
p_jb <- 1 - pchisq(jb, df = 2)
cat(sprintf("Skewness: %.3f\nKurtosis: %.3f (normal ~ 3)\nJarque-Bera: %.3f, p=%.3g\n",
sk, ku, jb, p_jb))
## Skewness: 0.000
## Kurtosis: 2.818 (normal ~ 3)
## Jarque-Bera: 0.201, p=0.904
hist(y, breaks = "FD", main = "Trait distribution", xlab = "Trait", col = "grey80", border = "white")
lines(density(y), lwd = 2)
qqnorm(y, main = "Q-Q plot of trait")
qqline(y, lwd = 2)
OK - looks good - let’s save this:
pheno2 <- pheno[,c(1:2,4)]
pheno2
write.table(pheno2, row.names = F, col.names = F, "trait.optimal.pt.dist.S.al.cor.transformed.txt")