setwd("/flash/LuscombeU/jiashun/20230612_phyluce/")
#install.packages("viridis")
#install.packages("ggrepel")
#install.packages("GGally")
#install.packages("entropy")
library(viridis)
## Loading required package: viridisLite
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggrepel)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(entropy)
# read the data
d = read.delim("8-iqtree/matrix70/matrix70_concord.cf.stat", header = T, comment.char="#")
colnames(d)
## [1] "ID" "gCF" "gCF_N" "gDF1" "gDF1_N" "gDF2" "gDF2_N" "gDFP"
## [9] "gDFP_N" "gN" "sCF" "sCF_N" "sDF1" "sDF1_N" "sDF2" "sDF2_N"
## [17] "sN" "Label" "Length"
# 1 ID: Branch ID
# 2 gCF: Gene concordance factor (=gCF_N/gN %)
# 3 gCF_N: Number of trees concordant with the branch
# 4 gDF1: Gene discordance factor for NNI-1 branch (=gDF1_N/gN %)
# 5 gDF1_N: Number of trees concordant with NNI-1 branch
# 6 gDF2: Gene discordance factor for NNI-2 branch (=gDF2_N/gN %)
# 7 gDF2_N: Number of trees concordant with NNI-2 branch
# 8 gDFP: Gene discordance factor due to polyphyly (=gDFP_N/gN %)
# 9 gDFP_N: Number of trees decisive but discordant due to polyphyly
# 10 gN: Number of trees decisive for the branch
# 11 sCF: Site concordance factor averaged over 100 quartets (=sCF_N/sN %)
# 12 sCF_N: sCF in absolute number of sites
# 13 sDF1: Site discordance factor for alternative quartet 1 (=sDF1_N/sN %)
# 14 sDF1_N: sDF1 in absolute number of sites
# 15 sDF2: Site discordance factor for alternative quartet 2 (=sDF2_N/sN %)
# 16 sDF2_N: sDF2 in absolute number of sites
# 17 sN: Number of informative sites averaged over 100 quartets
# 18 Label: Existing branch label
# 18 Length: Branch length
summary(d)
## ID gCF gCF_N gDF1
## Min. :36.00 Min. : 7.50 Min. : 332 Min. : 0.000
## 1st Qu.:43.75 1st Qu.:41.76 1st Qu.:1809 1st Qu.: 1.045
## Median :51.50 Median :75.32 Median :3340 Median : 3.385
## Mean :51.50 Mean :64.30 Mean :2796 Mean : 5.150
## 3rd Qu.:59.25 3rd Qu.:89.90 3rd Qu.:3908 3rd Qu.: 6.505
## Max. :67.00 Max. :99.60 Max. :4347 Max. :23.680
## gDF1_N gDF2 gDF2_N gDFP
## Min. : 0.0 Min. : 0.000 Min. : 0.0 Min. : 0.170
## 1st Qu.: 44.0 1st Qu.: 0.615 1st Qu.: 27.5 1st Qu.: 3.915
## Median : 148.0 Median : 3.990 Median : 176.5 Median : 8.070
## Mean : 226.7 Mean : 5.691 Mean : 250.4 Mean :24.861
## 3rd Qu.: 289.0 3rd Qu.: 8.668 3rd Qu.: 383.2 3rd Qu.:43.620
## Max. :1021.0 Max. :27.920 Max. :1204.0 Max. :83.560
## gDFP_N gN sCF sCF_N
## Min. : 7.0 Min. :3877 Min. : 34.40 Min. : 525.9
## 1st Qu.: 173.8 1st Qu.:4381 1st Qu.: 52.42 1st Qu.: 2444.8
## Median : 359.5 Median :4436 Median : 66.06 Median : 11534.7
## Mean :1102.4 Mean :4376 Mean : 69.16 Mean : 28568.9
## 3rd Qu.:1942.2 3rd Qu.:4445 3rd Qu.: 85.25 3rd Qu.: 29134.9
## Max. :3699.0 Max. :4459 Max. :100.00 Max. :176527.6
## sDF1 sDF1_N sDF2 sDF2_N
## Min. : 0.00 Min. : 0.0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 6.97 1st Qu.: 511.3 1st Qu.: 8.045 1st Qu.: 541.0
## Median :15.30 Median : 774.6 Median :17.645 Median : 804.9
## Mean :14.62 Mean : 6406.2 Mean :16.220 Mean : 6901.3
## 3rd Qu.:22.35 3rd Qu.: 4961.7 3rd Qu.:25.797 3rd Qu.: 7046.4
## Max. :30.42 Max. :38804.3 Max. :35.180 Max. :39559.5
## sN Label Length
## Min. : 1522 Min. : 98.00 Min. :6.643e-05
## 1st Qu.: 3757 1st Qu.:100.00 1st Qu.:6.122e-04
## Median : 13545 Median :100.00 Median :3.433e-03
## Mean : 41876 Mean : 99.94 Mean :1.554e-02
## 3rd Qu.: 44454 3rd Qu.:100.00 3rd Qu.:9.838e-03
## Max. :239567 Max. :100.00 Max. :1.635e-01
#names(d)[10] = "bootstrap"
#names(d)[11] = "branchlength"
head(d)
## ID gCF gCF_N gDF1 gDF1_N gDF2 gDF2_N gDFP gDFP_N gN sCF sCF_N
## 1 36 99.60 4193 0.05 2 0.19 8 0.17 7 4210 83.72 102677.87
## 2 37 86.72 3722 3.40 146 3.45 148 6.43 276 4292 47.22 45981.28
## 3 38 83.69 3504 4.11 172 6.14 257 6.07 254 4187 46.06 45056.37
## 4 39 97.17 4052 1.20 50 0.41 17 1.22 51 4170 65.17 98449.02
## 5 40 47.98 2069 23.68 1021 27.92 1204 0.42 18 4312 43.00 59198.91
## 6 41 99.43 4339 0.00 0 0.00 0 0.57 25 4364 91.65 149748.87
## sDF1 sDF1_N sDF2 sDF2_N sN Label Length
## 1 7.39 8980.37 8.89 10771.36 122429.60 100 0.0516744
## 2 23.27 22678.52 29.51 28779.82 97439.62 100 0.0189989
## 3 24.23 23683.18 29.72 29166.21 97905.76 100 0.0200700
## 4 19.74 29834.92 15.10 22799.52 151083.46 100 0.0683911
## 5 28.23 38804.33 28.77 39559.52 137562.76 100 0.0232155
## 6 4.12 6651.05 4.22 6797.98 163197.90 100 0.0787252
More generally, we can look at the links between bootstrap, gCF, and sCF across all nodes of the tree. This is simple to do in R, because IQ-TREE outputs a tab-delimited file that’s easy to read called concord.cf.stat. This file has one row per branch in the tree, and gives a lot of details on the statistics for each branch.
# plot the values
ggplot(d, aes(x = gCF, y = sCF)) +
geom_point(aes(colour = Label)) +
scale_colour_viridis(direction = -1) +
xlim(0, 100) +
ylim(0, 100) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed")+
theme_bw()+
theme(axis.title = element_text(size = 15, color = "black") )
# adding ID label
ggplot(d, aes(x = gCF, y = sCF, label = ID)) +
geom_point(aes(colour = Label)) +
scale_colour_viridis(direction = -1) +
xlim(0, 100) +
ylim(0, 100) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
geom_text_repel()+
theme_bw()+
theme(axis.title = element_text(size = 15, color = "black") )
A discordance factor relates to the proportion of genes (gDF) or sites (sDF) that support a different resolution of the branch we’re interested in.
To get at the discordance factors for the deepest split in the tree above, we need to figure out the ID of the branch. You can do this by loading the concord.cf.branch file into your tree viewer. In this tree the branch labels are ID numbers that are used in various other output files that IQ-TREE produces. In this case, the branch we are interested in has ID 31. Now we can look up some more details on that branch in the file concord.cf.stat. You can do this straight from the text file, using excel, or in R. I’ll do it in R:
d[d$ID==46,]
## ID gCF gCF_N gDF1 gDF1_N gDF2 gDF2_N gDFP gDFP_N gN sCF sCF_N
## 11 46 63.33 2811 12.05 535 10.79 479 13.83 614 4439 80.55 3235.28
## sDF1 sDF1_N sDF2 sDF2_N sN Label Length
## 11 10.27 407.43 9.18 363.5 4006.21 100 0.000936104
chisq = function(DF1, DF2, N){
tryCatch({
# converts percentages to counts, runs chisq, gets pvalue
chisq.test(c(round(DF1*N)/100, round(DF2*N)/100))$p.value
},
error = function(err) {
# errors come if you give chisq two zeros
# but here we're sure that there's no difference
return(1.0)
})
}
e = d %>%
group_by(ID) %>%
mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>%
mutate(sEF_p = chisq(sDF1, sDF2, sN))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `gEF_p = chisq(gDF1, gDF2, gN)`.
## ℹ In group 31: `ID = 66`.
## Caused by warning in `chisq.test()`:
## ! Chi-squared approximation may be incorrect