#cyano<-read.table("sizes_and_transfers", h=F, comment.char = "!")
# Updated version: we now use the Cyanobacteria data with 41 species, not the 2013 one with 36 species.
cyano<-read.table("size_nb_transfers_cyano", h=F, comment.char = "!")
summary(cyano)
## V1 V2
## Min. : 2.000 Min. : 0.0000
## 1st Qu.: 2.000 1st Qu.: 0.0060
## Median : 4.000 Median : 0.0550
## Mean : 9.481 Mean : 0.5684
## 3rd Qu.: 10.000 3rd Qu.: 0.6270
## Max. :286.000 Max. :89.5660
cyano$V2 <- cyano$V2+0.1
summary(cyano$V2/(2*cyano$V1-2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0008333 0.0897059 0.1263889 0.1211481 0.1525000 0.3025000
plot(2*cyano$V1-2, cyano$V2, pch=20, xlab="Number of branches in gene family tree", ylab="Inferred number of transfers per family", ylim=c(0,max(cyano$V2)), col=rgb(0,0,0,0.2))
hist(cyano$V2/(2*cyano$V1-2), pch=20, xlab="Normalized inferred number of transfers per family", ylab="Number of families", nc=30)
plot(density(cyano$V2/(2*cyano$V1-2)), lwd=2, xlab="Normalized inferred number of transfers per family", ylab="Number of families", main="")
boxplot(cyano$V2/(2*cyano$V1-2), xlab="Normalized inferred number of transfers per family", main="", horizontal = TRUE, outline=F, col=rgb(0,0.5,0,0.8), lwd=2)
fungi<-read.table("fungi_size_nb_transfers", h=F, comment.char = "!")
summary(fungi)
## V1 V2
## Min. : 2.000 Min. : 0.0000
## 1st Qu.: 4.000 1st Qu.: 0.0000
## Median : 8.000 Median : 0.0000
## Mean : 8.973 Mean : 0.9581
## 3rd Qu.: 11.000 3rd Qu.: 1.0600
## Max. :200.000 Max. :55.0200
fungi$V2 <- fungi$V2+0.1
summary(fungi$V2/(2*fungi$V1-2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001351 0.007143 0.025000 0.057612 0.074286 0.830000
plot(2*fungi$V1-2, fungi$V2, pch=20, xlab="Number of branches in gene family tree", ylab="Inferred number of transfers per family", ylim=c(0,max(fungi$V2)), col=rgb(0,0,0,0.2))
hist(fungi$V2/(2*fungi$V1-2), pch=20, xlab="Normalized inferred number of transfers per family", ylab="Number of families", nc=30)
plot(density(fungi$V2/(2*fungi$V1-2)), lwd=2, xlab="Normalized inferred number of transfers per family", ylab="Number of families", main="")
boxplot(fungi$V2/(2*fungi$V1-2), xlab="Normalized inferred number of transfers per family", main="", horizontal = TRUE, outline=F, col=rgb(0.5,0,0,0.8), lwd=2)
d<-read.table("similarities", h=T, comment.char = "!")
summary(d)
## X.exp number_transfers_per_family transfer_rate
## 2A_1 : 1 Min. : 0.01301 Min. :1.000e-09
## 2A1_1 : 1 1st Qu.: 1.30838 1st Qu.:8.750e-08
## 2A2_1 : 1 Median :15.26319 Median :1.000e-06
## 2A3_1 : 1 Mean :25.02384 Mean :2.242e-06
## 2A4_1 : 1 3rd Qu.:40.96979 3rd Qu.:3.250e-06
## 2A5_2 : 1 Max. :80.01358 Max. :1.000e-05
## (Other):10
## kendall conf_sol conf_true conf_sol_0.8
## Min. :0.6013 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.7903 1st Qu.: 1.352 1st Qu.: 4.285 1st Qu.:0.6172
## Median :0.9161 Median : 7.781 Median : 9.453 Median :1.9496
## Mean :0.8612 Mean : 9.971 Mean :11.644 Mean :2.4324
## 3rd Qu.:0.9459 3rd Qu.:17.406 3rd Qu.:17.916 3rd Qu.:3.6892
## Max. :0.9644 Max. :25.774 Max. :27.062 Max. :6.6026
##
## conf_true_0.8
## Min. :0.000
## 1st Qu.:3.043
## Median :4.247
## Mean :4.607
## 3rd Qu.:6.561
## Max. :9.651
##
#d$number_transfers_per_family <- d$number_transfers_per_family+0.1
plot(d$number_transfers_per_family/(2*100-2), d$conf_sol, pch=20, xlab="Normalized inferred number of transfers per family", ylab="Percentage of conflicting constraints", ylim=c(0,max(c(d$conf_sol, d$conf_true)) + 18 ), log="x", cex=1.5, xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)
points(d$number_transfers_per_family/(2*100-2), d$conf_true, pch=20, col="red", cex=1.5, xaxt="n")
legend("topleft", legend=c("Conflict with reconstructed time order","Conflict with true time order"), col=c("black", "red"), pch=20, cex=1)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_sol), col = rgb(0,0,0,0.3), lwd=2, xaxt="n")
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_true), col = rgb(1,0,0,0.3), lwd=2, xaxt="n")
boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, boxwex=5, at=max(c(d$conf_sol, d$conf_true)) + 1, xaxt="n")
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, boxwex=5, at=max(c(d$conf_sol, d$conf_true)) + 3, xaxt="n")
#d$number_transfers_per_family <- d$number_transfers_per_family+0.1
plot(d$number_transfers_per_family/(2*100-2), d$conf_sol, pch=20, xlab="Normalized inferred number of transfers per family", ylab="Percentage of conflicting constraints", ylim=c(0,max(c(d$conf_sol, d$conf_true)) + 0 ), log="x", cex=1.5, xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)
points(d$number_transfers_per_family/(2*100-2), d$conf_true, pch=20, col="red", cex=1.5)
legend("topleft", legend=c("Conflict with reconstructed time order","Conflict with true time order"), col=c("black", "red"), pch=20, cex=1)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_sol), col = rgb(0,0,0,0.3), lwd=2)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_true), col = rgb(1,0,0,0.3), lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$transfer_rate, d$number_transfers_per_family/(2*100-2), pch=20, xlab="Transfer rate for simulation", ylab="Normalized inferred number of transfers\nper family", ylim=c(0,max(d$number_transfers_per_family/(2*100-2))), log="x", cex=1.5)#, xaxt="n")
#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)
lines(lowess(d$transfer_rate, d$number_transfers_per_family/(2*100-2)), col = "grey", lwd=2)
#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)
boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, at=1e-05)
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, at=5e-06)
#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)
Même chose, sans Boxplot:
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$transfer_rate, d$number_transfers_per_family/(2*100-2), pch=20, xlab="Transfer rate for simulation", ylab="Normalized inferred number of transfers\nper family", ylim=c(0,max(d$number_transfers_per_family/(2*100-2))), log="x", cex=1.5)
lines(lowess(d$transfer_rate, d$number_transfers_per_family/(2*100-2)), col = "grey", lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$number_transfers_per_family/(2*100-2), d$kendall, pch=20, xlab="Normalized inferred number of transfers\nper family", ylab="Normalized Kendall similarity\n between inferred and true orders", ylim=c(0.3,max(d$kendall)) , cex=1.5, log="x", xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)
#lines(lowess(d$number_transfers_per_family/(2*100-2), d$kendall), col = "grey", lwd=2)
boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, at=0.4, boxwex=0.1, xaxt="n")
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, boxwex=0.1, at=0.5, xaxt="n")
#d<-read.table("ForFig4.txt", h=T)
d<-read.table("conflictPerNumbersOfGeneTrees", h=T)
summary(d)
## unused conflict kendallToTrue
## Min. : 1 Min. : 0.5597 Min. :0.6624
## 1st Qu.: 575 1st Qu.:20.1353 1st Qu.:0.9440
## Median :2050 Median :20.6687 Median :0.9586
## Mean :2133 Mean :19.7308 Mean :0.9379
## 3rd Qu.:3525 3rd Qu.:20.7161 3rd Qu.:0.9616
## Max. :5000 Max. :20.7645 Max. :0.9652
#plot(5000-d$unused, d$conflict, pch=20, xlab="Number of gene families", ylab="Percentage of conflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5)
plot(d$unused, d$conflict, pch=20, xlab="Number of gene families", ylab="Percentage of conflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5)
#lines(lowess(5000-d$unused, d$conflict), col = "grey", lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
#plot(5000-d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity between inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5)
plot(d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity\nbetween inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5)
#lines(lowess(5000-d$unused, d$kendallToTrue), col = "grey", lwd=2)
par(mfrow=c(2,1))
layout(matrix(c(1,2,2,2), nrow = 4, ncol = 1, byrow = TRUE), c(5,5), c(5,5), respect = F)
mar.default <- c(5,4,4,2) + 0.1
par(mar = c(0, 6, 0, 2)+0.1)
plot(d$unused, d$conflict, pch=20, xlab="", ylab="Percentage of\nconflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5, xaxt='n', main="", cex.lab=1.5, cex.axis=1.5)
par(mar=c(5,6,0,2) + 0.1)
plot(d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity\nbetween inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5, main="", cex.lab=1.5, cex.axis=1.5)
d<-read.table("pop_size", h=T, comment.char = "!")
summary(d)
## X.pop_size mean_nb_transfers_per_family conflict
## Min. : 2 Min. :29.39 Min. :12.33
## 1st Qu.: 110 1st Qu.:51.80 1st Qu.:22.95
## Median : 2000 Median :54.29 Median :23.80
## Mean : 317460 Mean :56.36 Mean :24.77
## 3rd Qu.: 110000 3rd Qu.:60.83 3rd Qu.:27.01
## Max. :2000000 Max. :85.55 Max. :37.35
## kendall
## Min. :0.8520
## 1st Qu.:0.9221
## Median :0.9462
## Mean :0.9281
## 3rd Qu.:0.9475
## Max. :0.9597
par(mfrow=c(3,1))
layout(matrix(c(1,2,2,3,3,3,3,3,3,3,3), nrow = 11, ncol = 1, byrow = TRUE), c(5,5), c(5,5), respect = F)
#mar.default <- c(5,4,4,2) + 0.1
par(mar = c(0, 26, 0, 2)+0.1)
plot(d$X.pop_size, d$conflict, pch=20, xlab="", ylab="", ylim=c(0.6,max(d$conflict)), log="x", cex=1.5, xaxt='n', main="", cex.lab=1, cex.axis=1.5)
mtext(side = 2, "Percentage of\nconflicting constraints", line = 3, las=1, adj = 1)
par(mar=c(0,26,0,2) + 0.1)
plot(d$X.pop_size, d$mean_nb_transfers_per_family/100, pch=20, xlab="", ylab="", ylim=c(0.0,max(d$mean_nb_transfers_per_family/100)), log="x", cex=1.5, xaxt='n', main="", cex.lab=1, cex.axis=1.5)
mtext(side = 2, "Normalized inferred number \nof transfers per family", line = 3, las=1, adj = 1)
par(mar = c(4, 26, 0, 2)+0.1)
plot(d$X.pop_size, d$kendall, pch=20, xlab="Effective population size", ylab="", ylim=c(0.6,max(d$kendall)), log="x", cex=1.5, main="", cex.lab=1.5, cex.axis=1.5)
mtext(side = 2, "Normalized Kendall similarity\nbetween inferred and true orders", line = 3, las=1, adj = 1)
#plot(d$X.pop_size, d$mean_nb_transfers_per_family, pch=20, xlab="Effective population size", ylab="Percentage of conflicting constraints", ylim=c(0.6,max(d$conflict)), log="x", cex=1.5)
#lines(lowess(d$X.pop_size, d$conflict), col = "grey", lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$X.pop_size, d$kendall, pch=20, xlab="Effective population size", ylab="Normalized Kendall similarity\n between inferred and true orders", ylim=c(0.6,max(d$kendall)), log="x", cex=1.5)
#lines(lowess(d$X.pop_size, d$kendall), col = "grey", lwd=2)
#d<-read.table("tree_perturbation.txtOLD", h=T )
d<-read.table("tree_perturbationOK.txt", h=T )
d$tree <- as.character(d$tree)
summary(d)
## tree percentCommon kendall
## Length:25 Min. :82.00 Min. :0.8905
## Class :character 1st Qu.:85.00 1st Qu.:0.9212
## Mode :character Median :92.00 Median :0.9358
## Mean :90.92 Mean :0.9313
## 3rd Qu.:97.00 3rd Qu.:0.9440
## Max. :98.00 Max. :0.9521
We have to add in the true trees:
d<-rbind(c("true_tree_1",100,0.95143603133), d)
d<-rbind(c("true_tree_2",100,0.94639944069), d)
d<-rbind(c("true_tree_3",100,0.95167464114), d)
d<-rbind(c("true_tree_4",100,0.96209016393), d)
d<-rbind(c("true_tree_5",100,0.96421225833), d)
#plot(sort(rep( 1:6, 5)), c(d$true_tree, d$reroot_grandchildren, d$X5NNI, d$X10NNI, d$X15NNI, d$X20NNI), xlab="Species tree topology used for MaxTic", ylab="Normalized Kendall similarity\n between inferred and true orders", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(0.85, 1.0), cex=2)
#axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(sort(rep( 1:6, 5)), d$kendall, xlab="Species tree topology used for MaxTic", ylab="Normalized Kendall similarity\n between inferred and true orders", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(0.85, 1.0), cex=2)
axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(sort(rep( 1:6, 5)), d$percentCommon, xlab="Species tree topology used for MaxTic", ylab="Percentage of clades in common \n between altered and true trees", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(80, 100), cex=2)
axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))
print(d$tree)
## [1] "true_tree_5" "true_tree_4" "true_tree_3" "true_tree_2" "true_tree_1"
## [6] "root1" "root2" "root3" "root4" "root5"
## [11] "NNI5_1" "NNI5_2" "NNI5_3" "NNI5_4" "NNI5_5"
## [16] "NNI10_1" "NNI10_2" "NNI10_3" "NNI10_4" "NNI10_5"
## [21] "NNI15_1" "NNI15_2" "NNI15_3" "NNI15_4" "NNI15_5"
## [26] "NNI20_1" "NNI20_2" "NNI20_3" "NNI20_4" "NNI20_5"
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.85, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)
Now we are going to read in Kendall similarities obtained on random node orders on trees with some proportion of common clades with the true tree.
randSim<-read.table("random_trees_similarity", h=F )
summary(randSim)
## V1 V2 V3
## random_trees :100 Min. : 82.00 Min. :0.6417
## random_trees_100:100 1st Qu.: 85.00 1st Qu.:0.7180
## random_trees_84 :100 Median : 90.00 Median :0.7352
## random_trees_85 :100 Mean : 90.82 Mean :0.7351
## random_trees_87 :100 3rd Qu.: 97.00 3rd Qu.:0.7530
## random_trees_89 :100 Max. :100.00 Max. :0.8188
## (Other) :500
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.60, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)
points(jitter(randSim$V2), randSim$V3, pch=20, cex=1, col=rgb(0,0,0,0.3))
legend("bottomleft", col=c(rgb(0,0,1,0.6), rgb(1,0,0,0.6), rgb(0,1,0,0.6), rgb(0,0,0,0.6) ), pch=20,cex=0.75, pt.cex=1.5, legend=c("True tree", "Rerooted", "NNIs", "Random node order"), horiz=T)
Même chose, mais on montre les quantiles des distributions.
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0))
plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.64, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)
quants <- quantile (randSim$V3, probs=seq(0, 1, 0.05))
#points(jitter(randSim$V2), randSim$V3, pch=20, cex=1, col=rgb(0,0,0,0.3))
abline(h=quants[2], lwd=2, lty=2)
abline(h=quants[20], lwd=2, lty=2)
abline(h=quants[11], lwd=2)
legend("bottomleft", col=c(rgb(0,0,1,0.6), rgb(1,0,0,0.6), rgb(0,1,0,0.6) ), pch=20, cex=1, legend=c("True tree", "Rerooted", "NNIs"), ncol=3)