Analysis of Cyanobacteria data

#cyano<-read.table("sizes_and_transfers", h=F, comment.char = "!")
# Updated version: we now use the Cyanobacteria data with 41 species, not the 2013 one with 36 species.
cyano<-read.table("size_nb_transfers_cyano", h=F, comment.char = "!")

summary(cyano)
##        V1                V2         
##  Min.   :  2.000   Min.   : 0.0000  
##  1st Qu.:  2.000   1st Qu.: 0.0060  
##  Median :  4.000   Median : 0.0550  
##  Mean   :  9.481   Mean   : 0.5684  
##  3rd Qu.: 10.000   3rd Qu.: 0.6270  
##  Max.   :286.000   Max.   :89.5660
cyano$V2 <- cyano$V2+0.1
summary(cyano$V2/(2*cyano$V1-2))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0008333 0.0897059 0.1263889 0.1211481 0.1525000 0.3025000
plot(2*cyano$V1-2, cyano$V2, pch=20, xlab="Number of branches in gene family tree", ylab="Inferred number of transfers per family", ylim=c(0,max(cyano$V2)), col=rgb(0,0,0,0.2))

hist(cyano$V2/(2*cyano$V1-2), pch=20, xlab="Normalized inferred number of transfers per family", ylab="Number of families", nc=30)

plot(density(cyano$V2/(2*cyano$V1-2)), lwd=2, xlab="Normalized inferred number of transfers per family", ylab="Number of families", main="")

boxplot(cyano$V2/(2*cyano$V1-2), xlab="Normalized inferred number of transfers per family", main="", horizontal = TRUE, outline=F, col=rgb(0,0.5,0,0.8), lwd=2)

Fungi analysis

fungi<-read.table("fungi_size_nb_transfers", h=F, comment.char = "!")

summary(fungi)
##        V1                V2         
##  Min.   :  2.000   Min.   : 0.0000  
##  1st Qu.:  4.000   1st Qu.: 0.0000  
##  Median :  8.000   Median : 0.0000  
##  Mean   :  8.973   Mean   : 0.9581  
##  3rd Qu.: 11.000   3rd Qu.: 1.0600  
##  Max.   :200.000   Max.   :55.0200
fungi$V2 <- fungi$V2+0.1
summary(fungi$V2/(2*fungi$V1-2))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001351 0.007143 0.025000 0.057612 0.074286 0.830000
plot(2*fungi$V1-2, fungi$V2, pch=20, xlab="Number of branches in gene family tree", ylab="Inferred number of transfers per family", ylim=c(0,max(fungi$V2)), col=rgb(0,0,0,0.2))

hist(fungi$V2/(2*fungi$V1-2), pch=20, xlab="Normalized inferred number of transfers per family", ylab="Number of families", nc=30)

plot(density(fungi$V2/(2*fungi$V1-2)), lwd=2, xlab="Normalized inferred number of transfers per family", ylab="Number of families", main="")

boxplot(fungi$V2/(2*fungi$V1-2), xlab="Normalized inferred number of transfers per family", main="", horizontal = TRUE, outline=F, col=rgb(0.5,0,0,0.8), lwd=2)

d<-read.table("similarities", h=T, comment.char = "!")

summary(d)
##      X.exp    number_transfers_per_family transfer_rate      
##  2A_1   : 1   Min.   : 0.01301            Min.   :1.000e-09  
##  2A1_1  : 1   1st Qu.: 1.30838            1st Qu.:8.750e-08  
##  2A2_1  : 1   Median :15.26319            Median :1.000e-06  
##  2A3_1  : 1   Mean   :25.02384            Mean   :2.242e-06  
##  2A4_1  : 1   3rd Qu.:40.96979            3rd Qu.:3.250e-06  
##  2A5_2  : 1   Max.   :80.01358            Max.   :1.000e-05  
##  (Other):10                                                  
##     kendall          conf_sol        conf_true       conf_sol_0.8   
##  Min.   :0.6013   Min.   : 0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.7903   1st Qu.: 1.352   1st Qu.: 4.285   1st Qu.:0.6172  
##  Median :0.9161   Median : 7.781   Median : 9.453   Median :1.9496  
##  Mean   :0.8612   Mean   : 9.971   Mean   :11.644   Mean   :2.4324  
##  3rd Qu.:0.9459   3rd Qu.:17.406   3rd Qu.:17.916   3rd Qu.:3.6892  
##  Max.   :0.9644   Max.   :25.774   Max.   :27.062   Max.   :6.6026  
##                                                                     
##  conf_true_0.8  
##  Min.   :0.000  
##  1st Qu.:3.043  
##  Median :4.247  
##  Mean   :4.607  
##  3rd Qu.:6.561  
##  Max.   :9.651  
## 
#d$number_transfers_per_family <- d$number_transfers_per_family+0.1

Figure 6 in the manuscript

plot(d$number_transfers_per_family/(2*100-2), d$conf_sol, pch=20, xlab="Normalized inferred number of transfers per family", ylab="Percentage of conflicting constraints", ylim=c(0,max(c(d$conf_sol, d$conf_true)) + 18 ), log="x", cex=1.5, xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)

points(d$number_transfers_per_family/(2*100-2), d$conf_true, pch=20, col="red", cex=1.5, xaxt="n")
legend("topleft", legend=c("Conflict with reconstructed time order","Conflict with true time order"), col=c("black", "red"), pch=20, cex=1)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_sol), col = rgb(0,0,0,0.3), lwd=2, xaxt="n")
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_true), col = rgb(1,0,0,0.3), lwd=2, xaxt="n")
boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, boxwex=5, at=max(c(d$conf_sol, d$conf_true)) + 1, xaxt="n")
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, boxwex=5, at=max(c(d$conf_sol, d$conf_true)) + 3, xaxt="n")

Same thing, no boxplot

#d$number_transfers_per_family <- d$number_transfers_per_family+0.1

plot(d$number_transfers_per_family/(2*100-2), d$conf_sol, pch=20, xlab="Normalized inferred number of transfers per family", ylab="Percentage of conflicting constraints", ylim=c(0,max(c(d$conf_sol, d$conf_true)) + 0 ), log="x", cex=1.5, xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)
points(d$number_transfers_per_family/(2*100-2), d$conf_true, pch=20, col="red", cex=1.5)
legend("topleft", legend=c("Conflict with reconstructed time order","Conflict with true time order"), col=c("black", "red"), pch=20, cex=1)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_sol), col = rgb(0,0,0,0.3), lwd=2)
lines(lowess(d$number_transfers_per_family/(2*100-2), d$conf_true), col = rgb(1,0,0,0.3), lwd=2)

Figure 5 in the manuscript

mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$transfer_rate, d$number_transfers_per_family/(2*100-2), pch=20, xlab="Transfer rate for simulation", ylab="Normalized inferred number of transfers\nper family",  ylim=c(0,max(d$number_transfers_per_family/(2*100-2))), log="x", cex=1.5)#, xaxt="n")
#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)

lines(lowess(d$transfer_rate, d$number_transfers_per_family/(2*100-2)), col = "grey", lwd=2)
#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)

boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, at=1e-05)
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, at=5e-06)

#ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
#axis(1, at=ticks, labels=ticks)

Même chose, sans Boxplot:

mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$transfer_rate, d$number_transfers_per_family/(2*100-2), pch=20, xlab="Transfer rate for simulation", ylab="Normalized inferred number of transfers\nper family",  ylim=c(0,max(d$number_transfers_per_family/(2*100-2))), log="x", cex=1.5)
lines(lowess(d$transfer_rate, d$number_transfers_per_family/(2*100-2)), col = "grey", lwd=2)

Figure 8 in the manuscript

mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 
plot(d$number_transfers_per_family/(2*100-2), d$kendall, pch=20, xlab="Normalized inferred number of transfers\nper family", ylab="Normalized Kendall similarity\n between inferred and true orders", ylim=c(0.3,max(d$kendall))  , cex=1.5, log="x", xaxt="n")
ticks =c(5e-4, 1e-4,5e-3,1e-3,5e-2,1e-2,5e-1)
axis(1, at=ticks, labels=ticks)
#lines(lowess(d$number_transfers_per_family/(2*100-2), d$kendall), col = "grey", lwd=2)
boxplot(cyano$V2/(2*cyano$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0,0.5,0,0.8), lwd=2, at=0.4, boxwex=0.1, xaxt="n")
boxplot(fungi$V2/(2*fungi$V1-2), xlab="", main="", horizontal = TRUE, add=T, outline=F, col=rgb(0.5,0,0,0.8), lwd=2, boxwex=0.1, at=0.5, xaxt="n")

Figure 4

#d<-read.table("ForFig4.txt", h=T)
d<-read.table("conflictPerNumbersOfGeneTrees", h=T)
summary(d)
##      unused        conflict       kendallToTrue   
##  Min.   :   1   Min.   : 0.5597   Min.   :0.6624  
##  1st Qu.: 575   1st Qu.:20.1353   1st Qu.:0.9440  
##  Median :2050   Median :20.6687   Median :0.9586  
##  Mean   :2133   Mean   :19.7308   Mean   :0.9379  
##  3rd Qu.:3525   3rd Qu.:20.7161   3rd Qu.:0.9616  
##  Max.   :5000   Max.   :20.7645   Max.   :0.9652
#plot(5000-d$unused, d$conflict, pch=20, xlab="Number of gene families", ylab="Percentage of conflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5)
plot(d$unused, d$conflict, pch=20, xlab="Number of gene families", ylab="Percentage of conflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5)

#lines(lowess(5000-d$unused, d$conflict), col = "grey", lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

#plot(5000-d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity between inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5)
plot(d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity\nbetween inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5)

#lines(lowess(5000-d$unused, d$kendallToTrue), col = "grey", lwd=2)

Figure 7 in the manuscript: 2 plots in one figure

par(mfrow=c(2,1))

layout(matrix(c(1,2,2,2), nrow = 4, ncol = 1, byrow = TRUE), c(5,5), c(5,5), respect = F)

mar.default <- c(5,4,4,2) + 0.1
par(mar = c(0, 6, 0, 2)+0.1) 

plot(d$unused, d$conflict, pch=20, xlab="", ylab="Percentage of\nconflicting constraints", ylim=c(0,max(d$conflict)), cex=1.5, xaxt='n', main="", cex.lab=1.5, cex.axis=1.5)

par(mar=c(5,6,0,2) + 0.1)
plot(d$unused, d$kendallToTrue, pch=20, xlab="Number of gene families", ylab="Normalized Kendall similarity\nbetween inferred and true orders", ylim=c(0.6,max(d$kendallToTrue)), cex=1.5, main="", cex.lab=1.5, cex.axis=1.5)

d<-read.table("pop_size", h=T, comment.char = "!")

summary(d)
##    X.pop_size      mean_nb_transfers_per_family    conflict    
##  Min.   :      2   Min.   :29.39                Min.   :12.33  
##  1st Qu.:    110   1st Qu.:51.80                1st Qu.:22.95  
##  Median :   2000   Median :54.29                Median :23.80  
##  Mean   : 317460   Mean   :56.36                Mean   :24.77  
##  3rd Qu.: 110000   3rd Qu.:60.83                3rd Qu.:27.01  
##  Max.   :2000000   Max.   :85.55                Max.   :37.35  
##     kendall      
##  Min.   :0.8520  
##  1st Qu.:0.9221  
##  Median :0.9462  
##  Mean   :0.9281  
##  3rd Qu.:0.9475  
##  Max.   :0.9597

Figure 9 in the manuscript

par(mfrow=c(3,1))

layout(matrix(c(1,2,2,3,3,3,3,3,3,3,3), nrow = 11, ncol = 1, byrow = TRUE), c(5,5), c(5,5), respect = F)

#mar.default <- c(5,4,4,2) + 0.1
par(mar = c(0, 26, 0, 2)+0.1) 

plot(d$X.pop_size, d$conflict, pch=20, xlab="", ylab="", ylim=c(0.6,max(d$conflict)), log="x", cex=1.5, xaxt='n', main="", cex.lab=1, cex.axis=1.5)
mtext(side = 2, "Percentage of\nconflicting constraints", line = 3, las=1, adj = 1)

par(mar=c(0,26,0,2) + 0.1)
plot(d$X.pop_size, d$mean_nb_transfers_per_family/100, pch=20, xlab="", ylab="", ylim=c(0.0,max(d$mean_nb_transfers_per_family/100)), log="x", cex=1.5, xaxt='n', main="", cex.lab=1, cex.axis=1.5)
mtext(side = 2, "Normalized inferred number \nof transfers per family", line = 3, las=1, adj = 1)

par(mar = c(4, 26, 0, 2)+0.1) 
plot(d$X.pop_size, d$kendall, pch=20, xlab="Effective population size", ylab="", ylim=c(0.6,max(d$kendall)), log="x", cex=1.5, main="", cex.lab=1.5, cex.axis=1.5)
mtext(side = 2, "Normalized Kendall similarity\nbetween inferred and true orders", line = 3, las=1, adj = 1)

#plot(d$X.pop_size, d$mean_nb_transfers_per_family, pch=20, xlab="Effective population size", ylab="Percentage of conflicting constraints", ylim=c(0.6,max(d$conflict)), log="x", cex=1.5)
#lines(lowess(d$X.pop_size, d$conflict), col = "grey", lwd=2)
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$X.pop_size, d$kendall, pch=20, xlab="Effective population size", ylab="Normalized Kendall similarity\n between inferred and true orders", ylim=c(0.6,max(d$kendall)), log="x", cex=1.5)

#lines(lowess(d$X.pop_size, d$kendall), col = "grey", lwd=2)

Figure 7

#d<-read.table("tree_perturbation.txtOLD", h=T )
d<-read.table("tree_perturbationOK.txt", h=T )
d$tree <- as.character(d$tree)
summary(d)
##      tree           percentCommon      kendall      
##  Length:25          Min.   :82.00   Min.   :0.8905  
##  Class :character   1st Qu.:85.00   1st Qu.:0.9212  
##  Mode  :character   Median :92.00   Median :0.9358  
##                     Mean   :90.92   Mean   :0.9313  
##                     3rd Qu.:97.00   3rd Qu.:0.9440  
##                     Max.   :98.00   Max.   :0.9521

We have to add in the true trees:

d<-rbind(c("true_tree_1",100,0.95143603133), d)
d<-rbind(c("true_tree_2",100,0.94639944069), d)
d<-rbind(c("true_tree_3",100,0.95167464114), d)
d<-rbind(c("true_tree_4",100,0.96209016393), d)
d<-rbind(c("true_tree_5",100,0.96421225833), d)
#plot(sort(rep( 1:6, 5)), c(d$true_tree, d$reroot_grandchildren, d$X5NNI, d$X10NNI, d$X15NNI, d$X20NNI), xlab="Species tree topology used for MaxTic", ylab="Normalized Kendall similarity\n between inferred and true orders", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(0.85, 1.0), cex=2)
#axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))


mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(sort(rep( 1:6, 5)), d$kendall, xlab="Species tree topology used for MaxTic", ylab="Normalized Kendall similarity\n between inferred and true orders", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(0.85, 1.0), cex=2)
axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))

mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(sort(rep( 1:6, 5)), d$percentCommon, xlab="Species tree topology used for MaxTic", ylab="Percentage of clades in common \n between altered and true trees", xaxt="n", pch=20, col=rgb(0,0,0,0.6), ylim=c(80, 100), cex=2)
axis(1, at=c(1,2,3,4, 5, 6), labels=c("True", "Rerooted", "5 NNIs", "10 NNIs","15 NNIs","20 NNIs"))

Figure 10 in the manuscript

print(d$tree)
##  [1] "true_tree_5" "true_tree_4" "true_tree_3" "true_tree_2" "true_tree_1"
##  [6] "root1"       "root2"       "root3"       "root4"       "root5"      
## [11] "NNI5_1"      "NNI5_2"      "NNI5_3"      "NNI5_4"      "NNI5_5"     
## [16] "NNI10_1"     "NNI10_2"     "NNI10_3"     "NNI10_4"     "NNI10_5"    
## [21] "NNI15_1"     "NNI15_2"     "NNI15_3"     "NNI15_4"     "NNI15_5"    
## [26] "NNI20_1"     "NNI20_2"     "NNI20_3"     "NNI20_4"     "NNI20_5"
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.85, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)

Now we are going to read in Kendall similarities obtained on random node orders on trees with some proportion of common clades with the true tree.

randSim<-read.table("random_trees_similarity", h=F )
summary(randSim)
##                 V1            V2               V3        
##  random_trees    :100   Min.   : 82.00   Min.   :0.6417  
##  random_trees_100:100   1st Qu.: 85.00   1st Qu.:0.7180  
##  random_trees_84 :100   Median : 90.00   Median :0.7352  
##  random_trees_85 :100   Mean   : 90.82   Mean   :0.7351  
##  random_trees_87 :100   3rd Qu.: 97.00   3rd Qu.:0.7530  
##  random_trees_89 :100   Max.   :100.00   Max.   :0.8188  
##  (Other)         :500
mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.60, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)
points(jitter(randSim$V2), randSim$V3, pch=20, cex=1, col=rgb(0,0,0,0.3))
legend("bottomleft", col=c(rgb(0,0,1,0.6), rgb(1,0,0,0.6), rgb(0,1,0,0.6), rgb(0,0,0,0.6) ), pch=20,cex=0.75, pt.cex=1.5, legend=c("True tree", "Rerooted", "NNIs", "Random node order"), horiz=T)

Même chose, mais on montre les quantiles des distributions.

mar.default <- c(5,4,4,2) + 0.1
par(mar = mar.default + c(0, 2, 0, 0)) 

plot(d$percentCommon, d$kendall, xlab="Percentage of clades in common \n between altered and true trees", ylab="Normalized Kendall similarity\n between inferred and true orders", pch=20, col=c(rep( rgb(0,0,1,0.6), 5), rep( rgb(1,0,0,0.6), 5), rep( rgb(0,1,0,0.6), 20) ), ylim=c(0.64, 1.0), cex=2, xlim=c(max(as.numeric(d$percentCommon)), min(as.numeric(d$percentCommon))))
abline(lm(d$kendall~as.numeric(d$percentCommon)), col="grey", lwd=2)
quants <- quantile (randSim$V3, probs=seq(0, 1, 0.05))
#points(jitter(randSim$V2), randSim$V3, pch=20, cex=1, col=rgb(0,0,0,0.3))
abline(h=quants[2], lwd=2, lty=2)
abline(h=quants[20], lwd=2, lty=2)
abline(h=quants[11], lwd=2)
legend("bottomleft", col=c(rgb(0,0,1,0.6), rgb(1,0,0,0.6), rgb(0,1,0,0.6) ), pch=20, cex=1, legend=c("True tree", "Rerooted", "NNIs"), ncol=3)