R Notebook: Provides reproducible analysis for Perfect Homologs in the following manuscript:
Citation: Romanowicz KJ, Resnick C, Hinton SR, Plesa C. Exploring antibiotic resistance in diverse homologs of the dihydrofolate reductase protein family through broad mutational scanning. bioRxiv, 2025.
GitHub Repository: https://github.com/PlesaLab/DHFR
NCBI BioProject: https://www.ncbi.nlm.nih.gov/bioproject/1189478
This pipeline processes a library of 1,536 DHFR homologs and their associated mutants, with two-fold redundancy (two codon variants per sequence). Fitness scores are derived from a multiplexed in-vivo assay using a trimethoprim concentration gradient, assessing the ability of these homologs and their mutants to complement functionality in an E. coli knockout strain and their tolerance to trimethoprim treatment. This analysis provides insights into how antibiotic resistance evolves across a range of evolutionary starting points. Sequence data were generated using the Illumina NovaSeq platform with 100 bp paired-end sequencing of amplicons.
The following R packages must be installed prior to loading into the R session. See the Reproducibility tab for a complete list of packages and their versions used in this workflow.
# Load the latest version of python (3.10.14) for downstream use:
library(reticulate)
use_python("/Users/krom/miniforge3/bin/python3")
# Make a vector of required packages
required.packages <- c("ape", "bio3d", "Biostrings", "castor", "cowplot", "devtools", "dplyr", "ggExtra", "ggnewscale", "ggplot2", "ggpubr", "ggridges", "ggtree", "ggtreeExtra", "glmnet", "gridExtra","igraph", "knitr", "matrixStats", "patchwork", "pheatmap", "purrr", "pscl", "RColorBrewer", "reshape","reshape2", "ROCR", "rstatix", "seqinr", "scales", "stringr", "stringi", "tidyr", "tidytree", "viridis")
# Load required packages with error handling
loaded.packages <- lapply(required.packages, function(package) {
if (!require(package, character.only = TRUE)) {
install.packages(package, dependencies = TRUE)
if (!require(package, character.only = TRUE)) {
message("Package ", package, " could not be installed and loaded.")
return(NULL)
}
}
return(package)
})
# Remove NULL entries from loaded packages
loaded.packages <- loaded.packages[!sapply(loaded.packages, is.null)]
Loaded packages: ape, bio3d, Biostrings, castor, cowplot, devtools, dplyr, ggExtra, ggnewscale, ggplot2, ggpubr, ggridges, ggtree, ggtreeExtra, glmnet, gridExtra, igraph, knitr, matrixStats, patchwork, pheatmap, purrr, pscl, RColorBrewer, reshape, reshape2, ROCR, rstatix, seqinr, scales, stringr, stringi, tidyr, tidytree, viridis
Import MAPPING files generated from DHFR.1.Mapping.RMD relevant for downstream analysis.
### BCcontrols
# Lib15 (Codon 1)
BCcontrols_15_16_shared_median_WT <- read.csv("Count/count_files_formatted/BCcontrols_15_16_shared_median_WT.csv",
header = TRUE, stringsAsFactors = FALSE)
# Lib16 (Codon 2)
BCcontrols_15_16_shared_median_Neg <- read.csv("Count/count_files_formatted/BCcontrols_15_16_shared_median_Neg.csv",
header = TRUE, stringsAsFactors = FALSE)
### mutIDinfo
# mutIDinfo.15.16.zeros.shared
mutIDinfo.15.16.zeros.shared <- read.csv("Mapping/map_files_formatted/mutIDinfo.15.16.zeros.shared.csv",
header = TRUE, stringsAsFactors = FALSE)
# mutIDinfo.15.16.zeros.unique
mutIDinfo.15.16.zeros.unique <- read.csv("Mapping/map_files_formatted/mutIDinfo.15.16.zeros.unique.csv",
header = TRUE, stringsAsFactors = FALSE)
Import COUNTS files generated from DHFR.2.Counts.RMD relevant for downstream analysis.
### BCs_map
# Lib15 (Codon 1)
BCs15_map <- read.csv("Count/count_files_formatted/BCs15_map.csv",
header = TRUE, stringsAsFactors = FALSE)
# Lib16 (Codon 2)
BCs16_map <- read.csv("Count/count_files_formatted/BCs16_map.csv",
header = TRUE, stringsAsFactors = FALSE)
### mutIDinfo (mapping files that have been updated with count data)
# Lib15 (Codon 1)
mutIDinfo15 <- read.csv("Count/count_files_formatted/mutIDinfo15.csv",
header = TRUE, stringsAsFactors = FALSE)
# Lib16 (Codon 2)
mutIDinfo16 <- read.csv("Count/count_files_formatted/mutIDinfo16.csv",
header = TRUE, stringsAsFactors = FALSE)
This section is based on the R file: “R_perfects_only.R”. It describes how to select only for perfects (perfect match to designed homolog sequence where mutations = 0).
Next, begin by selecting perfects BCs from “BCs_map” object where mutations = 0:
# Lib15
BCperfect15 <- BCs15_map %>%
filter(mutations==0) %>%
dplyr::rename(ID=IDalign)
# Lib16
BCperfect16 <- BCs16_map %>%
filter(mutations==0) %>%
dplyr::rename(ID=IDalign)
Perform a quick check to verify no IDs are missing:
# Lib15
#check how many are missing ID
nrow(BCperfect15 %>% filter(ID == ""))
[1] 0
# Lib16
#check how many are missing ID
nrow(BCperfect16 %>% filter(ID == ""))
[1] 0
Create a function to check if each unique perfect sequence represents a “true perfect” based on the designed homolog sequence or a “synonymous mutation” that is functionally redundant.
#check if this is a synonymous mutant
cigar_syn_mutant_check <- function(cigarin){
flag_out <- "Perfect"
if (length(grep("X",cigarin)) != 0) {
flag_out <- "SynMutant"
}
return(flag_out)
}
Apply the “perfect check function” to the “BCperfect” dataset and add a column to indicate each BCs status.
# Lib15
#check if this BC is a synonymous mutant
BCperfect15 <- BCperfect15 %>%
rowwise() %>%
mutate(synflag=cigar_syn_mutant_check(cigar))
# Lib16
#check if this BC is a synonymous mutant
BCperfect16 <- BCperfect16 %>%
rowwise() %>%
mutate(synflag=cigar_syn_mutant_check(cigar))
Count the number of “Perfects” from the dataset:
# Lib15
# Number of perfects
perfects.count15 <- length(which(BCperfect15$synflag=="Perfect"))
format(perfects.count15, big.mark = ",")
[1] "69,114"
# Lib16
# Number of perfects
perfects.count16 <- length(which(BCperfect16$synflag=="Perfect"))
format(perfects.count16, big.mark = ",")
[1] "98,968"
Count the number of “SynMutants” from the dataset:
# Lib15
# Number of synonymous mutants
synmut.count15 <- length(which(BCperfect15$synflag=="SynMutant"))
format(synmut.count15, big.mark = ",")
[1] "3,838"
# Lib16
# Number of synonymous mutants
synmut.count16 <- length(which(BCperfect16$synflag=="SynMutant"))
format(synmut.count16, big.mark = ",")
[1] "8,335"
Create a perfects object that further filters the
mutIDinfo object to retain mutIDs only if they have 0
mutations:
# Lib15
perfects15 <- mutIDinfo15 %>%
filter(mutations==0) %>%
dplyr::rename(ID = IDalign)
# Lib16
perfects16 <- mutIDinfo16 %>%
filter(mutations==0) %>%
dplyr::rename(ID = IDalign)
Select only those perfects with a minimum number of barcode:
# Lib15
# Select only those with a minimum of 2 BCs
perfects15_2BCs <- perfects15 %>%
filter(numprunedBCs > 1)
# Select only those with a minimum of 5 BCs
perfects15_5BCs <- perfects15 %>%
filter(numprunedBCs > 4)
# Select only those with a minimum of 10 BCs
perfects15_10BCs <- perfects15 %>%
filter(numprunedBCs > 9)
# Select only those with a minimum of 100 BCs
perfects15_100BCs <- perfects15 %>%
filter(numprunedBCs > 99)
# Select only those with a minimum of 200 BCs
perfects15_200BCs <- perfects15 %>%
filter(numprunedBCs > 199)
# Lib16
# Select only those with a minimum of 2 BCs
perfects16_2BCs <- perfects16 %>%
filter(numprunedBCs > 1)
### Select only those with a minimum of 5 BCs
perfects16_5BCs <- perfects16 %>%
filter(numprunedBCs > 4)
### Select only those with a minimum of 10 BCs
perfects16_10BCs <- perfects16 %>%
filter(numprunedBCs > 9)
# Select only those with a minimum of 100 BCs
perfects16_100BCs <- perfects16 %>%
filter(numprunedBCs > 99)
# Select only those with a minimum of 200 BCs
perfects16_200BCs <- perfects16 %>%
filter(numprunedBCs > 199)
Count the number of perfects before and after filtering by minimum BC counts (2BCs and 5BCs):
# Lib15
# Count the number of unique perfects prior to filtering by BCs in Lib15
perfects15.count <- length(unique(perfects15$ID))
format(perfects15.count, big.mark = ",")
[1] "961"
# Count the number of unique perfects at 2BCs minimum retained in Lib15
perfects15_2BCs.count <- length(unique(perfects15_2BCs$ID))
format(perfects15_2BCs.count, big.mark = ",")
[1] "896"
# Count the number of unique perfects at 5BCs minimum retained in Lib15
perfects15_5BCs.count <- length(unique(perfects15_5BCs$ID))
format(perfects15_5BCs.count, big.mark = ",")
[1] "797"
# Count the number of unique perfects at 10BCs minimum retained in Lib15
perfects15_10BCs.count <- length(unique(perfects15_10BCs$ID))
format(perfects15_10BCs.count, big.mark = ",")
[1] "704"
# Count the number of unique perfects at 100BCs minimum retained in Lib15
perfects15_100BCs.count <- length(unique(perfects15_100BCs$ID))
format(perfects15_100BCs.count, big.mark = ",")
[1] "257"
# Count the number of unique perfects at 200BCs minimum retained in Lib15
perfects15_200BCs.count <- length(unique(perfects15_200BCs$ID))
format(perfects15_200BCs.count, big.mark = ",")
[1] "105"
# Lib16
# Count the number of unique perfects prior to filtering by BCs in Lib16
perfects16.count <- length(unique(perfects16$ID))
format(perfects16.count, big.mark = ",")
[1] "818"
# Count the number of unique perfects at 2BCs minimum retained in Lib16
perfects16_2BCs.count <- length(unique(perfects16_2BCs$ID))
format(perfects16_2BCs.count, big.mark = ",")
[1] "748"
# Count the number of unique perfects at 5BCs minimum retained in Lib16
perfects16_5BCs.count <- length(unique(perfects16_5BCs$ID))
format(perfects16_5BCs.count, big.mark = ",")
[1] "666"
# Count the number of unique perfects at 10BCs minimum retained in Lib16
perfects16_10BCs.count <- length(unique(perfects16_10BCs$ID))
format(perfects16_10BCs.count, big.mark = ",")
[1] "600"
# Count the number of unique perfects at 100BCs minimum retained in Lib15
perfects16_100BCs.count <- length(unique(perfects16_100BCs$ID))
format(perfects16_100BCs.count, big.mark = ",")
[1] "293"
# Count the number of unique perfects at 200BCs minimum retained in Lib15
perfects16_200BCs.count <- length(unique(perfects16_200BCs$ID))
format(perfects16_200BCs.count, big.mark = ",")
[1] "180"
Filter the Perfects datasets to retain only those with good fitness scores for complementation (>=-1) and those with poor fitness in complementation (<-1). The good fitness Perfects will be used for downstream complementation and TMP resistance analyses, while the poor fitness Perfects will be used for gain-of-function (GOF) analysis.
Good Perfects: Select only those perfects with a minimum fitness value >=-1 for each BC level:
# Lib15
# Select only those with a minimum of 1 BC
perfects15_good <- perfects15 %>%
filter(fitD05D03 >= -1)
# Select only those with a minimum of 2 BCs
perfects15_2BCs_good <- perfects15_2BCs %>%
filter(fitD05D03 >= -1)
# Select only those with a minimum of 5 BCs
perfects15_5BCs_good <- perfects15_5BCs %>%
filter(fitD05D03 >= -1)
# Select only those with a minimum of 10 BCs
perfects15_10BCs_good <- perfects15_10BCs %>%
filter(fitD05D03 >= -1)
# Select only those with a minimum of 100 BCs
perfects15_100BCs_good <- perfects15_100BCs %>%
filter(fitD05D03 >= -1)
# Select only those with a minimum of 200 BCs
perfects15_200BCs_good <- perfects15_200BCs %>%
filter(fitD05D03 >= -1)
# Lib16
# Select only those with a minimum of 1 BCs
perfects16_good <- perfects16 %>%
filter(fitD12D04 >= -1)
# Select only those with a minimum of 2 BCs
perfects16_2BCs_good <- perfects16_2BCs %>%
filter(fitD12D04 >= -1)
### Select only those with a minimum of 5 BCs
perfects16_5BCs_good <- perfects16_5BCs %>%
filter(fitD12D04 >= -1)
### Select only those with a minimum of 10 BCs
perfects16_10BCs_good <- perfects16_10BCs %>%
filter(fitD12D04 >= -1)
# Select only those with a minimum of 100 BCs
perfects16_100BCs_good <- perfects16_100BCs %>%
filter(fitD12D04 >= -1)
# Select only those with a minimum of 200 BCs
perfects16_200BCs_good <- perfects16_200BCs %>%
filter(fitD12D04 >= -1)
Count the number of perfects before and after filtering by minimum fitness (fitness >= -1):
# Lib15
# Count the number of unique perfects prior to filtering by BCs in Lib15
perfects15.good.count <- length(unique(perfects15_good$ID))
format(perfects15.good.count, big.mark = ",")
[1] "489"
# Determine Minimum fitness perfect prior to filtering by BCs in Lib15
perfects15.good.min <- min(perfects15_good$fitD05D03)
format(perfects15.good.min, big.mark = ",")
[1] "-0.9993416"
# Determine Maximum fitness perfect prior to filtering by BCs in Lib15
perfects15.good.max <- max(perfects15_good$fitD05D03)
format(perfects15.good.max, big.mark = ",")
[1] "1.856875"
# Filtered by BCs
# Count the number of unique perfects at 2BCs minimum retained in Lib15
perfects15_2BCs.good.count <- length(unique(perfects15_2BCs_good$ID))
format(perfects15_2BCs.good.count, big.mark = ",")
[1] "459"
# Count the number of unique perfects at 5BCs minimum retained in Lib15
perfects15_5BCs.good.count <- length(unique(perfects15_5BCs_good$ID))
format(perfects15_5BCs.good.count, big.mark = ",")
[1] "416"
# Count the number of unique perfects at 10BCs minimum retained in Lib15
perfects15_10BCs.good.count <- length(unique(perfects15_10BCs_good$ID))
format(perfects15_10BCs.good.count, big.mark = ",")
[1] "371"
# Count the number of unique perfects at 100BCs minimum retained in Lib15
perfects15_100BCs.good.count <- length(unique(perfects15_100BCs_good$ID))
format(perfects15_100BCs.good.count, big.mark = ",")
[1] "144"
# Count the number of unique perfects at 200BCs minimum retained in Lib15
perfects15_200BCs.good.count <- length(unique(perfects15_200BCs_good$ID))
format(perfects15_200BCs.good.count, big.mark = ",")
[1] "54"
# Lib16
# Count the number of unique perfects prior to filtering by BCs in Lib16
perfects16.good.count <- length(unique(perfects16_good$ID))
format(perfects16.good.count, big.mark = ",")
[1] "447"
# Determine Minimum fitness perfect prior to filtering by BCs in Lib16
perfects16.good.min <- min(perfects16_good$fitD12D04)
format(perfects16.good.min, big.mark = ",")
[1] "-0.9948904"
# Determine Maximum fitness perfect prior to filtering by BCs in Lib16
perfects16.good.max <- max(perfects16_good$fitD12D04)
format(perfects16.good.max, big.mark = ",")
[1] "1.298934"
# Filtered by BCs
# Count the number of unique perfects at 2BCs minimum retained in Lib16
perfects16_2BCs.good.count <- length(unique(perfects16_2BCs_good$ID))
format(perfects16_2BCs.good.count, big.mark = ",")
[1] "416"
# Count the number of unique perfects at 5BCs minimum retained in Lib16
perfects16_5BCs.good.count <- length(unique(perfects16_5BCs_good$ID))
format(perfects16_5BCs.good.count, big.mark = ",")
[1] "377"
# Count the number of unique perfects at 10BCs minimum retained in Lib16
perfects16_10BCs.good.count <- length(unique(perfects16_10BCs_good$ID))
format(perfects16_10BCs.good.count, big.mark = ",")
[1] "343"
# Count the number of unique perfects at 100BCs minimum retained in Lib16
perfects16_100BCs.good.count <- length(unique(perfects16_100BCs_good$ID))
format(perfects16_100BCs.good.count, big.mark = ",")
[1] "176"
# Count the number of unique perfects at 200BCs minimum retained in Lib16
perfects16_200BCs.good.count <- length(unique(perfects16_200BCs_good$ID))
format(perfects16_200BCs.good.count, big.mark = ",")
[1] "108"
Merge perfects filtered at 1BC derived from each library into a single dataframe based on shared ID between the two dataset
# Merge by shared "mutID" (1BC)
perfects_15_16_shared <- merge(perfects15, perfects16, by = "mutID", all = FALSE)
# Merge by shared "mutID" (2BC)
perfects_15_16_shared_2BCs <- merge(perfects15_2BCs, perfects16_2BCs, by = "mutID", all = FALSE)
# Merge by shared "mutID" (5BC)
perfects_15_16_shared_5BCs <- merge(perfects15_5BCs, perfects16_5BCs, by = "mutID", all = FALSE)
# Merge by shared "mutID" (10BC)
perfects_15_16_shared_10BCs <- merge(perfects15_10BCs, perfects16_10BCs, by = "mutID", all = FALSE)
# Merge by shared "mutID" (100BC)
perfects_15_16_shared_100BCs <- merge(perfects15_100BCs, perfects16_100BCs, by = "mutID", all = FALSE)
# Merge by shared "mutID" (200BC)
perfects_15_16_shared_200BCs <- merge(perfects15_200BCs, perfects16_200BCs, by = "mutID", all = FALSE)
Count the number of perfects shared between both libraries.
# Count the number of unique perfects shared between libraries (1BC)
perfects_15_16_shared.count <- length(unique(perfects_15_16_shared$mutID))
format(perfects_15_16_shared.count, big.mark = ",")
[1] "643"
# Count the number of unique perfects shared between libraries (2BC)
perfects_15_16_shared_2BCs.count <- length(unique(perfects_15_16_shared_2BCs$mutID))
format(perfects_15_16_shared_2BCs.count, big.mark = ",")
[1] "579"
# Count the number of unique perfects shared between libraries (5BC)
perfects_15_16_shared_5BCs.count <- length(unique(perfects_15_16_shared_5BCs$mutID))
format(perfects_15_16_shared_5BCs.count, big.mark = ",")
[1] "493"
# Count the number of unique perfects shared between libraries (10BC)
perfects_15_16_shared_10BCs.count <- length(unique(perfects_15_16_shared_10BCs$mutID))
format(perfects_15_16_shared_10BCs.count, big.mark = ",")
[1] "419"
# Count the number of unique perfects shared between libraries (100BC)
perfects_15_16_shared_100BCs.count <- length(unique(perfects_15_16_shared_100BCs$mutID))
format(perfects_15_16_shared_100BCs.count, big.mark = ",")
[1] "115"
# Count the number of unique perfects shared between libraries (200BC)
perfects_15_16_shared_200BCs.count <- length(unique(perfects_15_16_shared_200BCs$mutID))
format(perfects_15_16_shared_200BCs.count, big.mark = ",")
[1] "38"
Count the number of perfects unique to one library or the other.
# Create a new dataset retaining unique values from both datasets (1BC)
perfects_15_16_unique <- bind_rows(
anti_join(perfects15, perfects16, by = "mutID"),
anti_join(perfects16, perfects15, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique.count <- length(unique(perfects_15_16_unique$mutID))
format(perfects_15_16_unique.count, big.mark = ",")
[1] "493"
# Create a new dataset retaining unique values from both datasets (2BC)
perfects_15_16_unique_2BCs <- bind_rows(
anti_join(perfects15_2BCs, perfects16_2BCs, by = "mutID"),
anti_join(perfects16_2BCs, perfects15_2BCs, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_2BCs.count <- length(unique(perfects_15_16_unique_2BCs$mutID))
format(perfects_15_16_unique_2BCs.count, big.mark = ",")
[1] "486"
# Create a new dataset retaining unique values from both datasets (5BC)
perfects_15_16_unique_5BCs <- bind_rows(
anti_join(perfects15_5BCs, perfects16_5BCs, by = "mutID"),
anti_join(perfects16_5BCs, perfects15_5BCs, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_5BCs.count <- length(unique(perfects_15_16_unique_5BCs$mutID))
format(perfects_15_16_unique_5BCs.count, big.mark = ",")
[1] "477"
# Create a new dataset retaining unique values from both datasets (10BC)
perfects_15_16_unique_10BCs <- bind_rows(
anti_join(perfects15_10BCs, perfects16_10BCs, by = "mutID"),
anti_join(perfects16_10BCs, perfects15_10BCs, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_10BCs.count <- length(unique(perfects_15_16_unique_10BCs$mutID))
format(perfects_15_16_unique_10BCs.count, big.mark = ",")
[1] "466"
# Create a new dataset retaining unique values from both datasets (100BC)
perfects_15_16_unique_100BCs <- bind_rows(
anti_join(perfects15_100BCs, perfects16_100BCs, by = "mutID"),
anti_join(perfects16_100BCs, perfects15_100BCs, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_100BCs.count <- length(unique(perfects_15_16_unique_100BCs$mutID))
format(perfects_15_16_unique_100BCs.count, big.mark = ",")
[1] "320"
# Create a new dataset retaining unique values from both datasets (200BC)
perfects_15_16_unique_200BCs <- bind_rows(
anti_join(perfects15_200BCs, perfects16_200BCs, by = "mutID"),
anti_join(perfects16_200BCs, perfects15_200BCs, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_200BCs.count <- length(unique(perfects_15_16_unique_200BCs$mutID))
format(perfects_15_16_unique_200BCs.count, big.mark = ",")
[1] "209"
Summarize the number of perfects shared or unique between both libraries based on minimum BC count.
# Count number of shared and unique perfects (1BC)
perfects_15_16_all.count <- sum(perfects_15_16_shared.count + perfects_15_16_unique.count)
format(perfects_15_16_all.count, big.mark = ",")
[1] "1,136"
# Count number of shared and unique perfects (2BC)
perfects_15_16_all_2BCs.count <- sum(perfects_15_16_shared_2BCs.count + perfects_15_16_unique_2BCs.count)
format(perfects_15_16_all_2BCs.count, big.mark = ",")
[1] "1,065"
# Count number of shared and unique perfects (5BC)
perfects_15_16_all_5BCs.count <- sum(perfects_15_16_shared_5BCs.count + perfects_15_16_unique_5BCs.count)
format(perfects_15_16_all_5BCs.count, big.mark = ",")
[1] "970"
# Count number of shared and unique perfects (10BC)
perfects_15_16_all_10BCs.count <- sum(perfects_15_16_shared_10BCs.count + perfects_15_16_unique_10BCs.count)
format(perfects_15_16_all_10BCs.count, big.mark = ",")
[1] "885"
# Count number of shared and unique perfects (100BC)
perfects_15_16_all_100BCs.count <- sum(perfects_15_16_shared_100BCs.count + perfects_15_16_unique_100BCs.count)
format(perfects_15_16_all_100BCs.count, big.mark = ",")
[1] "435"
# Count number of shared and unique perfects (200BC)
perfects_15_16_all_200BCs.count <- sum(perfects_15_16_shared_200BCs.count + perfects_15_16_unique_200BCs.count)
format(perfects_15_16_all_200BCs.count, big.mark = ",")
[1] "247"
Merge perfects filtered by minimum fitness at complementation (>-1) based on shared ID between both libraries:
# Merge by shared "mutID" (1BC)
perfects_15_16_shared_good <- merge(perfects15_good, perfects16_good, by = "mutID", all = FALSE)
# Merge by shared "mutID" (2BC)
perfects_15_16_shared_2BCs_good <- merge(perfects15_2BCs_good, perfects16_2BCs_good, by = "mutID", all = FALSE)
# Merge by shared "mutID" (5BC)
perfects_15_16_shared_5BCs_good <- merge(perfects15_5BCs_good, perfects16_5BCs_good, by = "mutID", all = FALSE)
Count the number of perfects shared between both libraries.
# Count the number of unique perfects shared between libraries (1BC)
perfects_15_16_shared.good.count <- length(unique(perfects_15_16_shared_good$mutID))
format(perfects_15_16_shared.good.count, big.mark = ",")
[1] "236"
# Count the number of unique perfects shared between libraries (2BC)
perfects_15_16_shared_2BCs.good.count <- length(unique(perfects_15_16_shared_2BCs_good$mutID))
format(perfects_15_16_shared_2BCs.good.count, big.mark = ",")
[1] "217"
# Count the number of unique perfects shared between libraries (5BC)
perfects_15_16_shared_5BCs.good.count <- length(unique(perfects_15_16_shared_5BCs_good$mutID))
format(perfects_15_16_shared_5BCs.good.count, big.mark = ",")
[1] "194"
Count the number of perfects unique to one library or the other.
# Create a new dataset retaining unique values from both datasets (1BC)
perfects_15_16_unique_good <- bind_rows(
anti_join(perfects15_good, perfects16_good, by = "mutID"),
anti_join(perfects16_good, perfects15_good, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique.good.count <- length(unique(perfects_15_16_unique_good$mutID))
format(perfects_15_16_unique.good.count, big.mark = ",")
[1] "464"
# Create a new dataset retaining unique values from both datasets (2BC)
perfects_15_16_unique_2BCs_good <- bind_rows(
anti_join(perfects15_2BCs_good, perfects16_2BCs_good, by = "mutID"),
anti_join(perfects16_2BCs_good, perfects15_2BCs_good, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_2BCs.good.count <- length(unique(perfects_15_16_unique_2BCs_good$mutID))
format(perfects_15_16_unique_2BCs.good.count, big.mark = ",")
[1] "441"
# Create a new dataset retaining unique values from both datasets (5BC)
perfects_15_16_unique_5BCs_good <- bind_rows(
anti_join(perfects15_5BCs_good, perfects16_5BCs_good, by = "mutID"),
anti_join(perfects16_5BCs_good, perfects15_5BCs_good, by = "mutID"))
# Count the number of perfects unique to one library or the other
perfects_15_16_unique_5BCs.good.count <- length(unique(perfects_15_16_unique_5BCs_good$mutID))
format(perfects_15_16_unique_5BCs.good.count, big.mark = ",")
[1] "405"
Summarize the number of perfects shared or unique between both libraries based on minimum BC count.
# Count number of shared and unique perfects (1BC)
perfects_15_16_all.good.count <- sum(perfects_15_16_shared.good.count + perfects_15_16_unique.good.count)
format(perfects_15_16_all.good.count, big.mark = ",")
[1] "700"
# Count number of shared and unique perfects (2BC)
perfects_15_16_all_2BCs.good.count <- sum(perfects_15_16_shared_2BCs.good.count + perfects_15_16_unique_2BCs.good.count)
format(perfects_15_16_all_2BCs.good.count, big.mark = ",")
[1] "658"
# Count number of shared and unique perfects (5BC)
perfects_15_16_all_5BCs.good.count <- sum(perfects_15_16_shared_5BCs.good.count + perfects_15_16_unique_5BCs.good.count)
format(perfects_15_16_all_5BCs.good.count, big.mark = ",")
[1] "599"
Because many of the shared mutIDs have “NA” values for one library or the other at certain treatment conditions, we need to subset the dataframe into smaller datasets relevant for each condition we want to plot and remove rows containing “NA” values. All correlations are based on the “BCs.15.16.mutID.fitness.perfects.shared” dataset.
Subset relevant data columns and remove rows containing “NA” values:
# Complementation - 0-TMP
L15.L16.Counts.0.TMP <- perfects_15_16_shared_5BCs[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD05D03", "fitD12D04")] %>%
na.omit(L15.L16.Counts.0.TMP)
# Trimethoprim - 0.058-TMP
L15.L16.Counts.0.058.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD06D03", "fitE01D04")] %>%
na.omit(L15.L16.Counts.0.058.TMP)
# Trimethoprim - 0.5-TMP
L15.L16.Counts.0.5.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD07D03", "fitE02D04")] %>%
na.omit(L15.L16.Counts.0.5.TMP)
# Trimethoprim - 1.0-TMP
L15.L16.Counts.1.0.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD08D03", "fitE03D04")] %>%
na.omit(L15.L16.Counts.1.0.TMP)
# Trimethoprim - 10-TMP
L15.L16.Counts.10.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD09D03", "fitE04D04")] %>%
na.omit(L15.L16.Counts.10.TMP)
# Trimethoprim - 50-TMP
L15.L16.Counts.50.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD10D03", "fitE05D04")] %>%
na.omit(L15.L16.Counts.50.TMP)
# Trimethoprim - 200-TMP
L15.L16.Counts.200.TMP <- perfects_15_16_shared_5BCs_good[, c("mutID", "numprunedBCs.x", "numprunedBCs.y", "fitD11D03", "fitE06D04")]%>%
na.omit(L15.L16.Counts.200.TMP)
The following section calculates Pearson’s correlations based on the fitness scores of unique mutIDs shared between Lib15 and Lib16 for Time Point 1.
Pearson’s Correlation: Determine correlations using fitness values from Libraries 15 and 16 at Time Point 1:
# Pearson's Correlation: 0-TMP (Complementation)
cor_test_Counts_shared_0_tmp <- cor.test(L15.L16.Counts.0.TMP$fitD05D03,
L15.L16.Counts.0.TMP$fitD12D04)
# Pearson's Correlation: 0.058-TMP
cor_test_Counts_shared_0.058_tmp <- cor.test(L15.L16.Counts.0.058.TMP$fitD06D03,
L15.L16.Counts.0.058.TMP$fitE01D04)
# Pearson's Correlation: 0.5-TMP
cor_test_Counts_shared_0.5_tmp <- cor.test(L15.L16.Counts.0.5.TMP$fitD07D03,
L15.L16.Counts.0.5.TMP$fitE02D04)
# Pearson's Correlation: 1.0-TMP
cor_test_Counts_shared_1.0_tmp <- cor.test(L15.L16.Counts.1.0.TMP$fitD08D03,
L15.L16.Counts.1.0.TMP$fitE03D04)
# Pearson's Correlation: 10-TMP
cor_test_Counts_shared_10_tmp <- cor.test(L15.L16.Counts.10.TMP$fitD09D03,
L15.L16.Counts.10.TMP$fitE04D04)
# Pearson's Correlation: 50-TMP
cor_test_Counts_shared_50_tmp <- cor.test(L15.L16.Counts.50.TMP$fitD10D03,
L15.L16.Counts.50.TMP$fitE05D04)
# Pearson's Correlation: 200-TMP
cor_test_Counts_shared_200_tmp <- cor.test(L15.L16.Counts.200.TMP$fitD11D03,
L15.L16.Counts.200.TMP$fitE06D04)
# Print the full statistical output:
print(cor_test_Counts_shared_0_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.0.TMP$fitD05D03 and L15.L16.Counts.0.TMP$fitD12D04
t = 11.66, df = 483, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3961813 0.5353816
sample estimates:
cor
0.4686859
print(cor_test_Counts_shared_0.058_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.0.058.TMP$fitD06D03 and L15.L16.Counts.0.058.TMP$fitE01D04
t = 20.713, df = 189, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7839299 0.8720023
sample estimates:
cor
0.8331753
print(cor_test_Counts_shared_0.5_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.0.5.TMP$fitD07D03 and L15.L16.Counts.0.5.TMP$fitE02D04
t = 23.064, df = 188, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8172151 0.8926925
sample estimates:
cor
0.8595716
print(cor_test_Counts_shared_1.0_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.1.0.TMP$fitD08D03 and L15.L16.Counts.1.0.TMP$fitE03D04
t = 21.535, df = 188, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7968731 0.8801966
sample estimates:
cor
0.8435376
print(cor_test_Counts_shared_10_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.10.TMP$fitD09D03 and L15.L16.Counts.10.TMP$fitE04D04
t = 19.153, df = 187, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7593228 0.8570267
sample estimates:
cor
0.8138494
print(cor_test_Counts_shared_50_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.50.TMP$fitD10D03 and L15.L16.Counts.50.TMP$fitE05D04
t = 13.155, df = 187, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6109957 0.7607242
sample estimates:
cor
0.6932678
print(cor_test_Counts_shared_200_tmp)
Pearson's product-moment correlation
data: L15.L16.Counts.200.TMP$fitD11D03 and L15.L16.Counts.200.TMP$fitE06D04
t = 10.781, df = 172, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5370002 0.7161493
sample estimates:
cor
0.6350373
Plot median fitness correlations between Lib15 & Lib16 (perfects) for Complementation:
# Extract correlation value from cor_value_shared object
cor_value_shared_0_tmp <- cor_test_Counts_shared_0_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_0_tmp <- format(cor_test_Counts_shared_0_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.0.tmp <- nrow(L15.L16.Counts.0.TMP)
# Plot based on shared mutID
Lib15_16_0_TMP <- ggplot(L15.L16.Counts.0.TMP,
aes(x=fitD05D03, y=fitD12D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n(0 μg/mL tmp)",
y ="Codon 2 Median Fitness (LogFC) \n(0 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD05D03 >= -1 & fitD12D04 >= -1 ~ "lightblue4",
fitD05D03 >= -1 & fitD12D04 < -1 ~ "#0072B2",
fitD12D04 >= -1 & fitD05D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD05D03 >= -1 & fitD12D04 >= -1 ~ "lightblue4",
fitD05D03 >= -1 & fitD12D04 < -1 ~ "#0072B2",
fitD12D04 >= -1 & fitD05D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD05D03 >= -1 & fitD12D04 >= -1 ~ 16,
fitD05D03 >= -1 & fitD12D04 < -1 ~ 16,
fitD12D04 >= -1 & fitD05D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 2.5) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD05D03, y = fitD12D04),
fill = "red", color = "black", size = 4, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD05D03, y = fitD12D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.0.TMP$fitD05D03),
y = min(L15.L16.Counts.0.TMP$fitD12D04),
label = paste("p-value =", p_value_scientific_shared_0_tmp), hjust = 1, vjust = 0) +
annotate("text",
x = max(L15.L16.Counts.0.TMP$fitD05D03),
y = min(L15.L16.Counts.0.TMP$fitD12D04),
label = paste("Correlation =", round(cor_value_shared_0_tmp, 2)), hjust = 1, vjust = -1.5) +
annotate("text",
x = min(L15.L16.Counts.0.TMP$fitD05D03),
y = max(L15.L16.Counts.0.TMP$fitD12D04),
label = paste("Shared Assemblies =", num_rows.counts.0.tmp), hjust = 0, vjust = 1.5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.0.TMP$fitD05D03)),
ceiling(max(L15.L16.Counts.0.TMP$fitD05D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.0.TMP$fitD12D04)),
ceiling(max(L15.L16.Counts.0.TMP$fitD12D04)), by = 1))
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
Please use the `linewidth` argument instead.
Lib15_16_0_TMP_p01 <- ggMarginal(Lib15_16_0_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_0_TMP_p01
Count Complementing Homologs Unique to Codon Version: Count the number of unique homologs capable of complementation in both codon versions and in one codon version or the other.
library(dplyr)
# Count unique mutIDs for each condition
counts <- L15.L16.Counts.0.TMP %>%
summarise(
both_ge_neg1 = n_distinct(mutID[fitD05D03 >= -1 & fitD12D04 >= -1]),
D05D03_ge_neg1_D12D04_lt_neg1 = n_distinct(mutID[fitD05D03 >= -1 & fitD12D04 < -1]),
D12D04_ge_neg1_D05D03_lt_neg1 = n_distinct(mutID[fitD12D04 >= -1 & fitD05D03 < -1]),
both_lt_neg1 = n_distinct(mutID[fitD05D03 < -1 & fitD12D04 < -1])
)
# Print the results
print("Number of unique mutIDs where:")
[1] "Number of unique mutIDs where:"
print(paste("1. Both fitD05D03 and fitD12D04 are >= -1:", counts$both_ge_neg1))
[1] "1. Both fitD05D03 and fitD12D04 are >= -1: 194"
print(paste("2. fitD05D03 is >= -1 but fitD12D04 is < -1:", counts$D05D03_ge_neg1_D12D04_lt_neg1))
[1] "2. fitD05D03 is >= -1 but fitD12D04 is < -1: 77"
print(paste("3. fitD12D04 is >= -1 but fitD05D03 is < -1:", counts$D12D04_ge_neg1_D05D03_lt_neg1))
[1] "3. fitD12D04 is >= -1 but fitD05D03 is < -1: 86"
print(paste("4. Both fitD05D03 and fitD12D04 are < -1:", counts$both_lt_neg1))
[1] "4. Both fitD05D03 and fitD12D04 are < -1: 128"
# Optional: Calculate total number of unique mutIDs
total_unique_mutIDs <- n_distinct(L15.L16.Counts.0.TMP$mutID)
print(paste("Total number of unique mutIDs:", total_unique_mutIDs))
[1] "Total number of unique mutIDs: 485"
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 0.058 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_0.058_tmp <- cor_test_Counts_shared_0.058_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_0.058_tmp <- format(cor_test_Counts_shared_0.058_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.0.058.tmp <- nrow(L15.L16.Counts.0.058.TMP)
# Plot based on shared mutID
Lib15_16_0.058_TMP <- ggplot(L15.L16.Counts.0.058.TMP,
aes(x=fitD06D03, y=fitE01D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n(0.058 μg/mL tmp)", y ="Codon 2 Median Fitness (LogFC) \n(0.058 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD06D03 >= -1 & fitE01D04 >= -1 ~ "lightblue4",
fitD06D03 >= -1 & fitE01D04 < -1 ~ "#0072B2",
fitE01D04 >= -1 & fitD06D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD06D03 >= -1 & fitE01D04 >= -1 ~ "lightblue4",
fitD06D03 >= -1 & fitE01D04 < -1 ~ "#0072B2",
fitE01D04 >= -1 & fitD06D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD06D03 >= -1 & fitE01D04 >= -1 ~ 16,
fitD06D03 >= -1 & fitE01D04 < -1 ~ 16,
fitE01D04 >= -1 & fitD06D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD06D03, y = fitE01D04),
fill = "red", color = "black", size = 3, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD06D03, y = fitE01D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.0.058.TMP$fitD06D03),
y = min(L15.L16.Counts.0.058.TMP$fitE01D04),
label = paste("p-value =", p_value_scientific_shared_0.058_tmp), hjust = 1, vjust = 0) +
annotate("text",
x = max(L15.L16.Counts.0.058.TMP$fitD06D03),
y = min(L15.L16.Counts.0.058.TMP$fitE01D04),
label = paste("Correlation =", round(cor_value_shared_0.058_tmp, 2)), hjust = 1, vjust = -1.5) +
annotate("text",
x = min(L15.L16.Counts.0.058.TMP$fitD06D03),
y = max(L15.L16.Counts.0.058.TMP$fitE01D04),
label = paste("Shared Assemblies =", num_rows.counts.0.058.tmp), hjust = 0, vjust = 1.5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.0.058.TMP$fitD06D03)),
ceiling(max(L15.L16.Counts.0.058.TMP$fitD06D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.0.058.TMP$fitE01D04)),
ceiling(max(L15.L16.Counts.0.058.TMP$fitE01D04)), by = 1))
Lib15_16_0.058_TMP_p01 <- ggMarginal(Lib15_16_0.058_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_0.058_TMP_p01
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 0.5 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_0.5_tmp <- cor_test_Counts_shared_0.5_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_0.5_tmp <- format(cor_test_Counts_shared_0.5_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.0.5.tmp <- nrow(L15.L16.Counts.0.5.TMP)
# Plot based on shared mutID
Lib15_16_0.5_TMP <- ggplot(L15.L16.Counts.0.5.TMP,
aes(x=fitD07D03, y=fitE02D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \nMIC (0.5 μg/mL tmp)", y ="Codon 2 Median Fitness (LogFC) \nMIC (0.5 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD07D03 >= -1 & fitE02D04 >= -1 ~ "lightblue4",
fitD07D03 >= -1 & fitE02D04 < -1 ~ "#0072B2",
fitE02D04 >= -1 & fitD07D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD07D03 >= -1 & fitE02D04 >= -1 ~ "lightblue4",
fitD07D03 >= -1 & fitE02D04 < -1 ~ "#0072B2",
fitE02D04 >= -1 & fitD07D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD07D03 >= -1 & fitE02D04 >= -1 ~ 16,
fitD07D03 >= -1 & fitE02D04 < -1 ~ 16,
fitE02D04 >= -1 & fitD07D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD07D03, y = fitE02D04),
fill = "red", color = "black", size = 4, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD07D03, y = fitE02D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.0.5.TMP$fitD07D03),
y = min(L15.L16.Counts.0.5.TMP$fitE02D04),
label = paste("p-value =", p_value_scientific_shared_0.5_tmp), hjust = 1, vjust = 0, size = 5) +
annotate("text",
x = max(L15.L16.Counts.0.5.TMP$fitD07D03),
y = min(L15.L16.Counts.0.5.TMP$fitE02D04),
label = paste("Correlation =", round(cor_value_shared_0.5_tmp, 2)), hjust = 1, vjust = -1.5, size = 5) +
annotate("text",
x = min(L15.L16.Counts.0.5.TMP$fitD07D03),
y = max(L15.L16.Counts.0.5.TMP$fitE02D04),
label = paste("Shared Assemblies =", num_rows.counts.0.5.tmp), hjust = 0, vjust = 1.5, size = 5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.0.5.TMP$fitD07D03)),
ceiling(max(L15.L16.Counts.0.5.TMP$fitD07D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.0.5.TMP$fitE02D04)),
ceiling(max(L15.L16.Counts.0.5.TMP$fitE02D04)), by = 1))
Lib15_16_0.5_TMP_p01 <- ggMarginal(Lib15_16_0.5_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_0.5_TMP_p01
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 1.0 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_1.0_tmp <- cor_test_Counts_shared_1.0_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_1.0_tmp <- format(cor_test_Counts_shared_1.0_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.1.0.tmp <- nrow(L15.L16.Counts.1.0.TMP)
# Plot based on shared mutID
Lib15_16_1.0_TMP <- ggplot(L15.L16.Counts.1.0.TMP,
aes(x=fitD08D03, y=fitE03D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n(1.0 μg/mL tmp)", y ="Codon 2 Median Fitness (LogFC) \n(1.0 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD08D03 >= -1 & fitE03D04 >= -1 ~ "lightblue4",
fitD08D03 >= -1 & fitE03D04 < -1 ~ "#0072B2",
fitE03D04 >= -1 & fitD08D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD08D03 >= -1 & fitE03D04 >= -1 ~ "lightblue4",
fitD08D03 >= -1 & fitE03D04 < -1 ~ "#0072B2",
fitE03D04 >= -1 & fitD08D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD08D03 >= -1 & fitE03D04 >= -1 ~ 16,
fitD08D03 >= -1 & fitE03D04 < -1 ~ 16,
fitE03D04 >= -1 & fitD08D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD08D03, y = fitE03D04),
fill = "red", color = "black", size = 3, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD08D03, y = fitE03D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.1.0.TMP$fitD08D03),
y = min(L15.L16.Counts.1.0.TMP$fitE03D04),
label = paste("p-value =", p_value_scientific_shared_1.0_tmp), hjust = 1, vjust = 0) +
annotate("text",
x = max(L15.L16.Counts.1.0.TMP$fitD08D03),
y = min(L15.L16.Counts.1.0.TMP$fitE03D04),
label = paste("Correlation =", round(cor_value_shared_1.0_tmp, 2)), hjust = 1, vjust = -1.5) +
annotate("text",
x = min(L15.L16.Counts.1.0.TMP$fitD08D03),
y = max(L15.L16.Counts.1.0.TMP$fitE03D04),
label = paste("Shared Assemblies =", num_rows.counts.1.0.tmp), hjust = 0, vjust = 1.5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.1.0.TMP$fitD08D03)),
ceiling(max(L15.L16.Counts.1.0.TMP$fitD08D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.1.0.TMP$fitE03D04)),
ceiling(max(L15.L16.Counts.1.0.TMP$fitE03D04)), by = 1))
Lib15_16_1.0_TMP_p01 <- ggMarginal(Lib15_16_1.0_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_1.0_TMP_p01
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 10 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_10_tmp <- cor_test_Counts_shared_10_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_10_tmp <- format(cor_test_Counts_shared_10_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.10.tmp <- nrow(L15.L16.Counts.10.TMP)
# Plot based on shared mutID
Lib15_16_10_TMP <- ggplot(L15.L16.Counts.10.TMP,
aes(x=fitD09D03, y=fitE04D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n(10 μg/mL tmp)", y ="Codon 2 Median Fitness (LogFC) \n(10 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD09D03 >= -1 & fitE04D04 >= -1 ~ "lightblue4",
fitD09D03 >= -1 & fitE04D04 < -1 ~ "#0072B2",
fitE04D04 >= -1 & fitD09D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD09D03 >= -1 & fitE04D04 >= -1 ~ "lightblue4",
fitD09D03 >= -1 & fitE04D04 < -1 ~ "#0072B2",
fitE04D04 >= -1 & fitD09D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD09D03 >= -1 & fitE04D04 >= -1 ~ 16,
fitD09D03 >= -1 & fitE04D04 < -1 ~ 16,
fitE04D04 >= -1 & fitD09D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD09D03, y = fitE04D04),
fill = "red", color = "black", size = 3, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD09D03, y = fitE04D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.10.TMP$fitD09D03),
y = min(L15.L16.Counts.10.TMP$fitE04D04),
label = paste("p-value =", p_value_scientific_shared_10_tmp), hjust = 1, vjust = 0) +
annotate("text",
x = max(L15.L16.Counts.10.TMP$fitD09D03),
y = min(L15.L16.Counts.10.TMP$fitE04D04),
label = paste("Correlation =", round(cor_value_shared_10_tmp, 2)), hjust = 1, vjust = -1.5) +
annotate("text",
x = min(L15.L16.Counts.10.TMP$fitD09D03),
y = max(L15.L16.Counts.10.TMP$fitE04D04),
label = paste("Shared Assemblies =", num_rows.counts.10.tmp), hjust = 0, vjust = 1.5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.10.TMP$fitD09D03)),
ceiling(max(L15.L16.Counts.10.TMP$fitD09D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.10.TMP$fitE04D04)),
ceiling(max(L15.L16.Counts.10.TMP$fitE04D04)), by = 1))
Lib15_16_10_TMP_p01 <- ggMarginal(Lib15_16_10_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_10_TMP_p01
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 50 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_50_tmp <- cor_test_Counts_shared_50_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_50_tmp <- format(cor_test_Counts_shared_50_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.50.tmp <- nrow(L15.L16.Counts.50.TMP)
# Plot based on shared mutID
Lib15_16_50_TMP <- ggplot(L15.L16.Counts.50.TMP,
aes(x=fitD10D03, y=fitE05D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n(50 μg/mL tmp)", y ="Codon 2 Median Fitness (LogFC) \n(50 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD10D03 >= -1 & fitE05D04 >= -1 ~ "lightblue4",
fitD10D03 >= -1 & fitE05D04 < -1 ~ "#0072B2",
fitE05D04 >= -1 & fitD10D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD10D03 >= -1 & fitE05D04 >= -1 ~ "lightblue4",
fitD10D03 >= -1 & fitE05D04 < -1 ~ "#0072B2",
fitE05D04 >= -1 & fitD10D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD10D03 >= -1 & fitE05D04 >= -1 ~ 16,
fitD10D03 >= -1 & fitE05D04 < -1 ~ 16,
fitE05D04 >= -1 & fitD10D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD10D03, y = fitE05D04),
fill = "red", color = "black", size = 3, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD10D03, y = fitE05D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.50.TMP$fitD10D03),
y = min(L15.L16.Counts.50.TMP$fitE05D04),
label = paste("p-value =", p_value_scientific_shared_50_tmp), hjust = 1, vjust = 0) +
annotate("text",
x = max(L15.L16.Counts.50.TMP$fitD10D03),
y = min(L15.L16.Counts.50.TMP$fitE05D04),
label = paste("Correlation =", round(cor_value_shared_50_tmp, 2)), hjust = 1, vjust = -1.5) +
annotate("text",
x = min(L15.L16.Counts.50.TMP$fitD10D03),
y = max(L15.L16.Counts.50.TMP$fitE05D04),
label = paste("Shared Assemblies =", num_rows.counts.50.tmp), hjust = 0, vjust = 1.5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.50.TMP$fitD10D03)),
ceiling(max(L15.L16.Counts.50.TMP$fitD10D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.50.TMP$fitE05D04)),
ceiling(max(L15.L16.Counts.50.TMP$fitE05D04)), by = 1))
Lib15_16_50_TMP_p01 <- ggMarginal(Lib15_16_50_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_50_TMP_p01
Plot shared fitness correlations between Lib15 & Lib16 (perfects) for 200 TMP:
# Extract correlation value from cor_value_shared object
cor_value_shared_200_tmp <- cor_test_Counts_shared_200_tmp$estimate
# Format p-value in scientific notation
p_value_scientific_shared_200_tmp <- format(cor_test_Counts_shared_200_tmp$p.value, scientific = TRUE, digits = 4)
# Extract number of rows
num_rows.counts.200.tmp <- nrow(L15.L16.Counts.200.TMP)
# Plot based on shared mutID
Lib15_16_200_TMP <- ggplot(L15.L16.Counts.200.TMP,
aes(x=fitD11D03, y=fitE06D04)) +
labs(x = "Codon 1 Median Fitness (LogFC) \n400x MIC (200 μg/mL tmp)",
y ="Codon 2 Median Fitness (LogFC) \n400x MIC (200 μg/mL tmp)") +
geom_smooth(method=lm,colour="black") +
geom_density2d(colour="black",alpha=0.2) +
geom_point(aes(color = case_when(
fitD11D03 >= -1 & fitE06D04 >= -1 ~ "lightblue4",
fitD11D03 >= -1 & fitE06D04 < -1 ~ "#0072B2",
fitE06D04 >= -1 & fitD11D03 < -1 ~ "#E69F00",
TRUE ~ "black"
),
fill = case_when(
fitD11D03 >= -1 & fitE06D04 >= -1 ~ "lightblue4",
fitD11D03 >= -1 & fitE06D04 < -1 ~ "#0072B2",
fitE06D04 >= -1 & fitD11D03 < -1 ~ "#E69F00",
TRUE ~ "white"
),
shape = case_when(
fitD11D03 >= -1 & fitE06D04 >= -1 ~ 16,
fitD11D03 >= -1 & fitE06D04 < -1 ~ 16,
fitE06D04 >= -1 & fitD11D03 < -1 ~ 16,
TRUE ~ 21
)),
alpha = 0.75, size = 4) +
scale_shape_identity() +
scale_color_identity() +
scale_fill_identity() +
# Add a new point for WT E. coli median fitness
geom_point(data = BCcontrols_15_16_shared_median_WT,
aes(x = fitD11D03, y = fitE06D04),
fill = "red", color = "black", size = 4, shape = 24) +
# Add a new point for Neg Ctrl (D27N, mCherry) median fitness
geom_point(data = BCcontrols_15_16_shared_median_Neg,
aes(x = fitD11D03, y = fitE06D04),
color = "black", size = 5, shape = 18) +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
panel.background = element_blank()) + #,
#panel.border = element_rect(colour = "black", fill=NA, size=1.0)) +
annotate("text",
x = max(L15.L16.Counts.200.TMP$fitD11D03),
y = min(L15.L16.Counts.200.TMP$fitE06D04),
label = paste("p-value =", p_value_scientific_shared_200_tmp), hjust = 1, vjust = 0, size = 5) +
annotate("text",
x = max(L15.L16.Counts.200.TMP$fitD11D03),
y = min(L15.L16.Counts.200.TMP$fitE06D04),
label = paste("Correlation =", round(cor_value_shared_200_tmp, 2)), hjust = 1, vjust = -1.5, size = 5) +
annotate("text",
x = min(L15.L16.Counts.200.TMP$fitD11D03),
y = max(L15.L16.Counts.200.TMP$fitE06D04),
label = paste("Shared Assemblies =", num_rows.counts.200.tmp), hjust = 0, vjust = 1.5, size = 5) +
scale_x_continuous(breaks = seq(floor(min(L15.L16.Counts.200.TMP$fitD11D03)),
ceiling(max(L15.L16.Counts.200.TMP$fitD11D03)), by = 1)) +
scale_y_continuous(breaks = seq(floor(min(L15.L16.Counts.200.TMP$fitE06D04)),
ceiling(max(L15.L16.Counts.200.TMP$fitE06D04)), by = 1))
Lib15_16_200_TMP_p01 <- ggMarginal(Lib15_16_200_TMP, type = "histogram", fill = "lightblue4", alpha = 0.5)
`geom_smooth()` using formula = 'y ~ x'`geom_smooth()` using formula = 'y ~ x'
Lib15_16_200_TMP_p01
The following section plots various summarizes of the Perfects BCs (minimum 5BC count/mutID):
Re-order the “perfects” dataset by “numprunedBCs” in ascending order:
# Rank order based on numprunedBCs in Lib15
perfects_shared.rank.15 <- perfects_15_16_shared %>%
arrange(numprunedBCs.x)
# Rank order based on numprunedBCs in Lib16
perfects_shared.rank.16 <- perfects_15_16_shared %>%
arrange(numprunedBCs.y)
Make plotting key based on “numprunedBCs”
# Plotting key based on numprunedBCs in Lib15
perfects_shared.rank.15$numprunedBCs.x.key <- 1:length(perfects_shared.rank.15$mutID)
# Plotting key based on numprunedBCs in Lib16
perfects_shared.rank.16$numprunedBCs.y.key <- 1:length(perfects_shared.rank.16$mutID)
Plot the number of perfects barcodes recovered based on the rank order of homologs:
Shared mutIDs
# Rename columns in one of the data frames
colnames(perfects_shared.rank.16) <- paste0(colnames(perfects_shared.rank.16), "_16")
# Combine the data frames
perfects_shared.rank.15.16 <- cbind(perfects_shared.rank.15, perfects_shared.rank.16)
# Create the plot
perfects_shared.rank.15.16_plot <- ggplot(perfects_shared.rank.15.16, aes(x = numprunedBCs.x.key)) +
geom_point(aes(y = numprunedBCs.x, color = "Lib15"), size = 2) +
geom_point(aes(y = numprunedBCs.y_16, color = "Lib16"), size = 2) +
scale_color_manual(name = "Library", values = c("Lib15" = "#0072B2", "Lib16" = "#E69F00")) +
ylab("Number of Barcodes") +
xlab("Rank Ordered Homolog") +
scale_y_log10(limits = c(1, 2000)) +
theme_minimal() +
theme(axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "none")
perfects_shared.rank.15.16_plot
Plot the Perfects counts based on fitness values between D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16):
rank_histogram_1BC_15_16 <- ggplot(perfects_shared.rank.15.16, aes(x = fitD05D03, fill = "Lib15")) +
geom_histogram(binwidth = 0.1, alpha = 0.5, color = "black") +
geom_histogram(aes(x = fitD12D04, fill = "Lib16"), binwidth = 0.1, alpha = 0.5, color = "black") +
xlab("Median Fitness \n(Log2 Fold Change)") +
ylab("Counts") +
ggtitle("Complementation Rank Count \n(Perfects >1 BC)") +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "none") +
scale_x_continuous(limits = c(-7, 3)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 50)) +
scale_fill_manual(values = c("Lib15" = "#0072B2", "Lib16" = "#E69F00"), name = "Library")
rank_histogram_1BC_15_16
Summarize the total number of unique Perfects regardless of fitness value:
# Lib15
perfects15 %>%
nrow(.)
[1] 961
# Lib16
perfects16 %>%
nrow(.)
[1] 818
# Shared Perfects
perfects_15_16_shared %>%
nrow(.)
[1] 643
Summarize the number of Perfects with fitness greater than -1.0 in D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) conditions:
# Shared Perfects - Lib15
perfects_15_16_shared %>% filter(fitD05D03>-1.0) %>%
nrow(.)
[1] 335
# Shared Perfects - Lib16
perfects_15_16_shared %>% filter(fitD12D04>-1.0) %>%
nrow(.)
[1] 350
Plot the Perfects counts based on fitness values between D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) at > 2 BCs:
rank_histogram_2BC_15_16 <- ggplot(perfects_15_16_shared_2BCs, aes(x = fitD05D03, fill = "Lib15")) +
geom_histogram(binwidth = 0.1, alpha = 0.5, color = "black") +
geom_histogram(aes(x = fitD12D04, fill = "Lib16"), binwidth = 0.1, alpha = 0.5, color = "black") +
xlab("Fitness (Log2 Fold Change)") +
ylab("Counts") +
ggtitle("Complementation Rank Count \n(Perfects >2 BC)") +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "none") +
scale_x_continuous(limits = c(-7, 3)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 50)) +
scale_fill_manual(values = c("Lib15" = "#0072B2", "Lib16" = "#E69F00"), name = "Library")
rank_histogram_2BC_15_16
Summarize the total number of unique Perfects (>2 BCs) regardless of fitness value:
# Shared Perfects
perfects_15_16_shared_2BCs %>%
nrow(.)
[1] 579
Summarize the number of Perfects (>2 BCs) with fitness greater than -1.0 in D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) conditions:
# Lib15
perfects_15_16_shared_2BCs %>% filter(fitD05D03>-1.0) %>%
nrow(.)
[1] 306
# Lib16
perfects_15_16_shared_2BCs %>% filter(fitD12D04>-1.0) %>%
nrow(.)
[1] 322
Plot the Perfects counts based on fitness values between D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) at > 5 BCs:
rank_histogram_5BC_15_16 <- ggplot(perfects_15_16_shared_5BCs, aes(x = fitD05D03, fill = "Lib15")) +
geom_histogram(binwidth = 0.1, alpha = 0.5, color = "black") +
geom_histogram(aes(x = fitD12D04, fill = "Lib16"), binwidth = 0.1, alpha = 0.5, color = "black") +
xlab("Fitness (Log2 Fold Change)") +
ylab("Counts") +
ggtitle("Complementation Rank Count \n(Perfects >5 BC)") +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "bottom") +
scale_x_continuous(limits = c(-7, 3)) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 50)) +
scale_fill_manual(values = c("Lib15" = "#0072B2", "Lib16" = "#E69F00"), name = "Library")
rank_histogram_5BC_15_16
Summarize the total number of unique Perfects (>2 BCs) regardless of fitness value:
# Shared Perfects
perfects_15_16_shared_5BCs %>%
nrow(.)
[1] 493
Summarize the number of Perfects (>2 BCs) with fitness greater than -1.0 in D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) conditions:
# Lib15
perfects_15_16_shared_5BCs %>% filter(fitD05D03>-1.0) %>%
nrow(.)
[1] 273
# Lib16
perfects_15_16_shared_5BCs %>% filter(fitD12D04>-1.0) %>%
nrow(.)
[1] 280
Plot all minimum BC levels together
Plot the Perfects fitness scores for D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) based on the number of recovered barcodes:
BC_scatter_5BC_15_16 <- ggplot() +
geom_point(data = perfects_15_16_shared_5BCs_good, aes(x = numprunedBCs.x, y = fitD05D03), color = "#0072B2", alpha = 0.8) +
geom_point(data = perfects_15_16_shared_5BCs_good, aes(x = numprunedBCs.y, y = fitD12D04), color = "#E69F00", alpha = 0.8) +
xlab("Number of Barcodes") +
ylab("Fitness (Log2 Fold Change)") +
ggtitle("Barcode Counts \n(Perfects >5 BC)") +
scale_x_log10() +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
BC_scatter_5BC_15_16
Gather the median fitness of Perfects at each Trimethoprim concentration based on filtered 1BC dataset.
Library 15
# Filter perfects_15_16 dataset by minimum of numprunedBCs
perfects15_5BCsdr <- perfects_15_16_shared_5BCs_good %>%
select(mutID, fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03, numprunedBCs.x) %>%#
gather("selection","fitness",-mutID,-numprunedBCs.x)
# Create naming factors to convert column names into trimethoprim concentrations for plotting
perfects15_5BCsdr$tmpfactor <- "0"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD05D03"] <- "0"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD06D03"] <- "0.0058"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD07D03"] <- "0.5"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD08D03"] <- "1"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD09D03"] <- "10"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD10D03"] <- "50"
perfects15_5BCsdr$tmpfactor[perfects15_5BCsdr$selection== "fitD11D03"] <- "200"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD05D03"] <- "A"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD06D03"] <- "B"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD07D03"] <- "C"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD08D03"] <- "D"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD09D03"] <- "E"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD10D03"] <- "F"
perfects15_5BCsdr$tmpname[perfects15_5BCsdr$selection== "fitD11D03"] <- "G"
perfects15_5BCsdrlabs <- c("0","0.058","0.5","1","10","50","200")
TMP_plot_5BC_15 <- ggplot(perfects15_5BCsdr,aes(x=tmpname,y=fitness)) +
geom_violin(fill = "#0072B2", alpha=0.75) +
geom_boxplot(width=0.1) +
xlab("Trimethoprim (ug/mL)") +
ylab("Median Fitness") +
scale_x_discrete(labels= perfects15_5BCsdrlabs) +
ggtitle("Library 15 Dose Response \n(Perfects >5 BC)") +
theme_minimal() +
theme(axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = c(0.16, 1),
legend.justification = c(1, 1),
legend.box.background = element_rect(colour = "black")) +
scale_y_continuous(expand = c(0, 0), limits = c(-12,8))
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
Please use the `legend.position.inside` argument of `theme()` instead.
TMP_plot_5BC_15
# Filter perfects16_5BCs dataset by minimum of numprunedBCs
perfects16_5BCsdr <- perfects_15_16_shared_5BCs_good %>%
#filter(numprunedBCs>10) %>%
select(mutID, fitD12D04, fitE01D04, fitE02D04, fitE03D04, fitE04D04, fitE05D04, fitE06D04, numprunedBCs.y) %>%#
gather("selection","fitness",-mutID,-numprunedBCs.y)
# Create naming factors to convert column names into trimethoprim concentrations for plotting
perfects16_5BCsdr$tmpfactor <- "0"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitD12D04"] <- "0"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE01D04"] <- "0.0058"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE02D04"] <- "0.5"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE03D04"] <- "1"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE04D04"] <- "10"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE05D04"] <- "50"
perfects16_5BCsdr$tmpfactor[perfects16_5BCsdr$selection== "fitE06D04"] <- "200"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitD12D04"] <- "A"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE01D04"] <- "B"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE02D04"] <- "C"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE03D04"] <- "D"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE04D04"] <- "E"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE05D04"] <- "F"
perfects16_5BCsdr$tmpname[perfects16_5BCsdr$selection== "fitE06D04"] <- "G"
perfects16_5BCsdrlabs <- c("0","0.058","0.5","1","10","50","200")
Lib16 Violin Plot: based on filtered 5BC dataset:
TMP_plot_5BC_16 <- ggplot(perfects16_5BCsdr,aes(x=tmpname,y=fitness)) +
geom_violin(fill = "#E69F00") +
geom_boxplot(width=0.1) +
xlab("Trimethoprim (ug/mL)") +
ylab("Median Fitness") +
scale_x_discrete(labels= perfects16_5BCsdrlabs) +
ggtitle("Library 16 Dose Response \n(Perfects >5 BC)") +
theme_minimal() +
theme(axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = c(0.16, 1),
legend.justification = c(1, 1),
legend.box.background = element_rect(colour = "black")) +
scale_y_continuous(expand = c(0, 0), limits = c(-12,8))
TMP_plot_5BC_16
Combine the two data frames:
perfects_15_16_5BCsdr <- bind_rows(
perfects15_5BCsdr %>% mutate(Lib = "Lib15"), # Values derived from Lib15 perfects_15_16_shared_5BCs
perfects16_5BCsdr %>% mutate(Lib = "Lib16"), # Values derived from Lib16 perfects_15_16_shared_5BCs
.id = "id")
perfects_15_16_5BCsdrlabs <- c("0","0.058","0.5","1","10","50","200")
Plot the median fitness of Perfects at each Trimethoprim concentration for Day 1 based on filtered 5BC dataset:
perfects_15_16_5BCsdr_plot <- ggplot(perfects_15_16_5BCsdr, aes(x = tmpname, y = fitness, fill = Lib)) +
#geom_hline(yintercept = 0, linetype = "dashed", color = "black", size = 0.5) +
geom_boxplot(position = "dodge", alpha=0.8) +
xlab("Trimethoprim (ug/mL)") +
ylab(expression(paste("Median Fitness (LogFC)"))) +
scale_x_discrete(labels = perfects_15_16_5BCsdrlabs) +
#ggtitle("Median Fitness \nPerfects (>5BCs)") +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 1.0),
axis.ticks = element_line(colour = "black", size = 1.0),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "bottom") +
scale_y_continuous(expand = c(0, 0), limits = c(-12, 8)) +
scale_fill_manual(values = c("Lib15" = "#0072B2", "Lib16" = "#E69F00"),
labels = c("Codon1", "Codon2"))
# Display the plot
perfects_15_16_5BCsdr_plot
Load in the organism dataset associated with DHFR genes for mapping taxonomy to the phylogenetic tree.
#load in all the organisms info for unique BCs associated with DHFR genes:
orginfo = read.csv("Perfects/INPUT/DHFR_chip_3_proteins_addedseveral.csv", head=TRUE) # read csv file
Subset the dataset:
orginfo <- orginfo %>%
select(-Lib.codon1, -Lib.codon2, -Construct., -Barcode) %>%
dplyr::rename(IDfull=Accession) %>%
#mutate(ID=strsplit(IDfull, split = ".")[1])
tidyr::separate(IDfull, c("ID", "ver"), "\\.",remove = FALSE) %>%
select(-ver)
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 15 rows [4834, 4835, 5690, 5691, 5692, 5693, 5694, 5695, 5696, 5697, 5698, 5699, 5700, 5701, 5712].
names(orginfo)
[1] "IDfull" "ID" "PctIdentEcoli" "Source" "Definition" "TaxID" "Taxa1"
[8] "Taxa2" "Taxa3" "Taxa4" "Sequence"
Add the organism info to the perfects info as a new object called “perfects_tree”:
# Make a copy of "mutID" column called "ID" for matching with orginfo object:
perfects_15_16_shared_5BCs <- perfects_15_16_shared_5BCs %>%
mutate(ID = mutID)
# Add orginfo to perfects_5BCs_tree
perfects_15_16_5BCs_tree <- right_join(orginfo,perfects_15_16_shared_5BCs,by="ID") %>%
select(-IDfull,-ID.x,-seq.x,-pct_ident.x,-ID.y,-seq.y,-pct_ident.y)
Plot the Perfects fitness scores for D05 vs. D03 (Lib15) or D12 vs. D04 (Lib16) based on percent similarity of the each variant’s gene sequences to the E. coli DHFR gene sequence:
perfects_15_16_5BCs_scatter_plot <- ggplot() +
geom_point(data = perfects_15_16_5BCs_tree, aes(x = PctIdentEcoli, y = fitD05D03), color = "#0072B2", alpha = 0.8) +
geom_point(data = perfects_15_16_5BCs_tree, aes(x = PctIdentEcoli, y = fitD12D04), color = "#E69F00", alpha = 0.8) +
xlab("Percent Similarity to E. coli DHFR") +
ylab("Fitness (Log2 Fold Change)") +
ggtitle("Median Fitness based on \n% Similarity to E. coli") +
scale_x_log10() +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) #+
#scale_y_continuous(expand = c(0, 0), limits = c(-7, 1))
perfects_15_16_5BCs_scatter_plot
Bin the % Similarity values in 10% increments and replot as boxplots:
# Convert data to long format
data_long <- perfects_15_16_5BCs_tree %>%
pivot_longer(cols = c(fitD05D03, fitD12D04), names_to = "variable", values_to = "value")
# Define new x-axis labels
new_x_labels <- c("30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%")
perfects_15_16_5BCs_binned_plot <- ggplot(data_long, aes(x = cut(PctIdentEcoli, breaks = seq(0, 1.0, by = 0.1)), y = value, fill = variable)) +
geom_boxplot(position = "dodge", alpha = 0.8) +
geom_hline(yintercept = -1, linetype = "dashed", color = "red") +
xlab("Percent Similarity to E. coli DHFR (Binned)") +
ylab("Median Fitness \n(Log2 Fold Change)") +
ggtitle("Percent Similarity to E. coli \n(Perfects >5 BC)") +
scale_x_discrete(labels = new_x_labels) +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 12),
legend.title = element_blank(),
legend.position = "bottom") +
scale_fill_manual(values = c("#0072B2", "#E69F00"), labels = c("Codon 1", "Codon 2"))
perfects_15_16_5BCs_binned_plot
Calculate the median percent similarity to E. coli across all homologs (5 BCs):
# Make a copy of "mutID" column called "ID" for matching with orginfo object:
perfects15_5BCs_PctIdentEcoli <- perfects15_5BCs
# Add orginfo to perfects_5BCs_tree
perfects15_5BCs_tree <- right_join(orginfo,perfects15_5BCs_PctIdentEcoli,by="ID") #%>%
#select(-IDfull,-ID.x,-seq.x,-pct_ident.x,-ID.y,-seq.y,-pct_ident.y)
# Calculate mean and median PctIdentEcoli value across all homologs for Lib15
mean(perfects15_5BCs_tree$PctIdentEcoli)
[1] 0.4831003
median(perfects15_5BCs_tree$PctIdentEcoli)
[1] 0.4567901
This section uses the library(ggridges)
package.
Subset the perfects_tree_5BCs object to retain only “ID” and fitness scores for select conditions. This code will transpose the individual fitness score columns into two columns with the fitness label and values, respectively, for each unique ID.
# Lib15
perfects15_tree_5BCs_d1 <- perfects_15_16_5BCs_tree %>%
select(ID, fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03) %>%
pivot_longer(!ID, names_to = "fc", values_to = "val") %>%
mutate(TMP = case_when(
fc == "fitD05D03" ~ "0-TMP",
fc == "fitD06D03" ~ "0.058-TMP",
fc == "fitD07D03" ~ "0.5-TMP",
fc == "fitD08D03" ~ "1.0-TMP",
fc == "fitD09D03" ~ "10-TMP",
fc == "fitD10D03" ~ "50-TMP",
fc == "fitD11D03" ~ "200-TMP",
TRUE ~ NA_character_))
# Lib16
perfects16_tree_5BCs_d1 <- perfects_15_16_5BCs_tree %>%
select(ID, fitD12D04, fitE01D04, fitE02D04, fitE03D04, fitE04D04, fitE05D04, fitE06D04) %>%
pivot_longer(!ID, names_to = "fc", values_to = "val") %>%
mutate(TMP = case_when(
fc == "fitD12D04" ~ "0-TMP",
fc == "fitE01D04" ~ "0.058-TMP",
fc == "fitE02D04" ~ "0.5-TMP",
fc == "fitE03D04" ~ "1.0-TMP",
fc == "fitE04D04" ~ "10-TMP",
fc == "fitE05D04" ~ "50-TMP",
fc == "fitE06D04" ~ "200-TMP",
TRUE ~ NA_character_))
# Combine the two data frames
perfects_15_16_tree_5BCs_d1 <- bind_rows(
perfects15_tree_5BCs_d1 %>% mutate(Lib = "Lib15"),
perfects16_tree_5BCs_d1 %>% mutate(Lib = "Lib16"),
.id = "id")
Plot Perfects fitness scores based on Supplementation treatment for first sampling time point
perfects_15_16_tree_5BCs_d1_order <- c("200-TMP", "50-TMP", "10-TMP", "1.0-TMP", "0.5-TMP", "0.058-TMP", "0-TMP")
# Rename the levels in the Lib column
perfects_15_16_tree_5BCs_d1$Lib <- factor(perfects_15_16_tree_5BCs_d1$Lib,
levels = c("Lib15", "Lib16"),
labels = c("Codon1", "Codon2"))
tmp_ridges_15_16_d1 <- ggplot(perfects_15_16_tree_5BCs_d1, aes(x = val, y = factor(TMP, level = perfects_15_16_tree_5BCs_d1_order), fill = Lib)) +
geom_density_ridges(alpha = 0.7) +
scale_y_discrete(labels = c('200 μg/mL TMP', '50 μg/mL TMP', '10 μg/mL TMP', '1 μg/mL TMP', '0.5 μg/mL TMP', '0.058 μg/mL TMP', 'Complementation')) +
xlab("Median Fitness (LogFC)") +
ylab("Selection Condition (ug/mL TMP)") +
#ggtitle("Dose Response \nDay 1") +
theme_minimal() +
theme(
axis.line = element_line(colour = 'black', size = 0.5),
axis.ticks = element_line(colour = "black", size = 0.5),
plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 12),
legend.title = element_blank()) +
scale_x_continuous(limits = c(-15, 10)) +
scale_fill_manual(values = c("Codon1" = "#0072B2", "Codon2" = "#E69F00"), name = "Library")
# Display the plot
tmp_ridges_15_16_d1
patch6 <- (perfects_15_16_5BCs_binned_plot / perfects_15_16_5BCsdr_plot) | (tmp_ridges_15_16_d1)
patch6
Plot the mapped perfects for each library and when the shared perfects are combined with the unique perfects:
# Create the data frame
mapped.data <- data.frame(
Library = c("Codon1", "Codon2", "Both"),
Perfects = c(1048, 904, 1208),
Percent = c(68.2, 58.9, 78.7)
)
# Create a ggplot object
L15.L16.1208.Mapped.Barchart <- ggplot(mapped.data) +
# Add a horizontal line at y=0
geom_hline(yintercept = 0, color = "black") +
# Wider bar for "Both" plotted behind
geom_bar(data = subset(mapped.data, Library == "Both"),
aes(x = 1.5, y = Perfects, fill = Library),
stat = "identity",
width = 0.75,
alpha = 0.5) +
# Bar plot for Codon1 and Codon2
geom_bar(data = subset(mapped.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Perfects, fill = Library),
stat = "identity",
width = 0.6) +
scale_x_continuous(breaks = c(1, 2), labels = c("Codon1", "Codon2"),
limits = c(0.5, 2.5)) +
scale_y_continuous(
name = "Assemblies Represented \n(1536 Designs)",
limits = c(0, 1536),
expand = c(0, 0), # Remove padding
sec.axis = sec_axis(~ . * 100 / 1536, name = "Library Representation (%)", breaks = seq(0, 100, by = 20))
) +
# Add an invisible layer to ensure the secondary y-axis is plotted
geom_point(data = subset(mapped.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Percent * 1536 / 100),
color = NA) +
theme_minimal() +
theme(
axis.title.y.right = element_text(color = "red", size = 14),
axis.text.y.right = element_text(color = "red", size = 12),
axis.ticks.y.right = element_line(color = "red"),
axis.title.x = element_text(size = 14),
axis.text.x = element_blank(),
axis.title.y = element_text(size = 14),
axis.text.y = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 12),
legend.position = "none",
axis.line = element_line(color = "black", size=1.0),
axis.ticks = element_line(color = "black", size=1.0),
axis.line.x.top = element_blank(),
axis.ticks.y.left = element_line(color = "black"),
axis.ticks.length = unit(0.2, "cm")
) +
scale_fill_manual(values = c("Codon1" = "#0072B2", "Codon2" = "#E69F00", "Both" = "lightblue4"),
breaks = c("Codon1", "Codon2", "Both")) +
labs(x = "Mapped \nAssemblies") +
coord_cartesian(xlim = c(0.5, 2.5), ylim = c(0, 1536)) +
guides(fill = guide_legend(override.aes = list(alpha = 1)))
# Display the plot
print(L15.L16.1208.Mapped.Barchart)
Plot the perfects recovered in the KO model for each library and the combined perfects (shared perfects + unique perfects):
# Create the data frame
recovered.data <- data.frame(
Library = c("Codon1", "Codon2", "Both"),
Perfects = c(961, 818, 1136),
Percent = c(80, 68, 94)
)
# Create a ggplot object
L15.L16.1150.Recovered.Barchart <- ggplot(recovered.data) +
# Add a horizontal line at y=0
geom_hline(yintercept = 0, color = "black") +
# Wider bar for "Both" plotted behind
geom_bar(data = subset(recovered.data, Library == "Both"),
aes(x = 1.5, y = Perfects, fill = Library),
stat = "identity",
width = 0.75,
alpha = 0.5) +
# Bar plot for Codon1 and Codon2
geom_bar(data = subset(recovered.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Perfects, fill = Library),
stat = "identity",
width = 0.6) +
scale_x_continuous(breaks = c(1, 2), labels = c("Codon1", "Codon2"),
limits = c(0.5, 2.5)) +
scale_y_continuous(
name = "Homologs Represented \n(1208 Assembled)",
limits = c(0, 1208),
expand = c(0, 0), # Remove padding
sec.axis = sec_axis(~ . * 100 / 1208, name = "Library Representation (%)", breaks = seq(0, 100, by = 20))
) +
# Add an invisible layer to ensure the secondary y-axis is plotted
geom_point(data = subset(recovered.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Percent * 1208 / 100),
color = NA) +
theme_minimal() +
theme(
axis.title.y.right = element_blank(),
axis.text.y.right = element_blank(),
axis.ticks.y.right = element_blank(),
axis.line.y.right = element_blank(),
axis.title.x = element_text(size = 15),
axis.text.x = element_blank(),
axis.title.y = element_text(size = 18),
axis.text.y = element_text(size = 16),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
legend.title = element_blank(),
legend.text = element_text(size = 14),
legend.position = "none",
axis.line = element_line(color = "black", size=1.0),
axis.ticks = element_line(color = "black", size=1.0),
axis.line.x.top = element_blank(),
axis.ticks.y.left = element_line(color = "black"),
axis.ticks.length = unit(0.2, "cm")
) +
scale_fill_manual(values = c("Codon1" = "#0072B2", "Codon2" = "#E69F00", "Both" = "lightblue4"),
breaks = c("Codon1", "Codon2", "Both")) +
labs(x = "Recovered \nHomologs") +
coord_cartesian(xlim = c(0.5, 2.5), ylim = c(0, 1250)) +
guides(fill = guide_legend(override.aes = list(alpha = 1)))
# Display the plot
print(L15.L16.1150.Recovered.Barchart)
Plot the perfects that complement in the KO model for each library and the combined perfects (shared perfects + unique perfects):
# Create the data frame
complement.data <- data.frame(
Library = c("Codon1", "Codon2", "Both"),
Perfects = c(417, 377, 600),
Percent = c(35, 31, 50)
)
# Create a ggplot object
L15.L16.1150.Complement.Barchart <- ggplot(complement.data) +
# Add a horizontal line at y=0
geom_hline(yintercept = 0, color = "black") +
# Wider bar for "Both" plotted behind
geom_bar(data = subset(complement.data, Library == "Both"),
aes(x = 1.5, y = Perfects, fill = Library),
stat = "identity",
width = 0.75,
alpha = 0.5) +
# Bar plot for Codon1 and Codon2
geom_bar(data = subset(complement.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Perfects, fill = Library),
stat = "identity",
width = 0.6) +
scale_x_continuous(breaks = c(1, 2), labels = c("Codon1", "Codon2"),
limits = c(0.5, 2.5)) +
scale_y_continuous(
name = " ",
limits = c(0, 1208),
expand = c(0, 0), # Remove padding
sec.axis = sec_axis(~ . * 100 / 1208, name = "Library Representation (%)", breaks = seq(0, 100, by = 20))
) +
# Add an invisible layer to ensure the secondary y-axis is plotted
geom_point(data = subset(complement.data, Library != "Both"),
aes(x = as.numeric(factor(Library, levels = c("Codon1", "Codon2"))), y = Percent * 1208 / 80),
color = NA) +
theme_minimal() +
theme(
axis.title.y.right = element_text(color = "red", size = 18),
axis.text.y.right = element_text(color = "red", size = 16),
axis.ticks.y.right = element_line(color = "black"),
axis.ticks.y = element_blank(),
axis.title.x = element_text(size = 15),
axis.text.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
legend.title = element_blank(),
legend.text = element_blank(),
legend.position = "none",
axis.line = element_line(color = "black", size=1.0),
axis.ticks = element_line(color = "black", size=1.0),
axis.line.y.right = element_line(color = "black"),
axis.line.y = element_blank(),
axis.line.x.top = element_blank(),
axis.ticks.length = unit(0.2, "cm")
) +
scale_fill_manual(values = c("Codon1" = "#0072B2", "Codon2" = "#E69F00", "Both" = "lightblue4"),
breaks = c("Codon1", "Codon2", "Both")) +
labs(x = "Complementing \nHomologs") + # Changed x-axis label
coord_cartesian(xlim = c(0.5, 2.5), ylim = c(0, 1250)) +
guides(fill = guide_legend(override.aes = list(alpha = 1)))
# Display the plot
print(L15.L16.1150.Complement.Barchart)
patch7 <- L15.L16.1150.Recovered.Barchart | L15.L16.1150.Complement.Barchart
patch7
This section uses the library(ggtree) and
library(castor) packages.
Create a full unique perfects mutID dataframe based on
mutIDinfo.15.16.zeros.shared and
mutIDinfo.15.16.zeros.unique to generate a phylogenetic
tree composed on 1,208 mutID sequences.
# Subset relevant columns before combining datasets:
# Shared mutIDs
mutIDinfo.15.16.zeros.shared.perfects <- mutIDinfo.15.16.zeros.shared %>%
filter(mutations.x == 0) %>%
select(mutID, seq = seq.x)
# Unique mutIDs
mutIDinfo.15.16.zeros.unique.perfects <- mutIDinfo.15.16.zeros.unique %>%
filter(mutations == 0) %>%
select(mutID, seq)
# Combine both datasets
mutIDinfo.15.16.zeros.all.perfects <- rbind(mutIDinfo.15.16.zeros.shared.perfects,
mutIDinfo.15.16.zeros.unique.perfects)
Add “taxID” and “PctIdentEcoli” to each “mutID”
# Rename "mutID" to "ID" to merge with `orginfo` df:
mutIDinfo.15.16.zeros.all.perfects <- mutIDinfo.15.16.zeros.all.perfects %>% rename(ID = mutID)
# Merge "TaxID" and "PctIdentEcoli" from `orginfo` to `mutIDinfo.15.16.zeros.all.perfects`:
mutIDinfo.15.16.zeros.all.perfects <- mutIDinfo.15.16.zeros.all.perfects %>%
left_join(orginfo %>% select(ID, TaxID, PctIdentEcoli), by = "ID")
Create a FASTA file containing the ID and its associated protein sequence for alignment
# Collect the sequences in FASTA format
mutIDinfo.15.16.zeros.all.perfects_fasta_content <- paste(">",mutIDinfo.15.16.zeros.all.perfects$ID, "\n", mutIDinfo.15.16.zeros.all.perfects$seq, "\n", sep = "", collapse = "")
# Define the file path in the working directory
mutIDinfo.15.16.zeros.all.perfects_fasta_path <- file.path(getwd(), "Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.fasta")
# Write the FASTA content to the file
writeLines(mutIDinfo.15.16.zeros.all.perfects_fasta_content, con = mutIDinfo.15.16.zeros.all.perfects_fasta_path)
Perform a multiple sequence alignment on the FASTA file using CLUSTAL Omega:
# May need to enable permissions to run the executable:
#chmod +x clustalo
./Scripts/clustalo -i Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.fasta -o Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.aligned.fasta --outfmt=fa --force
Use FastTree (phylogenetic tree building program) to infer the tree from the aligned amino acid sequences:
# ML Model: Jones-Taylor-Thorton
# chmod +x FastTree
./Scripts/FastTree Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.aligned.fasta > Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.tree.newick
Import the newick tree file based on the sequence alignment of shared perfects derived from Lib15 & Lib16 mapping file:
# Full tree alignment (1,208 mutID)
Alltree15 <- read.tree("Perfects/TREES/DHFR.Lib.15.16.ID.all.perfects.mapped.tree.newick") # newick format
Alltree15
Phylogenetic tree with 1208 tips and 1206 internal nodes.
Tip labels:
WP_011201021, WP_005655824, WP_011272274, WP_006995561, WP_005632134, WP_009500922, ...
Node labels:
, 0.917, 0.957, 0.898, 0.923, 0.819, ...
Unrooted; includes branch lengths.
Extract the tip labels from the newick tree file to match with NCBI taxonomy for downstream plotting:
# Extract tip labels from Alltree15
Alltree15_tip_labels <- Alltree15$tip.label
# Create a new data frame with unique tip labels
Alltree15_tip_labels_df <- data.frame(tip.label = unique(Alltree15_tip_labels))
# Print the first few rows of the new data frame
head(Alltree15_tip_labels_df)
Match each tip.label ID in Alltree15 with it’s
associated TaxID from orginfo dataframe:
# Rename column "tip.label" to "ID"
colnames(Alltree15_tip_labels_df) <- c("ID")
# Merge orginfo with Alltree15_tip_labels_df based on the shared ID
Alltree15_taxa <- merge(Alltree15_tip_labels_df, orginfo, by = "ID", all.x = TRUE)
# Print the first few rows of the merged data frame
head(Alltree15_taxa)
Import the full up-to-date NCBI taxonomy dataset containing 2,580,388 unique TaxID.
# Import updated NCBI taxonomy mapping file
ncbi_taxa = read.csv("Perfects/INPUT/all.ncbi.taxa.lineage.csv", head=TRUE) # read csv file
Warning: EOF within quoted string
# Convert TaxID column from chr to int
ncbi_taxa$TaxID <- as.integer(ncbi_taxa$TaxID)
Warning: NAs introduced by coercion
# Print the first few rows of the NCBI taxonomy data frame
head(ncbi_taxa)
# Merge the NCBI taxonomy columns to Alltree15_taxa based on shared TaxID
Alltree15_taxa_merged <- Alltree15_taxa
Alltree15_taxa_merged$NCBI.name <- NA
Alltree15_taxa_merged$NCBI.superkingdom <- NA
Alltree15_taxa_merged$NCBI.phylum <- NA
Alltree15_taxa_merged$NCBI.class <- NA
Alltree15_taxa_merged$NCBI.order <- NA
Alltree15_taxa_merged$NCBI.family <- NA
Alltree15_taxa_merged$NCBI.genus <- NA
Alltree15_taxa_merged$NCBI.species <- NA
# NCBI.name
Alltree15_taxa_merged$NCBI.name[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.name[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.superkingdom
Alltree15_taxa_merged$NCBI.superkingdom[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.superkingdom[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.phylum
Alltree15_taxa_merged$NCBI.phylum[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.phylum[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.class
Alltree15_taxa_merged$NCBI.class[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.class[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.order
Alltree15_taxa_merged$NCBI.order[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.order[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Alltree15_taxa_merged$NCBI.family[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.family[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Alltree15_taxa_merged$NCBI.genus[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.genus[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Alltree15_taxa_merged$NCBI.species[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.species[match(Alltree15_taxa_merged$TaxID[Alltree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# Replace the value in "NCBI.phylum" column with the value from "NCBI.class" if "NCBI.phylum" is "Pseudomonadota"
Alltree15_taxa_merged$NCBI.phylum <- ifelse(Alltree15_taxa_merged$NCBI.phylum == "Pseudomonadota", Alltree15_taxa_merged$NCBI.class, Alltree15_taxa_merged$NCBI.phylum)
# Remove rows with NA in NCBI.phylum column
Alltree15_taxa_merged <- Alltree15_taxa_merged[!is.na(Alltree15_taxa_merged$NCBI.phylum) & Alltree15_taxa_merged$NCBI.phylum != "NA", ]
# Print the first few rows of the Alltree15_taxa_merged data frame
head(Alltree15_taxa_merged)
Plot a circular phylogenetic tree with color-scale based on phylogenetic distance between each DHFR homolog relative to the E. coli DHFR gene.
phylotree_NCBI_ecoli_ident <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=PctIdentEcoli*100) +
geom_tippoint(aes(color=PctIdentEcoli*100), size=1, alpha=0.8) +
geom_tiplab2(aes(color=PctIdentEcoli*100, label=NCBI.name, angle=angle), size=1) +
theme(legend.position="left") +
scale_colour_gradient2("Sequence\nIdentity (%)", low="blue", mid="green", high="red",
midpoint=(100+min(Alltree15_taxa_merged$PctIdentEcoli)*100)/2)
phylotree_NCBI_ecoli_ident
Plot with only the pathogenic homologs labelled:
# Define the specific NCBI.name values you want to label
pathogenic_labels <- c("Anaerococcus tetradius", "Bacillus cereus", "Escherichia coli", "Haemophilus influenzae",
"Salmonella enterica", "Staphylococcus aureus", "Streptococcus pneumoniae", "Vibrio cholerae",
"Enterococcus faecium", "Klebsiella pneumoniae", "Acinetobacter baumannii", "Pseudomonas aeruginosa",
"Proteus mirabilis", "Bordetella holmesii", "Mycoplasmoides pneumoniae", "Chlamydia pneumoniae")
phylotree_NCBI_ecoli_ident_pathogenic <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=PctIdentEcoli*100) +
geom_tippoint(aes(color=PctIdentEcoli*100), size=1, alpha=0.8) +
geom_tiplab2(aes(color=PctIdentEcoli*100,
label=ifelse(NCBI.name %in% pathogenic_labels, NCBI.name, ""),
angle=angle),
size=1) +
theme(legend.position="left") +
scale_colour_gradient2("Sequence\nIdentity (%)", low="blue", mid="green", high="red",
midpoint=(100+min(Alltree15_taxa_merged$PctIdentEcoli)*100)/2)
phylotree_NCBI_ecoli_ident_pathogenic
The following section builds on the initial DHFR phylogenetic tree and adds the NCBI taxonomic lineages as an outer ring to display the breadth of sequence diversity in the 1,536-DHFR designed library. However, this tree only contains 643 unique branch tips (41% of full library diversity) from the recovered shared perfects between libraries.
Distinct Phylum Colors for Plotting
# Generate distinct colors for each unique value in the "NCBI.phylum" column
distinct_colors_ncbi <- rainbow(length(unique(Alltree15_taxa_merged$NCBI.phylum)))
# Reverse the order of colors
#distinct_colors_ncbi <- rev(distinct_colors_ncbi)
# Create a new column "NCBI.phylum_colors" and assign colors based on the unique values in "NCBI.phylum"
Alltree15_taxa_merged <- Alltree15_taxa_merged %>%
mutate(NCBI.phylum_colors = distinct_colors_ncbi[as.integer(factor(NCBI.phylum))])
#Establish color scheme for plotting
NCBIphyloColor <- Alltree15_taxa_merged %>%
select(c("NCBI.phylum", "NCBI.phylum_colors")) %>%
distinct()
NCBIphyloColor <- NCBIphyloColor[order(NCBIphyloColor$NCBI.phylum, decreasing=FALSE),]
# List phylum in alphabetical order for legend and plotting
Alltree15_taxa_merged$NCBI.phylum <- factor(Alltree15_taxa_merged$NCBI.phylum, levels=NCBIphyloColor$NCBI.phylum)
Distinct Class Colors for Plotting
# Generate distinct colors for each unique value in the "NCBI.class" column
distinct_colors_ncbi_class <- rainbow(length(unique(Alltree15_taxa_merged$NCBI.class)))
# Create a new column "NCBI.class_colors" and assign colors based on the unique values in "NCBI.class"
Alltree15_taxa_merged <- Alltree15_taxa_merged %>%
mutate(NCBI.class_colors = distinct_colors_ncbi_class[as.integer(factor(NCBI.class))])
#Establish color scheme for plotting
NCBIclassColor <- Alltree15_taxa_merged %>%
select(c("NCBI.class", "NCBI.class_colors")) %>%
distinct()
NCBIclassColor <- NCBIclassColor[order(NCBIclassColor$NCBI.class, decreasing=FALSE),]
# List phylum in alphabetical order for legend and plotting
Alltree15_taxa_merged$NCBI.class <- factor(Alltree15_taxa_merged$NCBI.class, levels=NCBIclassColor$NCBI.class)
Build the initial DHFR tree with color-scale based on phylogenetic distance between each DHFR homolog relative to the E. coli DHFR gene and add an additional ring representing taxonomic lineages at the Phylum-level.
Note that the phylum “Pseudomonadota” has been replaced with it’s associated classes (“Alphaproteobacteria”, “Betaproteobacteria”, etc.). This tree includes Archaea (Euryarchaeota) and dsDNA Viruses (Uroviricota), but no Eukaryota from the original 1,536-DHFR library.
phylotree_NCBI_ecoli_phylum_p1 <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=PctIdentEcoli*100) +
geom_tippoint(aes(color=PctIdentEcoli*100), size=1, alpha=0.8) +
theme(legend.position="left") +
scale_colour_gradient2("Sequence\nIdentity (%)", low="blue", mid="green", high="red",
midpoint=(100+min(Alltree15_taxa_merged$PctIdentEcoli)*100)/2)
phylotree_NCBI_ecoli_phylum_p2 <- phylotree_NCBI_ecoli_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum),
width=4,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=NCBIphyloColor$NCBI.phylum_colors,
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
phylotree_NCBI_ecoli_phylum_p2
Build the initial DHFR tree with color-scale based on phylogenetic distance between each DHFR homolog relative to the E. coli DHFR gene and add an additional ring representing taxonomic lineages at the Class-level.
phylotree_NCBI_ecoli_class_p1 <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=PctIdentEcoli*100) +
geom_tippoint(aes(color=PctIdentEcoli*100), size=1, alpha=0.8) +
theme(legend.position="left") +
scale_colour_gradient2("Sequence\nIdentity (%)", low="blue", mid="green", high="red",
midpoint=(100+min(Alltree15_taxa_merged$PctIdentEcoli)*100)/2)
phylotree_NCBI_ecoli_class_p2 <- phylotree_NCBI_ecoli_class_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.class),
width=4,
offset=0.1
) +
scale_fill_manual(
name="Class",
values=NCBIclassColor$NCBI.class_colors,
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
phylotree_NCBI_ecoli_class_p2
Show both version of the phylogeny tree (Phylum and Class level annotations):
patch8 <- (phylotree_NCBI_ecoli_phylum_p2 | phylotree_NCBI_ecoli_class_p2)
patch8
Re-build the phylogenetic tree but color the branches by the NCBI phylum-level taxonomy (instead of by pctIdent E. coli). Leave the phylum name labels on the tree as reference.
phylotree_NCBI_ecoli_phylum_names_p1 <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=NCBI.phylum) +
geom_tippoint(aes(color=NCBI.phylum), size=1, alpha=0.8) +
geom_tiplab2(aes(color=NCBI.phylum, label=NCBI.phylum, angle=angle), size=1)
#p13NCBI_15taxa2
phylotree_NCBI_ecoli_phylum_names_p2 <- phylotree_NCBI_ecoli_phylum_names_p1 +
geom_tippoint(
mapping=aes(colour=NCBI.phylum),
size=1.5,
stroke=0,
alpha=0.4
) +
scale_colour_manual(
name="Phylum",
values=NCBIphyloColor$NCBI.phylum_colors,
guide=guide_legend(keywidth=0.3,
keyheight=0.3,
ncol=1,
override.aes=list(size=2,alpha=1),
order=1)
) +
theme(
legend.title=element_text(size=12),
legend.text=element_text(size=10),
legend.spacing.y = unit(0.1, "cm")
)
phylotree_NCBI_ecoli_phylum_names_p2
Re-build the phylogenetic tree but color the branches by the NCBI class-level taxonomy (instead of by pctIdent E. coli). Leave the class name labels on the tree as reference.
phylotree_NCBI_ecoli_class_names_p1 <- ggtree(Alltree15, layout="circular", branch.length="none") %<+%
Alltree15_taxa_merged +
aes(color=NCBI.class) +
geom_tippoint(aes(color=NCBI.class), size=1, alpha=0.8) +
geom_tiplab2(aes(color=NCBI.class, label=NCBI.class, angle=angle), size=1)
#p13NCBI_15taxa2
phylotree_NCBI_ecoli_class_names_p2 <- phylotree_NCBI_ecoli_class_names_p1 +
geom_tippoint(
mapping=aes(colour=NCBI.class),
size=1.5,
stroke=0,
alpha=0.4) +
scale_colour_manual(
name="Class",
values=NCBIclassColor$NCBI.class_colors,
guide=guide_legend(keywidth=0.3,
keyheight=0.3,
ncol=1,
override.aes=list(size=2,alpha=1),
order=1)) +
theme(
legend.title=element_text(size=12),
legend.text=element_text(size=10),
legend.spacing.y = unit(0.1, "cm"))
phylotree_NCBI_ecoli_class_names_p2
Taxonomy: Count the number of unique IDs associated with each taxonomic level:
# Count unique values in each column
unique_taxa_counts <- Alltree15_taxa_merged %>%
summarise(across(c(NCBI.superkingdom, NCBI.phylum, NCBI.class, NCBI.order, NCBI.family, NCBI.genus, NCBI.species),
~ n_distinct(.)))
# Print the result
print(unique_taxa_counts)
Domain: Count the number of unique IDs associated with each superkingdom (domain):
# Sum the number of rows for each unique NCBI.phylum
domain_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.superkingdom) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(domain_counts)
Phylum: Count the number of unique IDs associated with each Phylum:
# Sum the number of rows for each unique NCBI.phylum
phylum_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.phylum) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(phylum_counts)
Class: Count the number of unique IDs associated with each Class:
# Sum the number of rows for each unique NCBI.class
class_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.class) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(class_counts)
Order: Count the number of unique IDs associated with each Order:
# Sum the number of rows for each unique NCBI.class
order_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.order) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(order_counts)
Family: Count the number of unique IDs associated with each Family:
# Sum the number of rows for each unique NCBI.class
family_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.family) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(family_counts)
Genus: Count the number of unique IDs associated with each Genus:
# Sum the number of rows for each unique NCBI.class
genus_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.genus) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(genus_counts)
Species: Count the number of unique IDs associated with each Species:
# Sum the number of rows for each unique NCBI.class
species_counts <- Alltree15_taxa_merged %>%
group_by(NCBI.species) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Print the table
print(species_counts)
Retain unique DHFR perfects with a fitness score >= -1 and a minimum of 5 BCs (numprunedBCs > 4) in Lib15 (fitD05D03):
# Subset dataset for tree building
L15_perfects_complementation_tree <- perfects15_5BCs %>%
select(mutID, seq,
fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03)
Count perfects associated with fitD05D03 (Complementation):
# Count unique perfects in dataset
count_unique_comp_perfects <- L15_perfects_complementation_tree %>%
filter(!is.na(fitD05D03)) %>%
distinct(fitD05D03) %>%
nrow()
print(paste("Number of unique perfects:", count_unique_comp_perfects))
[1] "Number of unique perfects: 780"
# Count unique perfects with fitness greater than -1 in fitD05D03 (Complementation):
count_unique_fit_comp_high <- L15_perfects_complementation_tree %>%
filter(!is.na(fitD05D03) & fitD05D03 >= -1) %>%
distinct(fitD05D03) %>%
nrow()
print(paste("Number of unique perfects with fitD05D03 values greater than -1:", count_unique_fit_comp_high))
[1] "Number of unique perfects with fitD05D03 values greater than -1: 412"
# Count unique perfects with fitness less than -1 in fitD05D03 (Complementation):
count_unique_fit_comp_low <- L15_perfects_complementation_tree %>%
filter(!is.na(fitD05D03) & fitD05D03 <= -1) %>%
distinct(fitD05D03) %>%
nrow()
print(paste("Number of unique perfects with fitD05D03 values less than -1:", count_unique_fit_comp_low))
[1] "Number of unique perfects with fitD05D03 values less than -1: 368"
Add “taxID” and “PctIdentEcoli” to each “mutID”
# Rename "mutID" to "ID" to merge with `orginfo` df:
L15_perfects_complementation_tree <- L15_perfects_complementation_tree %>% rename(ID = mutID)
# Merge "TaxID" and "PctIdentEcoli" from `orginfo` to `L15_perfects_complementation_tree`:
L15_perfects_complementation_tree <- L15_perfects_complementation_tree %>%
left_join(orginfo %>% select(ID, TaxID, PctIdentEcoli), by = "ID")
Create a FASTA file containing the ID and its associated protein sequence for alignment
# Collect the sequences in FASTA format
L15_perfects_complementation_tree_fasta_content <- paste(">",L15_perfects_complementation_tree$ID, "\n", L15_perfects_complementation_tree$seq, "\n", sep = "", collapse = "")
# Define the file path in the working directory
L15_perfects_complementation_tree_fasta_path <- file.path(getwd(),
"Perfects/TREES/Lib.15.5BCs.perfects.complement.fasta")
# Write the FASTA content to the file
writeLines(L15_perfects_complementation_tree_fasta_content, con = L15_perfects_complementation_tree_fasta_path)
Perform a multiple sequence alignment on the FASTA file using CLUSTAL Omega:
# May need to enable permissions to run the executable:
#chmod +x clustalo
./Scripts/clustalo -i Perfects/TREES/Lib.15.5BCs.perfects.complement.fasta -o Perfects/TREES/Lib.15.5BCs.perfects.complement.aligned.fasta --outfmt=fa --force
Use FastTree (phylogenetic tree building program) to infer the tree from the aligned amino acid sequences:
# ML Model: Jones-Taylor-Thorton
# chmod +x FastTree
./Scripts/FastTree Perfects/TREES/Lib.15.5BCs.perfects.complement.aligned.fasta > Perfects/TREES/Lib.15.5BCs.perfects.complement.tree.newick
FastTree Version 2.1.11 No SSE3
Alignment: Perfects/TREES/Lib.15.5BCs.perfects.complement.aligned.fasta
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
0.12 seconds: Joined 200 of 794
0.24 seconds: Joined 500 of 794
0.35 seconds: Joined 700 of 794
Initial topology in 0.40 seconds
Refining topology: 39 rounds ME-NNIs, 2 rounds ME-SPRs, 19 rounds ML-NNIs
0.45 seconds: ME NNI round 3 of 39, 201 of 795 splits, 4 changes (max delta 0.015)
0.57 seconds: SPR round 1 of 2, 201 of 1592 nodes
0.71 seconds: SPR round 1 of 2, 501 of 1592 nodes
0.85 seconds: SPR round 1 of 2, 801 of 1592 nodes
0.96 seconds: SPR round 1 of 2, 1001 of 1592 nodes
1.10 seconds: SPR round 1 of 2, 1301 of 1592 nodes
1.23 seconds: ME NNI round 14 of 39, 1 of 795 splits
1.33 seconds: SPR round 2 of 2, 101 of 1592 nodes
1.44 seconds: SPR round 2 of 2, 301 of 1592 nodes
1.56 seconds: SPR round 2 of 2, 501 of 1592 nodes
1.66 seconds: SPR round 2 of 2, 701 of 1592 nodes
1.79 seconds: SPR round 2 of 2, 1001 of 1592 nodes
1.90 seconds: SPR round 2 of 2, 1301 of 1592 nodes
2.01 seconds: SPR round 2 of 2, 1501 of 1592 nodes
Total branch-length 148.069 after 2.13 sec
2.13 seconds: ML Lengths 1 of 795 splits
2.24 seconds: ML Lengths 101 of 795 splits
2.34 seconds: ML Lengths 201 of 795 splits
2.45 seconds: ML Lengths 301 of 795 splits
2.55 seconds: ML Lengths 401 of 795 splits
2.66 seconds: ML Lengths 501 of 795 splits
2.76 seconds: ML Lengths 601 of 795 splits
2.86 seconds: ML Lengths 701 of 795 splits
3.36 seconds: ML NNI round 1 of 19, 101 of 795 splits, 16 changes (max delta 3.489)
3.72 seconds: ML NNI round 1 of 19, 201 of 795 splits, 42 changes (max delta 11.221)
4.08 seconds: ML NNI round 1 of 19, 301 of 795 splits, 54 changes (max delta 11.221)
4.49 seconds: ML NNI round 1 of 19, 401 of 795 splits, 67 changes (max delta 11.221)
4.87 seconds: ML NNI round 1 of 19, 501 of 795 splits, 82 changes (max delta 11.221)
5.23 seconds: ML NNI round 1 of 19, 601 of 795 splits, 101 changes (max delta 11.221)
5.62 seconds: ML NNI round 1 of 19, 701 of 795 splits, 118 changes (max delta 11.221)
ML-NNI round 1: LogLk = -129180.341 NNIs 140 max delta 11.22 Time 6.01
6.06 seconds: Site likelihoods with rate category 1 of 20
6.16 seconds: Site likelihoods with rate category 3 of 20
6.27 seconds: Site likelihoods with rate category 5 of 20
6.38 seconds: Site likelihoods with rate category 7 of 20
6.48 seconds: Site likelihoods with rate category 9 of 20
6.59 seconds: Site likelihoods with rate category 11 of 20
6.69 seconds: Site likelihoods with rate category 13 of 20
6.80 seconds: Site likelihoods with rate category 15 of 20
6.90 seconds: Site likelihoods with rate category 17 of 20
7.01 seconds: Site likelihoods with rate category 19 of 20
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 1.106 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
7.17 seconds: ML NNI round 2 of 19, 1 of 795 splits
7.45 seconds: ML NNI round 2 of 19, 101 of 795 splits, 11 changes (max delta 3.696)
7.72 seconds: ML NNI round 2 of 19, 201 of 795 splits, 22 changes (max delta 4.148)
7.97 seconds: ML NNI round 2 of 19, 301 of 795 splits, 34 changes (max delta 5.429)
8.26 seconds: ML NNI round 2 of 19, 401 of 795 splits, 46 changes (max delta 5.429)
8.54 seconds: ML NNI round 2 of 19, 501 of 795 splits, 53 changes (max delta 5.429)
8.83 seconds: ML NNI round 2 of 19, 601 of 795 splits, 68 changes (max delta 5.448)
9.14 seconds: ML NNI round 2 of 19, 701 of 795 splits, 84 changes (max delta 5.448)
ML-NNI round 2: LogLk = -121930.723 NNIs 93 max delta 5.45 Time 9.46
9.45 seconds: ML NNI round 3 of 19, 1 of 795 splits
9.77 seconds: ML NNI round 3 of 19, 101 of 795 splits, 8 changes (max delta 1.199)
10.03 seconds: ML NNI round 3 of 19, 201 of 795 splits, 16 changes (max delta 3.181)
10.34 seconds: ML NNI round 3 of 19, 301 of 795 splits, 31 changes (max delta 3.924)
10.60 seconds: ML NNI round 3 of 19, 401 of 795 splits, 37 changes (max delta 3.924)
10.89 seconds: ML NNI round 3 of 19, 501 of 795 splits, 47 changes (max delta 5.502)
ML-NNI round 3: LogLk = -121875.324 NNIs 50 max delta 5.50 Time 11.04
11.04 seconds: ML NNI round 4 of 19, 1 of 795 splits
11.33 seconds: ML NNI round 4 of 19, 101 of 795 splits, 4 changes (max delta 3.149)
11.65 seconds: ML NNI round 4 of 19, 201 of 795 splits, 16 changes (max delta 3.149)
11.96 seconds: ML NNI round 4 of 19, 301 of 795 splits, 25 changes (max delta 4.485)
ML-NNI round 4: LogLk = -121838.596 NNIs 27 max delta 4.48 Time 12.08
12.07 seconds: ML NNI round 5 of 19, 1 of 795 splits
12.41 seconds: ML NNI round 5 of 19, 101 of 795 splits, 9 changes (max delta 5.984)
ML-NNI round 5: LogLk = -121822.923 NNIs 10 max delta 5.98 Time 12.68
12.67 seconds: ML NNI round 6 of 19, 1 of 795 splits
ML-NNI round 6: LogLk = -121810.022 NNIs 5 max delta 9.49 Time 12.93
12.92 seconds: ML NNI round 7 of 19, 1 of 795 splits
ML-NNI round 7: LogLk = -121809.234 NNIs 3 max delta 0.42 Time 13.12
13.12 seconds: ML NNI round 8 of 19, 1 of 795 splits
ML-NNI round 8: LogLk = -121806.065 NNIs 1 max delta 0.69 Time 13.27
13.26 seconds: ML NNI round 9 of 19, 1 of 795 splits
ML-NNI round 9: LogLk = -121806.529 NNIs 1 max delta 0.23 Time 13.35
Turning off heuristics for final round of ML NNIs (converged)
13.77 seconds: ML NNI round 10 of 19, 101 of 795 splits, 4 changes (max delta 0.945)
14.22 seconds: ML NNI round 10 of 19, 201 of 795 splits, 7 changes (max delta 0.945)
14.63 seconds: ML NNI round 10 of 19, 301 of 795 splits, 11 changes (max delta 0.945)
15.03 seconds: ML NNI round 10 of 19, 401 of 795 splits, 20 changes (max delta 3.025)
15.46 seconds: ML NNI round 10 of 19, 501 of 795 splits, 21 changes (max delta 3.025)
15.84 seconds: ML NNI round 10 of 19, 601 of 795 splits, 23 changes (max delta 3.025)
16.27 seconds: ML NNI round 10 of 19, 701 of 795 splits, 27 changes (max delta 3.025)
ML-NNI round 10: LogLk = -121743.666 NNIs 32 max delta 10.46 Time 16.69 (final)
16.69 seconds: ML Lengths 1 of 795 splits
16.80 seconds: ML Lengths 101 of 795 splits
16.91 seconds: ML Lengths 201 of 795 splits
17.02 seconds: ML Lengths 301 of 795 splits
17.13 seconds: ML Lengths 401 of 795 splits
17.23 seconds: ML Lengths 501 of 795 splits
17.34 seconds: ML Lengths 601 of 795 splits
17.45 seconds: ML Lengths 701 of 795 splits
Optimize all lengths: LogLk = -121741.655 Time 17.56
17.89 seconds: ML split tests for 100 of 794 internal splits
18.22 seconds: ML split tests for 200 of 794 internal splits
18.55 seconds: ML split tests for 300 of 794 internal splits
18.87 seconds: ML split tests for 400 of 794 internal splits
19.20 seconds: ML split tests for 500 of 794 internal splits
19.51 seconds: ML split tests for 600 of 794 internal splits
19.84 seconds: ML split tests for 700 of 794 internal splits
Total time: 20.15 seconds Unique: 797/797 Bad splits: 4/794 Worst delta-LogLk 0.706
Import the newick tree file based on the sequence alignment of shared perfects derived from Lib15 & Lib16 mapping file:
# Full tree alignment (417 mutID)
Fittree15 <- read.tree("Perfects/TREES/Lib.15.5BCs.perfects.complement.tree.newick") # newick format
Fittree15
Phylogenetic tree with 797 tips and 795 internal nodes.
Tip labels:
WP_011238316, WP_002931755, WP_004375525, WP_006460949, WP_007277186, WP_004138800, ...
Node labels:
, 0.871, 0.397, 0.997, 0.859, 0.202, ...
Unrooted; includes branch lengths.
Extract the tip labels from the newick tree file to match with NCBI taxonomy for downstream plotting:
# Extract tip labels from Fittree15
Fittree15_tip_labels <- Fittree15$tip.label
# Create a new data frame with unique tip labels
Fittree15_tip_labels_df <- data.frame(tip.label = unique(Fittree15_tip_labels))
# Print the first few rows of the new data frame
head(Fittree15_tip_labels_df)
Match each tip.label ID in Alltree15 with it’s
associated TaxID from orginfo dataframe:
# Rename column "tip.label" to "ID"
colnames(Fittree15_tip_labels_df) <- c("ID")
# Merge orginfo with Alltree15_tip_labels_df based on the shared ID
Fittree15_taxa <- merge(Fittree15_tip_labels_df, orginfo, by = "ID", all.x = TRUE)
# Print the first few rows of the merged data frame
head(Fittree15_taxa)
Add fitness scores to the Fittree15_taxa_merged object
prior to plotting
Fittree15_taxa_merged <- Fittree15_taxa %>%
left_join(L15_perfects_complementation_tree %>% select(ID, fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03), by = "ID")
Merge the NCBI taxonomy columns to Fittree15_taxa based
on shared TaxID
# Merge the NCBI taxonomy columns to Fittree15_taxa based on shared TaxID
Fittree15_taxa_merged$NCBI.name <- NA
Fittree15_taxa_merged$NCBI.superkingdom <- NA
Fittree15_taxa_merged$NCBI.phylum <- NA
Fittree15_taxa_merged$NCBI.class <- NA
Fittree15_taxa_merged$NCBI.order <- NA
Fittree15_taxa_merged$NCBI.family <- NA
Fittree15_taxa_merged$NCBI.genus <- NA
Fittree15_taxa_merged$NCBI.species <- NA
# NCBI.name
Fittree15_taxa_merged$NCBI.name[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.name[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.superkingdom
Fittree15_taxa_merged$NCBI.superkingdom[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.superkingdom[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.phylum
Fittree15_taxa_merged$NCBI.phylum[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.phylum[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.class
Fittree15_taxa_merged$NCBI.class[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.class[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.order
Fittree15_taxa_merged$NCBI.order[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.order[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15_taxa_merged$NCBI.family[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.family[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15_taxa_merged$NCBI.genus[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.genus[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15_taxa_merged$NCBI.species[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.species[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
Plot a circular phylogenetic tree with color-scale based on phylogenetic distance between each DHFR homolog relative to the E. coli DHFR gene.
fittree15_NCBI_ecoli_ident <- ggtree(Fittree15, layout="circular", branch.length="none") %<+%
Fittree15_taxa_merged +
aes(color=PctIdentEcoli*100) +
geom_tippoint(aes(color=PctIdentEcoli*100), size=1, alpha=0.8) +
geom_tiplab2(aes(color=PctIdentEcoli*100, label=NCBI.name, angle=angle), size=1) +
theme(legend.position="left") +
scale_colour_gradient2("Sequence\nIdentity (%)", low="blue", mid="green", high="red",
midpoint=(100+min(Fittree15_taxa_merged$PctIdentEcoli)*100)/2)
fittree15_NCBI_ecoli_ident
The following section builds on the initial DHFR phylogenetic tree and adds the NCBI taxonomic lineages as an outer ring to display the breadth of sequence diversity in the 417 unique branch tips (41% of full library diversity) from the recovered perfects with fitness > -1 in Library 15.
Replace the value in “NCBI.phylum” column with the value from “NCBI.class” if “NCBI.phylum” is “Pseudomonadota”
# Replace the value in "NCBI.phylum" column with the value from "NCBI.class" if "NCBI.phylum" is "Pseudomonadota"
Fittree15_taxa_merged$NCBI.phylum <- ifelse(Fittree15_taxa_merged$NCBI.phylum == "Pseudomonadota", Fittree15_taxa_merged$NCBI.class, Fittree15_taxa_merged$NCBI.phylum)
# Remove rows with NA in NCBI.phylum column
Fittree15_taxa_merged <- Fittree15_taxa_merged[!is.na(Fittree15_taxa_merged$NCBI.phylum) & Fittree15_taxa_merged$NCBI.phylum != "NA", ]
Distinct Phylum Colors for Plotting
# Merge distinct phylum colors from Alltree15_taxa_merged dataframe:
Fittree15_taxa_merged$NCBI.phylum_colors <- NA
Fittree15_taxa_merged$NCBI.class_colors <- NA
# NCBI.phylum_colors
Fittree15_taxa_merged$NCBI.phylum_colors[Fittree15_taxa_merged$TaxID %in% Alltree15_taxa_merged$TaxID] <- Alltree15_taxa_merged$NCBI.phylum_colors[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% Alltree15_taxa_merged$TaxID], Alltree15_taxa_merged$TaxID)]
# NCBI.class_colors
Fittree15_taxa_merged$NCBI.class_colors[Fittree15_taxa_merged$TaxID %in% Alltree15_taxa_merged$TaxID] <- Alltree15_taxa_merged$NCBI.class_colors[match(Fittree15_taxa_merged$TaxID[Fittree15_taxa_merged$TaxID %in% Alltree15_taxa_merged$TaxID], Alltree15_taxa_merged$TaxID)]
# List phylum in alphabetical order for legend and plotting
Fittree15_taxa_merged$NCBI.phylum <- factor(Fittree15_taxa_merged$NCBI.phylum, levels=NCBIphyloColor$NCBI.phylum)
# List phylum in alphabetical order for legend and plotting
Fittree15_taxa_merged$NCBI.class <- factor(Fittree15_taxa_merged$NCBI.class, levels=NCBIclassColor$NCBI.class)
Show the minimum and maximum fitness values between D05 (M9 no supp) vs. D03 (M9 full supp) dataset:
min(Fittree15_taxa_merged$fitD05D03,na.rm=T)
[1] -6.29133
max(Fittree15_taxa_merged$fitD05D03,na.rm=T)
[1] 0.7437646
Lib15: Complementation Plot
tree_perfects15_5BCs_good_0tmp_fitness_labelled <- ggtree(Fittree15, layout="circular", branch.length="none") %<+%
Fittree15_taxa_merged +
aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
geom_tiplab2(aes(label=NCBI.species, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_0tmp_fitness_labelled
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_0tmp_fitness_phylum_p1 <- ggtree(Fittree15, layout="circular", branch.length="none") %<+%
Fittree15_taxa_merged +
aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_0tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_0tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum),
width=4,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=NCBIphyloColor$NCBI.phylum_colors,
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_0tmp_fitness_phylum_p2
Retain unique DHFR perfects with a fitness score >= -1 and a minimum of 5 BCs (numprunedBCs > 4) in Lib15 (fitD05D03):
# Subset dataset for tree building
L15_perfects_complement_tree <- perfects15_5BCs_good %>%
select(mutID, seq,
fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03)
Add “taxID” and “PctIdentEcoli” to each “mutID”
# Rename "mutID" to "ID" to merge with `orginfo` df:
L15_perfects_complement_tree <- L15_perfects_complement_tree %>% rename(ID = mutID)
# Merge "TaxID" and "PctIdentEcoli" from `orginfo` to `L15_perfects_complement_tree`:
L15_perfects_complement_tree <- L15_perfects_complement_tree %>%
left_join(orginfo %>% select(ID, TaxID, PctIdentEcoli), by = "ID")
Create a FASTA file containing the ID and its associated protein sequence for alignment
# Collect the sequences in FASTA format
L15_perfects_complement_tree_fasta_content <- paste(">",L15_perfects_complement_tree$ID, "\n", L15_perfects_complement_tree$seq, "\n", sep = "", collapse = "")
# Define the file path in the working directory
L15_perfects_complement_tree_fasta_path <- file.path(getwd(), "Perfects/TREES/Lib.15.5BCs.good.perfects.complement.fasta")
# Write the FASTA content to the file
writeLines(L15_perfects_complement_tree_fasta_content, con = L15_perfects_complement_tree_fasta_path)
Perform a multiple sequence alignment on the FASTA file using CLUSTAL Omega:
# May need to enable permissions to run the executable:
#chmod +x clustalo
./Scripts/clustalo -i Perfects/TREES/Lib.15.5BCs.good.perfects.complement.fasta -o Perfects/TREES/Lib.15.5BCs.good.perfects.complement.aligned.fasta --outfmt=fa --force
Use FastTree (phylogenetic tree building program) to infer the tree from the aligned amino acid sequences:
# ML Model: Jones-Taylor-Thorton
# chmod +x FastTree
./Scripts/FastTree Perfects/TREES/Lib.15.5BCs.good.perfects.complement.aligned.fasta > Perfects/TREES/Lib.15.5BCs.good.perfects.complement.tree.newick
FastTree Version 2.1.11 No SSE3
Alignment: Perfects/TREES/Lib.15.5BCs.good.perfects.complement.aligned.fasta
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
0.10 seconds: Joined 300 of 413
Initial topology in 0.14 seconds
Refining topology: 35 rounds ME-NNIs, 2 rounds ME-SPRs, 17 rounds ML-NNIs
0.22 seconds: SPR round 1 of 2, 101 of 830 nodes
0.33 seconds: SPR round 1 of 2, 401 of 830 nodes
0.46 seconds: SPR round 1 of 2, 701 of 830 nodes
0.57 seconds: SPR round 2 of 2, 101 of 830 nodes
0.68 seconds: SPR round 2 of 2, 401 of 830 nodes
0.81 seconds: SPR round 2 of 2, 701 of 830 nodes
Total branch-length 83.288 after 0.90 sec
0.98 seconds: ML Lengths 101 of 414 splits
1.15 seconds: ML Lengths 301 of 414 splits
1.54 seconds: ML NNI round 1 of 17, 101 of 414 splits, 16 changes (max delta 5.024)
1.86 seconds: ML NNI round 1 of 17, 201 of 414 splits, 36 changes (max delta 7.726)
2.18 seconds: ML NNI round 1 of 17, 301 of 414 splits, 51 changes (max delta 9.061)
2.50 seconds: ML NNI round 1 of 17, 401 of 414 splits, 67 changes (max delta 9.568)
ML-NNI round 1: LogLk = -70987.860 NNIs 71 max delta 9.57 Time 2.55
2.60 seconds: Site likelihoods with rate category 2 of 20
2.71 seconds: Site likelihoods with rate category 6 of 20
2.82 seconds: Site likelihoods with rate category 9 of 20
2.93 seconds: Site likelihoods with rate category 13 of 20
3.04 seconds: Site likelihoods with rate category 17 of 20
Switched to using 20 rate categories (CAT approximation)
Rate categories were divided by 1.118 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comparable Gamma(20) log-likelihoods
3.17 seconds: ML NNI round 2 of 17, 1 of 414 splits
3.45 seconds: ML NNI round 2 of 17, 101 of 414 splits, 12 changes (max delta 7.045)
3.68 seconds: ML NNI round 2 of 17, 201 of 414 splits, 24 changes (max delta 7.045)
3.90 seconds: ML NNI round 2 of 17, 301 of 414 splits, 33 changes (max delta 7.045)
4.22 seconds: ML NNI round 2 of 17, 401 of 414 splits, 48 changes (max delta 7.045)
ML-NNI round 2: LogLk = -66952.695 NNIs 50 max delta 7.05 Time 4.27
4.50 seconds: ML NNI round 3 of 17, 101 of 414 splits, 5 changes (max delta 2.261)
4.77 seconds: ML NNI round 3 of 17, 201 of 414 splits, 11 changes (max delta 2.261)
ML-NNI round 3: LogLk = -66932.666 NNIs 20 max delta 2.67 Time 4.95
4.94 seconds: ML NNI round 4 of 17, 1 of 414 splits
5.20 seconds: ML NNI round 4 of 17, 101 of 414 splits, 0 changes
ML-NNI round 4: LogLk = -66920.309 NNIs 9 max delta 4.28 Time 5.42
5.41 seconds: ML NNI round 5 of 17, 1 of 414 splits
ML-NNI round 5: LogLk = -66918.357 NNIs 0 max delta 0.00 Time 5.62
Turning off heuristics for final round of ML NNIs (converged)
5.62 seconds: ML NNI round 6 of 17, 1 of 414 splits
5.97 seconds: ML NNI round 6 of 17, 101 of 414 splits, 2 changes (max delta 0.744)
6.32 seconds: ML NNI round 6 of 17, 201 of 414 splits, 6 changes (max delta 1.888)
6.69 seconds: ML NNI round 6 of 17, 301 of 414 splits, 11 changes (max delta 2.170)
7.02 seconds: ML NNI round 6 of 17, 401 of 414 splits, 19 changes (max delta 2.170)
ML-NNI round 6: LogLk = -66881.675 NNIs 19 max delta 2.17 Time 7.08 (final)
7.16 seconds: ML Lengths 101 of 414 splits
7.35 seconds: ML Lengths 301 of 414 splits
Optimize all lengths: LogLk = -66879.917 Time 7.46
7.72 seconds: ML split tests for 100 of 413 internal splits
7.99 seconds: ML split tests for 200 of 413 internal splits
8.26 seconds: ML split tests for 300 of 413 internal splits
8.53 seconds: ML split tests for 400 of 413 internal splits
Total time: 8.57 seconds Unique: 416/416 Bad splits: 3/413 Worst delta-LogLk 5.276
Import the newick tree file based on the sequence alignment of complementing perfects derived from Lib15 mapping file:
# Full tree alignment (417 mutID)
Fittree15good <- read.tree("Perfects/TREES/Lib.15.5BCs.good.perfects.complement.tree.newick") # newick format
Fittree15good
Phylogenetic tree with 416 tips and 414 internal nodes.
Tip labels:
WP_007642986, WP_008302089, WP_010560929, WP_006794536, WP_008136828, WP_007420749, ...
Node labels:
, 0.910, 0.959, 0.651, 0.692, 0.995, ...
Unrooted; includes branch lengths.
Extract the tip labels from the newick tree file to match with NCBI taxonomy for downstream plotting:
# Extract tip labels from Fittree15good
Fittree15good_tip_labels <- Fittree15good$tip.label
# Create a new data frame with unique tip labels
Fittree15good_tip_labels_df <- data.frame(tip.label = unique(Fittree15good_tip_labels))
# Print the first few rows of the new data frame
head(Fittree15good_tip_labels_df)
Match each tip.label ID in Fittree15good with it’s
associated TaxID from orginfo dataframe:
# Rename column "tip.label" to "ID"
colnames(Fittree15good_tip_labels_df) <- c("ID")
# Merge orginfo with Alltree15_tip_labels_df based on the shared ID
Fittree15good_taxa <- merge(Fittree15good_tip_labels_df, orginfo, by = "ID", all.x = TRUE)
# Print the first few rows of the merged data frame
head(Fittree15good_taxa)
Add fitness scores to the Fittree15_taxa_merged object
prior to plotting
Fittree15good_taxa_merged <- Fittree15good_taxa %>%
left_join(L15_perfects_complement_tree %>% select(ID, fitD05D03, fitD06D03, fitD07D03, fitD08D03, fitD09D03, fitD10D03, fitD11D03), by = "ID")
Merge the NCBI taxonomy columns to Fittree15_taxa based
on shared TaxID
# Merge the NCBI taxonomy columns to Fittree15good_taxa based on shared TaxID
Fittree15good_taxa_merged$NCBI.name <- NA
Fittree15good_taxa_merged$NCBI.superkingdom <- NA
Fittree15good_taxa_merged$NCBI.phylum <- NA
Fittree15good_taxa_merged$NCBI.class <- NA
Fittree15good_taxa_merged$NCBI.order <- NA
Fittree15good_taxa_merged$NCBI.family <- NA
Fittree15good_taxa_merged$NCBI.genus <- NA
Fittree15good_taxa_merged$NCBI.species <- NA
# NCBI.name
Fittree15good_taxa_merged$NCBI.name[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.name[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.superkingdom
Fittree15good_taxa_merged$NCBI.superkingdom[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.superkingdom[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.phylum
Fittree15good_taxa_merged$NCBI.phylum[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.phylum[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.class
Fittree15good_taxa_merged$NCBI.class[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.class[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.order
Fittree15good_taxa_merged$NCBI.order[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.order[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15good_taxa_merged$NCBI.family[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.family[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15good_taxa_merged$NCBI.genus[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.genus[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
# NCBI.family
Fittree15good_taxa_merged$NCBI.species[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID] <- ncbi_taxa$NCBI.species[match(Fittree15good_taxa_merged$TaxID[Fittree15good_taxa_merged$TaxID %in% ncbi_taxa$TaxID], ncbi_taxa$TaxID)]
The following section builds on the initial DHFR phylogenetic tree and adds the NCBI taxonomic lineages as an outer ring to display the breadth of sequence diversity in the 417 unique branch tips (41% of full library diversity) from the recovered perfects with fitness > -1 in Library 15.
Replace the value in “NCBI.phylum” column with the value from “NCBI.class” if “NCBI.phylum” is “Pseudomonadota”
# Replace the value in "NCBI.phylum" column with the value from "NCBI.class" if "NCBI.phylum" is "Pseudomonadota"
Fittree15good_taxa_merged$NCBI.phylum <- ifelse(Fittree15good_taxa_merged$NCBI.phylum == "Pseudomonadota", Fittree15good_taxa_merged$NCBI.class, Fittree15good_taxa_merged$NCBI.phylum)
# Remove rows with NA in NCBI.phylum column
Fittree15good_taxa_merged <- Fittree15good_taxa_merged[!is.na(Fittree15good_taxa_merged$NCBI.phylum) & Fittree15good_taxa_merged$NCBI.phylum != "NA", ]
Distinct Phylum Colors for Plotting
# Merge distinct phylum colors from Fittree15_taxa_merged dataframe:
Fittree15good_taxa_merged$NCBI.phylum_colors <- NA
# Add NCBI.phylum_colors from Fittree15_taxa_merged based on matching NCBI.phylum
Fittree15good_taxa_merged$NCBI.phylum_colors <- Fittree15_taxa_merged$NCBI.phylum_colors[match(Fittree15good_taxa_merged$NCBI.phylum, Fittree15_taxa_merged$NCBI.phylum)]
# List phylum in alphabetical order for legend and plotting
Fittree15good_taxa_merged$NCBI.phylum <- factor(Fittree15good_taxa_merged$NCBI.phylum, levels=NCBIphyloColor$NCBI.phylum)
Lib15: (0.0 ug/mL TMP)
tree_perfects15_5BCs_good_0.0tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_0.0tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD05D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD05D03 < -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD06D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD05D03 > -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD05D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD05D03 > -1: 411
cat("Number of unique IDs with fitD05D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD05D03 < -1: 0
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_0.0tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD05D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_0.0tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_0.0tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_0.0tmp_fitness_phylum_p2
Lib15: MIC (0.058 ug/mL TMP)
tree_perfects15_5BCs_good_0.058tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD06D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD06D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_0.058tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD06D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD06D03 < -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD06D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD06D03 > -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD06D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD06D03 > -1: 318
cat("Number of unique IDs with fitD06D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD06D03 < -1: 92
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_0.058tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD06D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD06D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_0.058tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_0.058tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_0.058tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD06D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD06D03v1 = fitD06D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD06D03v2 = fitD06D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD06D03AbsDiff = abs(fitD06D03v1 - fitD06D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD06D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD06D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD06D03_results <- fitD06D03_dist_diff(fitD06D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD06D03_scatter_plot <- ggplot(fitD06D03_results, aes(x = AbsDistance, y = fitD06D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD06D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD06D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD06D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD06D03_pearson_scatter_plot.png",
fitD06D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD06D03_close <- fitD06D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD06D03_close$fitD06D03AbsDiff)
qqline(fitD06D03_close$fitD06D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD06D03_close$fitD06D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD06D03_far <- fitD06D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD06D03_far$fitD06D03AbsDiff)
qqline(fitD06D03_far$fitD06D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD06D03_far$fitD06D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD06D03 <- wilcox.test(fitD06D03_close$fitD06D03AbsDiff,
fitD06D03_far$fitD06D03AbsDiff)
print(wilcox_result_fitD06D03)
Wilcoxon rank sum test with continuity correction
data: fitD06D03_close$fitD06D03AbsDiff and fitD06D03_far$fitD06D03AbsDiff
W = 294122436, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD06D03_combined_data <- bind_rows(
mutate(fitD06D03_close, group = "Close"),
mutate(fitD06D03_far, group = "Far"))
# Remove NA values
fitD06D03_combined_data <- fitD06D03_combined_data %>%
filter(!is.na(fitD06D03AbsDiff))
# Create a box plot
fitD06D03_boxplot <- ggplot(fitD06D03_combined_data, aes(x = group, y = fitD06D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD06D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD06D03") +
theme_minimal()
# Display the plot
print(fitD06D03_boxplot)
# Calculate summary statistics
fitD06D03_summary_stats <- fitD06D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD06D03AbsDiff),
median = median(fitD06D03AbsDiff),
sd = sd(fitD06D03AbsDiff),
min = min(fitD06D03AbsDiff),
max = max(fitD06D03AbsDiff)
)
# Print summary statistics
print(fitD06D03_summary_stats)
# Perform a formal test for difference in means
fitD06D03_t_test_result <- t.test(fitD06D03AbsDiff ~ group, data = fitD06D03_combined_data)
print(fitD06D03_t_test_result)
Welch Two Sample t-test
data: fitD06D03AbsDiff by group
t = -72.579, df = 7431.6, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-0.8238030 -0.7804733
sample estimates:
mean in group Close mean in group Far
0.6984165 1.5005546
Lib15: MIC (0.5 ug/mL TMP)
tree_perfects15_5BCs_good_0.5tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD07D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD07D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_0.5tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD07D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD07D03 > -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD07D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD07D03 < -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD07D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD07D03 > -1: 246
cat("Number of unique IDs with fitD07D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD07D03 < -1: 162
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD07D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD07D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD07D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD07D03v1 = fitD07D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD07D03v2 = fitD07D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD07D03AbsDiff = abs(fitD07D03v1 - fitD07D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD07D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD07D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD07D03_results <- fitD07D03_dist_diff(fitD07D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD07D03_scatter_plot <- ggplot(fitD07D03_results, aes(x = AbsDistance, y = fitD07D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD07D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD07D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD07D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD07D03_pearson_scatter_plot.png",
fitD07D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD07D03_close <- fitD07D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD07D03_close$fitD07D03AbsDiff)
qqline(fitD07D03_close$fitD07D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD07D03_close$fitD07D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD07D03_far <- fitD07D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD07D03_far$fitD07D03AbsDiff)
qqline(fitD07D03_far$fitD07D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD07D03_far$fitD07D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD07D03 <- wilcox.test(fitD07D03_close$fitD07D03AbsDiff,
fitD07D03_far$fitD07D03AbsDiff)
print(wilcox_result_fitD07D03)
Wilcoxon rank sum test with continuity correction
data: fitD07D03_close$fitD07D03AbsDiff and fitD07D03_far$fitD07D03AbsDiff
W = 317479680, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Interpretation:
Significant Difference: The extremely low p-value (< 2.2e-16) indicates a statistically significant difference between the fitD07D03AbsDiff values of closely related taxa (fitnessD07D03_close) and more distantly related taxa (fitD07D03_far).
Distribution Comparison: The test suggests that the distribution of fitD07D03AbsDiff values is significantly different between close and far phylogenetic relationships.
Magnitude and Direction: While the test tells us there’s a significant difference, it doesn’t tell us about the magnitude or direction of this difference. You’d need to look at descriptive statistics (like medians) of both groups to understand this.
Biological Significance: This result suggests that the absolute differences in fitD07D03 values are not the same for closely related taxa compared to more distantly related taxa. This could imply that evolutionary distance does play a role in the similarity (or difference) of fitD07D03 values, despite the weak correlation you found earlier.
Further Investigation: It would be worth examining the actual distributions of these differences (e.g., with box plots) to understand how they differ. Are the differences generally larger or smaller for closely related taxa?
Caution: Remember that with very large sample sizes, even small differences can become statistically significant. It’s important to consider the practical or biological significance of these differences, not just the statistical significance.
This result adds nuance to your earlier findings, suggesting that while there might not be a strong linear correlation between phylogenetic distance and fitD07D03 differences, there is a significant difference in how these values vary among close relatives versus more distant relatives.
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD07D03_combined_data <- bind_rows(
mutate(fitD07D03_close, group = "Close"),
mutate(fitD07D03_far, group = "Far"))
# Remove NA values
fitD07D03_combined_data <- fitD07D03_combined_data %>%
filter(!is.na(fitD07D03AbsDiff))
# Create a box plot
fitD07D03_boxplot <- ggplot(fitD07D03_combined_data, aes(x = group, y = fitD07D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD07D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD07D03") +
theme_minimal()
# Display the plot
print(fitD07D03_boxplot)
# Calculate summary statistics
fitD07D03_summary_stats <- fitD07D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD07D03AbsDiff),
median = median(fitD07D03AbsDiff),
sd = sd(fitD07D03AbsDiff),
min = min(fitD07D03AbsDiff),
max = max(fitD07D03AbsDiff)
)
# Print summary statistics
print(fitD07D03_summary_stats)
# Perform a formal test for difference in means
fitD07D03_t_test_result <- t.test(fitD07D03AbsDiff ~ group, data = fitD07D03_combined_data)
print(fitD07D03_t_test_result)
Welch Two Sample t-test
data: fitD07D03AbsDiff by group
t = -63.505, df = 7020.5, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-1.263865 -1.188174
sample estimates:
mean in group Close mean in group Far
1.482027 2.708046
Interpretation:
Significant Difference: There is a statistically significant difference in the fitD07D03AbsDiff values between closely related taxa and more distantly related taxa.
Magnitude of Difference: On average, the absolute difference in fitD07D03 values is about 1.226 units smaller for closely related taxa compared to distantly related taxa.
Consistency of Difference: The narrow confidence interval (-1.263865 to -1.188174) suggests that this difference is quite consistent across the dataset.
Biological Significance: Closely related taxa tend to have more similar fitD07D03 values (smaller differences) compared to distantly related taxa. This suggests that the fitD07D03 trait shows some degree of phylogenetic signal or conservation.
Evolutionary Implications: This result indicates that evolutionary relatedness does play a role in the similarity of fitD07D03 values. Closely related species tend to have more similar values, which could be due to shared evolutionary history or similar environmental adaptations.
Contrast with Earlier Correlation: While earlier you found a weak correlation between phylogenetic distance and fitD07D03 differences, this t-test reveals a clear distinction between close and distant relationships. This highlights the importance of considering different analytical approaches, as they can reveal different aspects of the data.
Practical Significance: The difference in means (about 1.226) should be considered in the context of the overall range and biological meaning of fitD07D03 values to determine its practical significance.
In summary, this analysis provides strong evidence that closely related taxa have more similar fitD07D03 values compared to distantly related taxa. This supports the idea that there is a phylogenetic component to the fitD07D03 trait, even if it’s not captured well by a simple linear correlation with phylogenetic distance. The large sample size and highly significant result suggest that this is a robust finding, though you should always consider the biological context and potential confounding factors in your interpretation.
Lib15: MIC (1.0 ug/mL TMP)
tree_perfects15_5BCs_good_1.0tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD08D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD08D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_1.0tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD08D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD08D03 > -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD08D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD08D03 < -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD08D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD08D03 > -1: 226
cat("Number of unique IDs with fitD08D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD08D03 < -1: 183
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_1.0tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD08D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD08D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_1.0tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_1.0tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_1.0tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD08D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD08D03v1 = fitD08D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD08D03v2 = fitD08D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD08D03AbsDiff = abs(fitD08D03v1 - fitD08D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD08D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD08D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD08D03_results <- fitD08D03_dist_diff(fitD08D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD08D03_scatter_plot <- ggplot(fitD08D03_results, aes(x = AbsDistance, y = fitD08D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD08D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD08D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD08D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD08D03_pearson_scatter_plot.png",
fitD08D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD08D03_close <- fitD08D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD08D03_close$fitD08D03AbsDiff)
qqline(fitD08D03_close$fitD08D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD08D03_close$fitD08D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD08D03_far <- fitD08D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD08D03_far$fitD08D03AbsDiff)
qqline(fitD08D03_far$fitD08D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD08D03_far$fitD08D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD08D03 <- wilcox.test(fitD08D03_close$fitD08D03AbsDiff,
fitD08D03_far$fitD08D03AbsDiff)
print(wilcox_result_fitD08D03)
Wilcoxon rank sum test with continuity correction
data: fitD08D03_close$fitD08D03AbsDiff and fitD08D03_far$fitD08D03AbsDiff
W = 334730856, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD08D03_combined_data <- bind_rows(
mutate(fitD08D03_close, group = "Close"),
mutate(fitD08D03_far, group = "Far"))
# Remove NA values
fitD08D03_combined_data <- fitD08D03_combined_data %>%
filter(!is.na(fitD08D03AbsDiff))
# Create a box plot
fitD08D03_boxplot <- ggplot(fitD08D03_combined_data, aes(x = group, y = fitD08D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD08D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD08D03") +
theme_minimal()
# Display the plot
print(fitD08D03_boxplot)
# Calculate summary statistics
fitD08D03_summary_stats <- fitD08D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD08D03AbsDiff),
median = median(fitD08D03AbsDiff),
sd = sd(fitD08D03AbsDiff),
min = min(fitD08D03AbsDiff),
max = max(fitD08D03AbsDiff)
)
# Print summary statistics
print(fitD08D03_summary_stats)
# Perform a formal test for difference in means
fitD08D03_t_test_result <- t.test(fitD08D03AbsDiff ~ group, data = fitD08D03_combined_data)
print(fitD08D03_t_test_result)
Welch Two Sample t-test
data: fitD08D03AbsDiff by group
t = -51.84, df = 6728.6, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-1.246943 -1.156073
sample estimates:
mean in group Close mean in group Far
1.883948 3.085455
Lib15: MIC (10 ug/mL TMP)
tree_perfects15_5BCs_good_10tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD09D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD09D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_10tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD09D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD09D03 > -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD09D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD09D03 < -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD09D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD09D03 > -1: 128
cat("Number of unique IDs with fitD09D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD09D03 < -1: 276
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_10tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD09D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD09D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_10tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_10tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_10tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD09D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD09D03v1 = fitD09D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD09D03v2 = fitD09D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD09D03AbsDiff = abs(fitD09D03v1 - fitD09D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD09D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD09D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD09D03_results <- fitD09D03_dist_diff(fitD09D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD09D03_scatter_plot <- ggplot(fitD09D03_results, aes(x = AbsDistance, y = fitD09D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD09D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD09D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD09D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD09D03_pearson_scatter_plot.png",
fitD09D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD09D03_close <- fitD09D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD09D03_close$fitD09D03AbsDiff)
qqline(fitD09D03_close$fitD09D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD09D03_close$fitD09D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD09D03_far <- fitD09D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD09D03_far$fitD09D03AbsDiff)
qqline(fitD09D03_far$fitD09D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD09D03_far$fitD09D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD09D03 <- wilcox.test(fitD09D03_close$fitD09D03AbsDiff,
fitD09D03_far$fitD09D03AbsDiff)
print(wilcox_result_fitD09D03)
Wilcoxon rank sum test with continuity correction
data: fitD09D03_close$fitD09D03AbsDiff and fitD09D03_far$fitD09D03AbsDiff
W = 387510616, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD09D03_combined_data <- bind_rows(
mutate(fitD09D03_close, group = "Close"),
mutate(fitD09D03_far, group = "Far"))
# Remove NA values
fitD09D03_combined_data <- fitD09D03_combined_data %>%
filter(!is.na(fitD09D03AbsDiff))
# Create a box plot
fitD09D03_boxplot <- ggplot(fitD09D03_combined_data, aes(x = group, y = fitD09D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD09D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD09D03") +
theme_minimal()
# Display the plot
print(fitD09D03_boxplot)
# Calculate summary statistics
fitD09D03_summary_stats <- fitD09D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD09D03AbsDiff),
median = median(fitD09D03AbsDiff),
sd = sd(fitD09D03AbsDiff),
min = min(fitD09D03AbsDiff),
max = max(fitD09D03AbsDiff)
)
# Print summary statistics
print(fitD09D03_summary_stats)
# Perform a formal test for difference in means
fitD09D03_t_test_result <- t.test(fitD09D03AbsDiff ~ group, data = fitD09D03_combined_data)
print(fitD09D03_t_test_result)
Welch Two Sample t-test
data: fitD09D03AbsDiff by group
t = -14.539, df = 6131.3, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-0.7057103 -0.5380190
sample estimates:
mean in group Close mean in group Far
3.684020 4.305885
Lib15: MIC (50 ug/mL TMP)
tree_perfects15_5BCs_good_50tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD10D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD10D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_50tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD10D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD10D03 > -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD10D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD10D03 < -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD10D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD10D03 > -1: 80
cat("Number of unique IDs with fitD10D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD10D03 < -1: 321
Build the initial DHFR tree with color-scale based on fitness at COMPLEMENTATION and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_50tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD10D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD10D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_50tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_50tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_50tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD10D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD10D03v1 = fitD10D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD10D03v2 = fitD10D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD10D03AbsDiff = abs(fitD10D03v1 - fitD10D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD10D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD10D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD10D03_results <- fitD10D03_dist_diff(fitD10D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD10D03_scatter_plot <- ggplot(fitD10D03_results, aes(x = AbsDistance, y = fitD10D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD10D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD10D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD10D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD10D03_pearson_scatter_plot.png",
fitD10D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD10D03_close <- fitD10D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD10D03_close$fitD10D03AbsDiff)
qqline(fitD10D03_close$fitD10D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD10D03_close$fitD10D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD10D03_far <- fitD10D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD10D03_far$fitD10D03AbsDiff)
qqline(fitD10D03_far$fitD10D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD10D03_far$fitD10D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD10D03 <- wilcox.test(fitD10D03_close$fitD10D03AbsDiff,
fitD10D03_far$fitD10D03AbsDiff)
print(wilcox_result_fitD10D03)
Wilcoxon rank sum test with continuity correction
data: fitD10D03_close$fitD10D03AbsDiff and fitD10D03_far$fitD10D03AbsDiff
W = 389640364, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD10D03_combined_data <- bind_rows(
mutate(fitD10D03_close, group = "Close"),
mutate(fitD10D03_far, group = "Far"))
# Remove NA values
fitD10D03_combined_data <- fitD10D03_combined_data %>%
filter(!is.na(fitD10D03AbsDiff))
# Create a box plot
fitD10D03_boxplot <- ggplot(fitD10D03_combined_data, aes(x = group, y = fitD10D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD10D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD10D03") +
theme_minimal()
# Display the plot
print(fitD10D03_boxplot)
# Calculate summary statistics
fitD10D03_summary_stats <- fitD10D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD10D03AbsDiff),
median = median(fitD10D03AbsDiff),
sd = sd(fitD10D03AbsDiff),
min = min(fitD10D03AbsDiff),
max = max(fitD10D03AbsDiff)
)
# Print summary statistics
print(fitD10D03_summary_stats)
# Perform a formal test for difference in means
fitD10D03_t_test_result <- t.test(fitD10D03AbsDiff ~ group, data = fitD10D03_combined_data)
print(fitD10D03_t_test_result)
Welch Two Sample t-test
data: fitD10D03AbsDiff by group
t = -13.392, df = 6216.1, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-0.5300265 -0.3946694
sample estimates:
mean in group Close mean in group Far
2.880586 3.342934
Lib15: 400x MIC (200 ug/mL TMP)
tree_perfects15_5BCs_good_200tmp_fitness_labelled <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD11D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD11D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
#geom_tiplab2(aes(label=NCBI.phylum, angle=angle), size=1) +
theme(legend.position="none") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray"
) +
guides(color = guide_legend(title = "Fitness"))
tree_perfects15_5BCs_good_200tmp_fitness_labelled
Count the number of complementing homologs (fit > -1) and dropout homologs (fit < -1):
# Count unique IDs with fitD11D03 > -1
high_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD11D03 > -1) %>%
distinct(ID) %>%
nrow()
# Count unique IDs with fitD11D03 < -1
low_fitness_count <- Fittree15good_taxa_merged %>%
filter(fitD11D03 < -1) %>%
distinct(ID) %>%
nrow()
# Print the results
cat("Number of unique IDs with fitD11D03 > -1:", high_fitness_count, "\n")
Number of unique IDs with fitD11D03 > -1: 27
cat("Number of unique IDs with fitD11D03 < -1:", low_fitness_count, "\n")
Number of unique IDs with fitD11D03 < -1: 332
Build the Complementation-capable DHFR tree with color-scale based on fitness at 400x MIC and add an additional ring representing taxonomic lineages at the Phylum-level.
tree_perfects15_5BCs_good_200tmp_fitness_phylum_p1 <- ggtree(Fittree15good, layout="circular", branch.length="none") %<+%
Fittree15good_taxa_merged +
aes(color = ifelse(fitD11D03 < -1, "Low fitness", "High fitness")) +
geom_tippoint(aes(color = ifelse(fitD11D03 < -1, "Low fitness", "High fitness")), size=1, alpha=0.8) +
theme(legend.position="right") +
scale_color_manual(
"Fitness",
values = c("Low fitness" = "darkblue", "High fitness" = "gold"),
na.value = "gray")
tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2 <- tree_perfects15_5BCs_good_200tmp_fitness_phylum_p1 +
new_scale_fill() +
geom_fruit(
geom=geom_tile,
mapping=aes(fill=NCBI.phylum), # Changed this line
width=3,
offset=0.1
) +
scale_fill_manual(
name="Phylum",
values=setNames(Fittree15good_taxa_merged$NCBI.phylum_colors, Fittree15good_taxa_merged$NCBI.phylum),
guide=guide_legend(keywidth=0.3, keyheight=0.3, ncol=1)
) +
theme(
legend.title=element_text(size=10),
legend.text=element_text(size=8),
legend.spacing.y = unit(0.2, "cm")
)
tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2
The following statistical tests will determine if fitness clusters by evolutionary distance within the phylogenetic tree
# Create a function to calculate distances and differences between
fitD11D03_dist_diff <- function(pairwise_distances, taxa_merged) {
distance_df <- as.data.frame(as.table(pairwise_distances)) %>%
rename(Tip1 = Var1, Tip2 = Var2, Distance = Freq) %>%
mutate(Tip1 = as.character(Tip1),
Tip2 = as.character(Tip2)) %>%
filter(Tip1 != Tip2) # Remove self-comparisons
results <- distance_df %>%
left_join(taxa_merged, by = c("Tip1" = "ID")) %>%
rename(fitD11D03v1 = fitD11D03) %>%
left_join(taxa_merged, by = c("Tip2" = "ID")) %>%
rename(fitD11D03v2 = fitD11D03) %>%
mutate(
AbsDistance = abs(Distance),
fitD11D03AbsDiff = abs(fitD11D03v1 - fitD11D03v2)
) %>%
select(Tip1, Tip2, AbsDistance, fitD11D03AbsDiff)
return(results)}
Pearson Correlation: Calculate pairwise distances between tips and differences in fitness values. Plot distances and differences as scatter plot and test for significant correlation using Pearson.
# Calculate pairwise distances
fitD11D03_pairwise_distances <- cophenetic.phylo(Fittree15good)
# Calculate distances and differences
fitD11D03_results <- fitD11D03_dist_diff(fitD11D03_pairwise_distances,Fittree15good_taxa_merged)
# Create the scatter plot
fitD11D03_scatter_plot <- ggplot(fitD11D03_results, aes(x = AbsDistance, y = fitD11D03AbsDiff)) +
geom_point(alpha = 0.5) + # Add transparency to points
geom_smooth(method = "lm", color = "red", se = FALSE) + # Add a linear trend line
stat_cor(method = "pearson", p.accuracy = 0.000001, r.accuracy = 0.01) +
theme_minimal() +
labs(title = "Absolute Distance vs Absolute Difference in fitD11D03",
x = "Absolute Value of Distance",
y = "Absolute Value of Difference in fitD11D03") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
# Display the plot
print(fitD11D03_scatter_plot)
# Optional: Save the plot
ggsave("Perfects/PLOTS/Wilcox/fitD11D03_pearson_scatter_plot.png",
fitD11D03_scatter_plot, width = 10, height = 8, dpi = 300)
Close-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance less than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on closely related taxa
fitD11D03_close <- fitD11D03_results %>%
filter(AbsDistance < 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD11D03_close$fitD11D03AbsDiff)
qqline(fitD11D03_close$fitD11D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD11D03_close$fitD11D03AbsDiff, main = "Histogram of Close Data")
Far-Related Pairs: Subset to include only pairs of taxa with an absolute phylogenetic distance greater than 1. Graphical summaries include a Q-Q (Quantile-Quantile) plot and histrogram of the absolute difference values. The Q-Q plot is graphical method to assess if a dataset follows a normal distribution. The red line added by qqline() represents the expected line if the data were perfectly normally distributed. The histogram provides a visual representation of the distribution of the data.
# This focuses the analysis on distantly related taxa
fitD11D03_far <- fitD11D03_results %>%
filter(AbsDistance > 1)
# Graphical Summaries
# Create a Q-Q (Quantile-Quantile) plot
qqnorm(fitD11D03_far$fitD11D03AbsDiff)
qqline(fitD11D03_far$fitD11D03AbsDiff, col = "red")
# Create a histogram plot
hist(fitD11D03_far$fitD11D03AbsDiff, main = "Histogram of Far Data")
Wilcox Test: Perform Wilcoxon Rank Sum Test (also known as Mann-Whitney U test) comparing the absolute distance values between the close-related and far-related groups.
# Wilcox test between close and far related distances
wilcox_result_fitD11D03 <- wilcox.test(fitD11D03_close$fitD11D03AbsDiff,
fitD11D03_far$fitD11D03AbsDiff)
print(wilcox_result_fitD11D03)
Wilcoxon rank sum test with continuity correction
data: fitD11D03_close$fitD11D03AbsDiff and fitD11D03_far$fitD11D03AbsDiff
W = 283156404, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Interpretation:
Significant Difference: The extremely low p-value (< 2.2e-16) indicates a statistically significant difference between the fitD11D03AbsDiff values of closely related taxa (fitnessD07D03_close) and more distantly related taxa (fitD11D03_far).
Distribution Comparison: The test suggests that the distribution of fitD11D03AbsDiff values is significantly different between close and far phylogenetic relationships.
Magnitude and Direction: While the test tells us there’s a significant difference, it doesn’t tell us about the magnitude or direction of this difference. You’d need to look at descriptive statistics (like medians) of both groups to understand this.
Biological Significance: This result suggests that the absolute differences in fitD11D03 values are not the same for closely related taxa compared to more distantly related taxa. This could imply that evolutionary distance does play a role in the similarity (or difference) of fitD11D03 values, despite the weak correlation you found earlier.
Further Investigation: It would be worth examining the actual distributions of these differences (e.g., with box plots) to understand how they differ. Are the differences generally larger or smaller for closely related taxa?
Caution: Remember that with very large sample sizes, even small differences can become statistically significant. It’s important to consider the practical or biological significance of these differences, not just the statistical significance.
This result adds nuance to your earlier findings, suggesting that while there might not be a strong linear correlation between phylogenetic distance and fitD11D03 differences, there is a significant difference in how these values vary among close relatives versus more distant relatives.
Combine Data: Examine the distributions of these differences to understand how they differ and whether these differences are generally larger or smaller for closely related taxa.
# Combine the data and add a group label
fitD11D03_combined_data <- bind_rows(
mutate(fitD11D03_close, group = "Close"),
mutate(fitD11D03_far, group = "Far"))
# Remove NA values
fitD11D03_combined_data <- fitD11D03_combined_data %>%
filter(!is.na(fitD11D03AbsDiff))
# Create a box plot
fitD11D03_boxplot <- ggplot(fitD11D03_combined_data, aes(x = group, y = fitD11D03AbsDiff)) +
geom_boxplot() +
geom_jitter(alpha = 0.1, width = 0.2) + # Add individual points with some transparency
labs(title = "Distribution of fitD11D03 Differences",
x = "Phylogenetic Relationship",
y = "Absolute Difference in fitD11D03") +
theme_minimal()
# Display the plot
print(fitD11D03_boxplot)
# Calculate summary statistics
fitD11D03_summary_stats <- fitD11D03_combined_data %>%
group_by(group) %>%
summarise(
n = n(),
mean = mean(fitD11D03AbsDiff),
median = median(fitD11D03AbsDiff),
sd = sd(fitD11D03AbsDiff),
min = min(fitD11D03AbsDiff),
max = max(fitD11D03AbsDiff)
)
# Print summary statistics
print(fitD11D03_summary_stats)
# Perform a formal test for difference in means
fitD11D03_t_test_result <- t.test(fitD11D03AbsDiff ~ group, data = fitD11D03_combined_data)
print(fitD11D03_t_test_result)
Welch Two Sample t-test
data: fitD11D03AbsDiff by group
t = -26.178, df = 6206.9, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Close and group Far is not equal to 0
95 percent confidence interval:
-0.9692797 -0.8342237
sample estimates:
mean in group Close mean in group Far
2.238397 3.140148
Interpretation:
Significant Difference: There is a statistically significant difference in the fitD11D03AbsDiff values between closely related taxa and more distantly related taxa.
Magnitude of Difference: On average, the absolute difference in fitD11D03 values is about 1.226 units smaller for closely related taxa compared to distantly related taxa.
Consistency of Difference: The narrow confidence interval (-1.263865 to -1.188174) suggests that this difference is quite consistent across the dataset.
Biological Significance: Closely related taxa tend to have more similar fitD11D03 values (smaller differences) compared to distantly related taxa. This suggests that the fitD11D03 trait shows some degree of phylogenetic signal or conservation.
Evolutionary Implications: This result indicates that evolutionary relatedness does play a role in the similarity of fitD11D03 values. Closely related species tend to have more similar values, which could be due to shared evolutionary history or similar environmental adaptations.
Contrast with Earlier Correlation: While earlier you found a weak correlation between phylogenetic distance and fitD11D03 differences, this t-test reveals a clear distinction between close and distant relationships. This highlights the importance of considering different analytical approaches, as they can reveal different aspects of the data.
Practical Significance: The difference in means (about 1.226) should be considered in the context of the overall range and biological meaning of fitD11D03 values to determine its practical significance.
In summary, this analysis provides strong evidence that closely related taxa have more similar fitD11D03 values compared to distantly related taxa. This supports the idea that there is a phylogenetic component to the fitD11D03 trait, even if it’s not captured well by a simple linear correlation with phylogenetic distance. The large sample size and highly significant result suggest that this is a robust finding, though you should always consider the biological context and potential confounding factors in your interpretation.
# Remove legends
tmp0.0.tree <- tree_perfects15_5BCs_good_0.0tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("0.0 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp0.058.tree <- tree_perfects15_5BCs_good_0.058tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("0.058 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp0.5.tree <- tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("0.5 TMP \n(MIC)") +
theme(plot.title = element_text(hjust = 0.5))
tmp1.0.tree <- tree_perfects15_5BCs_good_1.0tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("1.0 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp10.tree <- tree_perfects15_5BCs_good_10tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("10 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp50.tree <- tree_perfects15_5BCs_good_50tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("50 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp200.tree <- tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2 +
theme(legend.position = 'none') +
ggtitle("200 TMP \n(400x MIC)") +
theme(plot.title = element_text(hjust = 0.5))
# Create the patchwork with the legend on the right
patch9 <- (tmp0.0.tree) /
(tmp0.058.tree | tmp0.5.tree | tmp1.0.tree) /
(tmp10.tree | tmp50.tree | tmp200.tree) +
plot_layout(heights = c(1, 1, 1))
patch9
Plot just the colored tree tips (no phylum ring)
# Remove legends
tmp0.0.tree.tips <- tree_perfects15_5BCs_good_0.0tmp_fitness_labelled +
ggtitle("0.0 TMP \n(Complementation)") +
theme(plot.title = element_text(hjust = 0.5))
tmp0.058.tree.tips <- tree_perfects15_5BCs_good_0.058tmp_fitness_labelled +
ggtitle("0.058 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp0.5.tree.tips <- tree_perfects15_5BCs_good_0.5tmp_fitness_labelled +
ggtitle("0.05 TMP \n(MIC)") +
theme(plot.title = element_text(hjust = 0.5))
tmp1.0.tree.tips <- tree_perfects15_5BCs_good_1.0tmp_fitness_labelled +
ggtitle("1.0 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp10.tree.tips <- tree_perfects15_5BCs_good_10tmp_fitness_labelled +
ggtitle("10 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp50.tree.tips <- tree_perfects15_5BCs_good_50tmp_fitness_labelled +
ggtitle("50 TMP") +
theme(plot.title = element_text(hjust = 0.5))
tmp200.tree.tips <- tree_perfects15_5BCs_good_200tmp_fitness_labelled +
ggtitle("200 TMP \n(400x MIC)") +
theme(plot.title = element_text(hjust = 0.5))
# Plot together
patch10 <- (tmp0.0.tree.tips) /
(tmp0.058.tree.tips | tmp0.5.tree.tips | tmp1.0.tree.tips) /
(tmp10.tree.tips | tmp50.tree.tips | tmp200.tree.tips) +
plot_layout(heights = c(1, 1, 1))
patch10
Plot the MIC and 400x MIC trees side-by-side:
# For the first tree
tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2_no_legend <- tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2 +
theme(legend.position="none") +
ggtitle("Codon 1 Library \nMIC (0.5 ug/mL TMP)") +
theme(plot.title = element_text(hjust = 0.5))
# For the second tree (this seems correct, but I'll include it for completeness)
tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2_no_legend <- tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2 +
theme(legend.position="right") +
ggtitle("Codon 1 Library \n400x MIC (200 ug/mL TMP)") +
theme(plot.title = element_text(hjust = 0.5))
# Combine the plots
patch11 <- tree_perfects15_5BCs_good_0.5tmp_fitness_phylum_p2_no_legend | tree_perfects15_5BCs_good_200tmp_fitness_phylum_p2_no_legend
patch11
Save the formatted perfects files to import for downstream analyses
# BCs15_map (62.6 MB)
write.csv(BCs15_map, "Perfects/perfects_files_formatted/BCs15_map.csv", row.names = FALSE)
# BCs16_map (80.5 MB)
write.csv(BCs16_map, "Perfects/perfects_files_formatted/BCs16_map.csv", row.names = FALSE)
###------------------------------------
# mutIDinfo15 (17 MB)
write.csv(mutIDinfo15, "Perfects/perfects_files_formatted/mutIDinfo15.csv", row.names = FALSE)
# mutIDinfo16 (14.7 MB)
write.csv(mutIDinfo16, "Perfects/perfects_files_formatted/mutIDinfo16.csv", row.names = FALSE)
###------------------------------------
# perfects15_5BCs (275 KB)
write.csv(perfects15_5BCs, "Perfects/perfects_files_formatted/perfects15_5BCs.csv",
row.names = FALSE)
# perfects16_5BCs (231 KB)
write.csv(perfects16_5BCs, "Perfects/perfects_files_formatted/perfects16_5BCs.csv",
row.names = FALSE)
# perfects_15_16_5BCs_tree (319 KB)
write.csv(perfects_15_16_5BCs_tree, "Perfects/perfects_files_formatted/perfects_15_16_5BCs_tree.csv",
row.names = FALSE)
###------------------------------------
# orginfo (2.1 MB)
write.csv(orginfo, "Perfects/perfects_files_formatted/orginfo.csv",
row.names = FALSE)
###------------------------------------
# Alltree15_taxa_merged (611 KB)
write.csv(Alltree15_taxa_merged, "Perfects/perfects_files_formatted/Alltree15_taxa_merged.csv",
row.names = FALSE)
###------------------------------------
# BCcontrols_15_16_shared_median_WT (435 Bytes)
write.csv(BCcontrols_15_16_shared_median_WT, "Perfects/perfects_files_formatted/BCcontrols_15_16_shared_median_WT.csv",
row.names = FALSE)
# BCcontrols_15_16_shared_median_Neg (697 Bytes)
write.csv(BCcontrols_15_16_shared_median_Neg, "Perfects/perfects_files_formatted/BCcontrols_15_16_shared_median_Neg.csv",
row.names = FALSE)
###------------------------------------
# Fittree15good_taxa_merged (256 KB)
write.csv(Fittree15good_taxa_merged,
"Perfects/perfects_files_formatted/Fittree15good_taxa_merged.csv",
row.names = FALSE)
###------------------------------------
# ncbi_taxa (353.1 MB)
write.csv(ncbi_taxa,
"Perfects/perfects_files_formatted/ncbi_taxa.csv",
row.names = FALSE)
The session information is provided for full reproducibility.
devtools::session_info()
─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
setting value
version R version 4.3.2 (2023-10-31)
os macOS 15.2
system aarch64, darwin20
ui RStudio
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/Los_Angeles
date 2025-01-23
rstudio 2024.09.0+375 Cranberry Hibiscus (desktop)
pandoc 3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
abind 1.4-5 2016-07-21 [1] CRAN (R 4.3.0)
ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.0)
ape * 5.8 2024-04-11 [1] CRAN (R 4.3.1)
aplot 0.2.2 2023-10-06 [1] CRAN (R 4.3.1)
backports 1.4.1 2021-12-13 [1] CRAN (R 4.3.0)
bio3d * 2.4-5 2024-10-29 [1] CRAN (R 4.3.3)
BiocGenerics * 0.46.0 2023-06-04 [1] Bioconductor
Biostrings * 2.68.1 2023-05-21 [1] Bioconductor
bitops 1.0-7 2021-04-24 [1] CRAN (R 4.3.0)
broom 1.0.7 2024-09-26 [1] CRAN (R 4.3.3)
cachem 1.0.8 2023-05-01 [1] CRAN (R 4.3.0)
car 3.1-3 2024-09-27 [1] CRAN (R 4.3.3)
carData 3.0-5 2022-01-06 [1] CRAN (R 4.3.0)
castor * 1.8.0 2024-01-09 [1] CRAN (R 4.3.1)
cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.1)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.3.1)
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0)
cowplot * 1.1.3 2024-01-22 [1] CRAN (R 4.3.1)
crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0)
devtools * 2.4.5 2022-10-11 [1] CRAN (R 4.3.0)
digest 0.6.35 2024-03-11 [1] CRAN (R 4.3.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.3.0)
evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.1)
fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
Formula 1.2-5 2023-02-24 [1] CRAN (R 4.3.0)
fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
GenomeInfoDb * 1.36.4 2023-10-08 [1] Bioconductor
GenomeInfoDbData 1.2.10 2023-09-13 [1] Bioconductor
ggExtra * 0.10.1 2023-08-21 [1] CRAN (R 4.3.0)
ggfun 0.1.4 2024-01-19 [1] CRAN (R 4.3.1)
ggnewscale * 0.4.10 2024-02-08 [1] CRAN (R 4.3.1)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
ggplotify 0.1.2 2023-08-09 [1] CRAN (R 4.3.0)
ggpubr * 0.6.0 2023-02-10 [1] CRAN (R 4.3.0)
ggridges * 0.5.6 2024-01-23 [1] CRAN (R 4.3.1)
ggsignif 0.6.4 2022-10-13 [1] CRAN (R 4.3.0)
ggtree * 3.8.2 2023-07-30 [1] Bioconductor
ggtreeExtra * 1.10.0 2023-04-25 [1] Bioconductor
glmnet * 4.1-8 2023-08-22 [1] CRAN (R 4.3.0)
glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1)
gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.3.0)
gridGraphics 0.5-1 2020-12-13 [1] CRAN (R 4.3.0)
gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
httpuv 1.6.15 2024-03-26 [1] CRAN (R 4.3.1)
igraph * 2.0.3 2024-03-13 [1] CRAN (R 4.3.1)
IRanges * 2.34.1 2023-07-02 [1] Bioconductor
isoband 0.2.7 2022-12-20 [1] CRAN (R 4.3.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
knitr * 1.45 2023-10-30 [1] CRAN (R 4.3.1)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
later 1.3.2 2023-12-06 [1] CRAN (R 4.3.1)
lattice 0.22-6 2024-03-20 [1] CRAN (R 4.3.1)
lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.3.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
MASS 7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
Matrix * 1.6-5 2024-01-11 [1] CRAN (R 4.3.1)
matrixStats * 1.3.0 2024-04-11 [1] CRAN (R 4.3.1)
memoise 2.0.1 2021-11-26 [1] CRAN (R 4.3.0)
mgcv 1.9-1 2023-12-21 [1] CRAN (R 4.3.1)
mime 0.12 2021-09-28 [1] CRAN (R 4.3.0)
miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.3.0)
munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
naturalsort 0.1.3 2016-08-30 [1] CRAN (R 4.3.0)
nlme 3.1-164 2023-11-27 [1] CRAN (R 4.3.1)
patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.3.1)
pheatmap * 1.0.12 2019-01-04 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgbuild 1.4.4 2024-03-17 [1] CRAN (R 4.3.1)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
pkgload 1.3.4 2024-01-16 [1] CRAN (R 4.3.1)
plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1)
png 0.1-8 2022-11-29 [1] CRAN (R 4.3.0)
profvis 0.3.8 2023-05-02 [1] CRAN (R 4.3.0)
promises 1.3.0 2024-04-05 [1] CRAN (R 4.3.1)
pscl * 1.5.9 2024-01-31 [1] CRAN (R 4.3.1)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
ragg 1.3.0 2024-03-13 [1] CRAN (R 4.3.1)
RColorBrewer * 1.1-3 2022-04-03 [1] CRAN (R 4.3.0)
Rcpp * 1.0.13 2024-07-17 [1] CRAN (R 4.3.3)
RCurl 1.98-1.14 2024-01-09 [1] CRAN (R 4.3.1)
remotes 2.5.0 2024-03-17 [1] CRAN (R 4.3.1)
reshape * 0.8.9 2022-04-12 [1] CRAN (R 4.3.0)
reshape2 * 1.4.4 2020-04-09 [1] CRAN (R 4.3.0)
reticulate * 1.36.1 2024-04-22 [1] CRAN (R 4.3.1)
rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.1)
rmarkdown 2.26 2024-03-05 [1] CRAN (R 4.3.1)
ROCR * 1.0-11 2020-05-02 [1] CRAN (R 4.3.0)
RSpectra 0.16-1 2022-04-24 [1] CRAN (R 4.3.0)
rstatix * 0.7.2 2023-02-01 [1] CRAN (R 4.3.0)
rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
S4Vectors * 0.38.2 2023-09-24 [1] Bioconductor
scales * 1.3.0 2023-11-28 [1] CRAN (R 4.3.1)
seqinr * 4.2-36 2023-12-08 [1] CRAN (R 4.3.1)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
shape 1.4.6.1 2024-02-23 [1] CRAN (R 4.3.1)
shiny 1.8.1.1 2024-04-02 [1] CRAN (R 4.3.1)
stringi * 1.8.3 2023-12-11 [1] CRAN (R 4.3.1)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
survival 3.5-8 2024-02-14 [1] CRAN (R 4.3.1)
systemfonts 1.0.6 2024-03-07 [1] CRAN (R 4.3.1)
textshaping 0.3.7 2023-10-09 [1] CRAN (R 4.3.1)
tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
tidytree * 0.4.6 2023-12-12 [1] CRAN (R 4.3.1)
treeio 1.24.3 2023-07-30 [1] Bioconductor
urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.3.0)
usethis * 2.2.3 2024-02-19 [1] CRAN (R 4.3.1)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
viridis * 0.6.5 2024-01-29 [1] CRAN (R 4.3.1)
viridisLite * 0.4.2 2023-05-02 [1] CRAN (R 4.3.0)
withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.1)
xfun 0.43 2024-03-25 [1] CRAN (R 4.3.1)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.3.0)
XVector * 0.40.0 2023-05-08 [1] Bioconductor
yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.1)
yulab.utils 0.1.4 2024-01-28 [1] CRAN (R 4.3.1)
zlibbioc 1.46.0 2023-05-08 [1] Bioconductor
[1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────