R Notebook: Provides reproducible analysis for Dial-Out Variant Fitness in the following manuscript:

Citation: Romanowicz KJ, Resnick C, Hinton SR, Plesa C. (2025) Exploring antibiotic resistance in diverse homologs of the dihydrofolate reductase protein family through broad mutational scanning. bioRxiv.

GitHub Repository: https://github.com/PlesaLab/DHFR

NCBI BioProject: https://www.ncbi.nlm.nih.gov/bioproject/1189478

Experiment

This pipeline processes a library of 1,536 DHFR homologs and their associated mutants, with two-fold redundancy (two codon variants per sequence). Fitness scores are derived from a multiplexed in-vivo assay using a trimethoprim concentration gradient, assessing the ability of these homologs and their mutants to complement functionality in an E. coli knockout strain and their tolerance to trimethoprim treatment. This analysis provides insights into how antibiotic resistance evolves across a range of evolutionary starting points. Sequence data were generated using the Illumina NovaSeq platform with 100 bp paired-end sequencing of amplicons.

Methods overview to achieve a broad-mutational scan for DHFR homologs.
Methods overview to achieve a broad-mutational scan for DHFR homologs.

Packages

The following R packages must be installed prior to loading into the R session. See the Reproducibility tab for a complete list of packages and their versions used in this workflow.

# Load the latest version of python (3.10.14) for downstream use:
library(reticulate)
use_python("/Users/krom/miniforge3/bin/python3")

# Make a vector of required packages
required.packages <- c("ape", "bio3d", "Biostrings", "broom", "castor", "cowplot", "devtools", "dplyr", "ggExtra", "ggnewscale", "ggplot2", "ggridges", "ggtree", "ggtreeExtra", "glmnet", "gridExtra","igraph", "knitr", "lubridate", "matrixStats", "patchwork", "pheatmap", "purrr", "pscl", "RColorBrewer", "readxl", "reshape","reshape2", "ROCR", "seqinr", "scales", "stringr", "stringi", "tidyr", "tidytree", "viridis")

# Load required packages with error handling
loaded.packages <- lapply(required.packages, function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package, dependencies = TRUE)
    if (!require(package, character.only = TRUE)) {
      message("Package ", package, " could not be installed and loaded.")
      return(NULL)
    }
  }
  return(package)
})

# Remove NULL entries from loaded packages
loaded.packages <- loaded.packages[!sapply(loaded.packages, is.null)]
Loaded packages: ape, bio3d, Biostrings, broom, castor, cowplot, devtools, dplyr, ggExtra, ggnewscale, ggplot2, ggridges, ggtree, ggtreeExtra, glmnet, gridExtra, igraph, knitr, lubridate, matrixStats, patchwork, pheatmap, purrr, pscl, RColorBrewer, readxl, reshape, reshape2, ROCR, seqinr, scales, stringr, stringi, tidyr, tidytree, viridis 

Import Data Files

Import COUNT files generated from DHFR.2.Counts.RMD relevant for downstream analysis.

# BCcontrols_15_median
BCcontrols_15_median <- read.csv("Count/count_files_formatted/BCcontrols_15_median.csv", 
                         header = TRUE, stringsAsFactors = FALSE)

Import PERFECTS files generated from DHFR.3.Perfects.RMD relevant for downstream analysis.

# perfects15_5BCs
perfects15_5BCs <- read.csv("Perfects/perfects_files_formatted/perfects15_5BCs.csv", 
                         header = TRUE, stringsAsFactors = FALSE)

Dialout Analysis

Import Excel Files

This is the file with the plate reader data:

# REPLACE THIS FILE NAME FOR EACH ANALYSIS
filename_of_plate_reader_data = "Dialout/RAW/02-08-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.Rnd2.xlsx"

#M9-Full Plate 1: "01-24-24_KR_DHFR_Fitness_Assay_M9_Full_Supp.Plate1.xlsx"
#M9-TMP Plate 2: "01-31-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate2.xlsx"
#M9-TMP Plate 3: "02-01-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate3.xlsx"
#M9-TMP Plate 4: "02-02-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.xlsx"

#Re-do of M9-TMP Plates:
#M9-TMP Plate 2: "02-06-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate2.Rnd2.xlsx"
#M9-TMP Plate 3: "02-07-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate3.Rnd2.xlsx"
#M9-TMP Plate 4: "02-08-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.Rnd2.xlsx"

This is the range of data in the excel file to import

# UPDATE THIS DATA RANGE FOR EACH EXCEL FILE YOU IMPORT
range_od = "B41:CI282"

#M9-Full Plate 1 OD-range: B41:AM282
#M9-TMP Plate 2 OD-range: B41:CI276
#M9-TMP Plate 3 OD-range: B41:CI279
#M9-TMP Plate 4 OD-range: B41:CI282

#Re-do of M9-TMP Plates:
#M9-TMP Plate 2 OD-range: B41:CI282
#M9-TMP Plate 3 OD-range: B41:CI276
#M9-TMP Plate 4 OD-range: B41:CI282
#get the data
od_data1 <- as.data.frame(read_excel(filename_of_plate_reader_data, sheet = 1, range = range_od))
#fix the time (makes minutes from start)
od_data <- od_data1 %>%
  mutate(time_int=as.duration(interval(start = od_data1$Time[1], end=Time))) %>%
  mutate(round_time=ceiling(as.numeric(time_int)/60)) %>%
  subset(select = -Time) %>%
  subset(select = -time_int) %>%
  relocate(round_time) %>%
  dplyr::rename(Time=round_time) %>%
  t() %>%
  as.data.frame(.)
#make vector variables for plotting
M = nrow(od_data)
N = ncol(od_data)
#make a vector of your time points
ODtime <- od_data[1,1:N]

Sample Growth Rate Plots

This command loads the script (find_gr.R) that contains the analysis functions. This file needs to be in the present working directory.

source("Dialout/Scripts/find_gr.R")
#define initial growth rates
growth.rates = NULL
#this variable will store all the data
gr_table <- data.frame(sample_name=character(), 
                       doubling.time=double(), 
                       m=double(),
                       r2=double(),
                       lag.t=double())
#convert time_vector to numeric variable
time_vector = as.numeric(ODtime[1,])

Generate growth rate plots from each sampling well and save as .png files:

#run through all samples and fit
for (i in 2:M) {
  print(i)
  sample_nm <- row.names(od_data)[i]
  
  # Clean the sample name to remove or replace problematic characters
  clean_sample_nm <- gsub("[^[:alnum:]_-]", "_", sample_nm)
  
  # Open a PNG device with the cleaned filename
  png_filename <- file.path("Dialout/PLOTS", paste0(clean_sample_nm, "_growth_rate_plot.png"))
  png(filename = png_filename, width = 800, height = 600)
  
  #this is where the fit happens:
  gr <- findgr(od_data[i, 1:N], time_vector, sample_nm, int=30, r2=0.6) 
  
  de <- list(sample_name=sample_nm, 
             doubling.time=log2(2)/gr["m"], 
             m=gr["m"],
             r2=gr["r2"],
             lag.t=gr["lag.t"])
  
  #add this to previously calculated wells
  gr_table <- rbind(gr_table, de, stringsAsFactors=FALSE)
  growth.rates <- rbind(growth.rates, gr)
  
  # Close the PNG device
  dev.off()
}
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86

Display the plots in the RMD:

#run through all samples and fit
for (i in 2:M) {
  print(i)
  sample_nm=row.names(od_data)[i]
  
  #this is where the fit happens:
  gr <- findgr(od_data[i, 1:N], time_vector, sample_nm, int=30, r2=0.6) 
  #1 in od_data[i, 1:N] is the column number where the data starts;  
  #int is number of points taken at one time as an interval to find the highest slope; 
  #vary (i.e. lower) r2, i.e. rsquared as needed, blanks can be a problem here
  
  #print(gr["m"])
  de <- list(sample_name=sample_nm, 
             doubling.time=log2(2)/gr["m"], 
             m=gr["m"],
             r2=gr["r2"],
             lag.t=gr["lag.t"])
  #print(de)
  
  #add this to previously calculated wells
  gr_table = rbind(gr_table,de, stringsAsFactors=FALSE)
  growth.rates <- rbind(growth.rates, gr)
}
[1] 2
[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] 15

[1] 16

[1] 17

[1] 18

[1] 19

[1] 20

[1] 21

[1] 22

[1] 23

[1] 24

[1] 25

[1] 26

[1] 27

[1] 28

[1] 29

[1] 30

[1] 31

[1] 32

[1] 33

[1] 34

[1] 35
Warning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliableWarning: essentially perfect fit: summary may be unreliable

[1] 36

[1] 37

[1] 38

[1] 39

[1] 40

[1] 41

[1] 42

[1] 43

[1] 44

[1] 45

[1] 46

[1] 47

[1] 48

[1] 49

[1] 50

[1] 51

[1] 52

[1] 53

[1] 54

[1] 55

[1] 56

[1] 57

[1] 58

[1] 59

[1] 60

[1] 61

[1] 62

[1] 63

[1] 64

[1] 65

[1] 66

[1] 67

[1] 68

[1] 69

[1] 70

[1] 71

[1] 72

[1] 73

[1] 74

[1] 75

[1] 76

[1] 77

[1] 78

[1] 79

[1] 80

[1] 81

[1] 82

[1] 83

[1] 84

[1] 85

[1] 86

dev.off()
null device 
          1 

Write the data to a file

# UPDATE THE TABLE NAME IF YOU RE-RUN MULTIPLE EXCEL FILES

# Substitute your desired file name for 'growth_rates'
write.table(gr_table, "Dialout/OUTPUT/growth_rates_plate4.txt", sep="\t", quote=F, row.names=F) 

Full Growth Rate Plot

Load the final fitness data for all 12 Dialout samples based on calculated growth rates:

# For 12 samples (including WT and mCherry controls)
grwell <- read.table(file = 'Dialout/Raw/growth_rates_all.v2.csv', sep =',', header = TRUE)

Reshape the data for plotting:

# Convert Construct to a factor with desired levels
grwell$Media <- factor(grwell$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples
grwell$Construct <- factor(grwell$Construct, levels = c("NP_267306", "WP_000637209", "WP_008976421",
                                                        "WP_003776922", "WP_008578924", "WP_002897636",
                                                        "WP_000162453", "WP_003012456", "WP_000162462",
                                                        "WP_003027976", "WT-DHFR", "mCherry"))

Growth Rate Plot for DHFR Dialout Variants:

# Convert Media to a factor with desired levels
grwell$Media <- factor(grwell$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples in the desired order
construct_order <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924", "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976", "WT-DHFR", "mCherry")

grwell$Construct <- factor(grwell$Construct, levels = construct_order)

# Create a named vector for facet colors
facet_colors <- setNames(c(rep("lightgray", 10), "blue", "red"), construct_order)

# Custom function to reorder facets
reorder_facets <- function(x) factor(x, levels = construct_order)

# Calculate the overall y-axis range
y_min <- min(grwell$m, na.rm = TRUE)
y_max <- max(grwell$m, na.rm = TRUE)

# Calculate mean growth rate for each Construct-Media combination
grwell_mean <- grwell %>%
  group_by(Construct, Media) %>%
  summarize(mean_m = mean(m, na.rm = TRUE), .groups = "drop")

Growth Rate Plot

# Growth rate plot
fitness.plot.v3 <- ggplot(grwell_mean, aes(x = Media, y = mean_m, group = Construct)) +
  geom_point() +
  geom_line() +
  labs(title = "Growth Rate Assay for DHFR Dial-Out Variants") +
  facet_wrap(~ reorder_facets(Construct), scales = "fixed", ncol = 4) +
  theme_cowplot(16) +
  theme(
    title = element_text(color = "black", face = "bold", size=14),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=10),
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size=10),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 12),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  ylab(expression("Mean Growth Rate (min"^"-1"*")")) +
  xlab("Media") +
  coord_cartesian(ylim = c(y_min, y_max))

# Add colored backgrounds for all facets
fitness.plot.v3 <- fitness.plot.v3 +
  geom_rect(data = data.frame(Construct = construct_order),
            aes(fill = Construct),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Remove y-axis labels for all but the left-most facets
fitness.plot.v3 <- fitness.plot.v3 +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y.left = element_text(size = 10),
    axis.ticks.y.left = element_line(),
    strip.text.x = element_text(size = 10, margin = margin(b = 5, t = 5))
  )

# Print the plot
print(fitness.plot.v3)

Calculate a Mean growth rate value at each TMP concentration for each dialout variant:

# Create the summary table
grwell_summary <- grwell_mean %>%
  pivot_wider(
    names_from = Media,
    values_from = mean_m
  ) %>%
  select(Construct, `M9-Supp`, `0-tmp`, `0.058-tmp`, `0.5-tmp`, `1.0-tmp`, `10-tmp`, `50-tmp`, `200-tmp`) %>%
  arrange(factor(Construct, levels = construct_order))

# Round all numeric columns to 6 decimal places
grwell_summary <- grwell_summary %>%
  mutate(across(where(is.numeric), ~round(., 6)))

# Display the table
kable(grwell_summary, format = "markdown", caption = "Summary of Mean Growth Rates (1/min) for DHFR Variants")
Summary of Mean Growth Rates (1/min) for DHFR Variants
Construct M9-Supp 0-tmp 0.058-tmp 0.5-tmp 1.0-tmp 10-tmp 50-tmp 200-tmp
NP_267306 0.004930 0.003574 0.003481 0.003474 0.003458 0.003451 0.003261 0.002857
WP_000637209 0.005100 0.003875 0.003450 0.003535 0.003430 0.003372 0.003391 0.002943
WP_008976421 0.005272 0.003704 0.003665 0.003435 0.003079 0.000975 0.000456 0.000000
WP_003776922 0.002708 0.000623 0.000280 0.000072 0.000000 0.000104 0.000000 0.000000
WP_008578924 0.004743 0.003655 0.003617 0.003613 0.003576 0.003520 0.003415 0.003176
WP_002897636 0.004477 0.003861 0.003834 0.003860 0.003842 0.003850 0.003693 0.003384
WP_000162453 0.004793 0.003854 0.003811 0.003811 0.003695 0.003689 0.003466 0.002736
WP_003012456 0.005081 0.003778 0.003558 0.003158 0.002784 0.000652 0.000390 0.000107
WP_000162462 0.004432 0.003734 0.003724 0.003697 0.003642 0.003245 0.002384 0.001157
WP_003027976 0.005033 0.003697 0.003533 0.003527 0.003564 0.003417 0.003113 0.002163
WT-DHFR 0.004638 0.003346 0.001960 0.000526 0.000311 0.000103 0.000000 0.000000
mCherry 0.000000 0.000000 0.000000 0.000000 0.000083 0.000275 0.000000 0.000000

Dialout Variant Fitness

Start by pulling out the Dialout variants from the perfects15_5BCs dataset:

# Specify the mutID values you want to keep
dialout_mutIDs <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924",
                    "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976")

# Subset the dataframe, keeping only columns 1:9
dialout_perfects15_5BCs <- perfects15_5BCs[perfects15_5BCs$mutID %in% dialout_mutIDs, 1:9]

# Rename the columns
colnames(dialout_perfects15_5BCs) <- c("mutID","M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")

Add the Controls fitness to the Dialout fitness dataset:

# Rename controls columns
colnames(BCcontrols_15_median) <- c("mutID", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")

# First, add the M9-Supp column with NA values to BCcontrols_15_median
BCcontrols_15_median <- BCcontrols_15_median %>%
  mutate(`M9-Supp` = NA)

# Now merge the two dataframes
dialout_ctrls_fitness <- bind_rows(
  dialout_perfects15_5BCs,
  BCcontrols_15_median)

# Reorder columns to match the original order
column_order <- c("mutID", "M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  select(all_of(column_order))

# Remove the row where mutID is "D27N"
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  filter(mutID != "D27N")

# Rename "WT" to "WT-DHFR"
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  mutate(mutID = ifelse(mutID == "WT", "WT-DHFR", mutID))

# Round all numeric columns to 6 decimal places
dialout_ctrls_fitness_summary <- dialout_ctrls_fitness %>%
  mutate(across(where(is.numeric), ~round(., 6)))

# Display the table
kable(dialout_ctrls_fitness_summary, format = "markdown", caption = "Summary of Median Fitness for Pooled DHFR Variants")
Summary of Median Fitness for Pooled DHFR Variants
mutID M9-Supp 0-tmp 0.058-tmp 0.5-tmp 1.0-tmp 10-tmp 50-tmp 200-tmp
NP_267306 -0.577521 -1.615358 -1.380216 -0.925626 -0.766846 0.545330 1.742130 1.439774
WP_000162453 0.863386 0.224312 0.567447 1.064629 1.210018 0.702456 -2.305182 -7.805952
WP_000162462 0.911336 0.505794 0.747209 0.525657 0.229753 -4.137980 -4.706285 -8.743955
WP_000637209 0.584842 0.329937 0.663696 1.228697 1.532788 2.329915 0.408856 -7.660775
WP_002897636 1.022996 0.691549 1.053154 1.701834 2.053830 3.327449 2.658230 -4.688964
WP_003012456 0.785658 0.001135 -0.563819 -2.483255 -3.497845 -6.975202 -6.195765 -9.118302
WP_003027976 0.763870 0.260849 -0.776827 -4.106876 -5.236413 -8.447001 -6.917008 -8.570287
WP_003776922 -3.892368 -2.031637 -2.108692 -2.291793 -2.056731 -3.028958 -3.668410 -3.563152
WP_008578924 0.967376 0.274501 0.642541 1.351745 1.715205 3.117568 4.696570 6.159919
WP_008976421 0.938040 0.501459 0.791208 1.020815 0.950342 -0.341536 -1.460952 -6.890267
WT-DHFR NA -1.000283 -4.530458 -6.543512 -6.415804 -7.986620 -6.272670 -9.817412
mCherry NA -1.290412 -1.566887 -1.748018 -1.541840 -2.259923 -2.158114 -4.440523

Fitness Plot

Reshape data prior to plotting

# Reshape the data from wide to long format
long_data <- dialout_ctrls_fitness %>%
  pivot_longer(
    cols = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"),
    names_to = "Media",
    values_to = "fitness"
  )

# Convert Media to a factor with desired levels
long_data$Media <- factor(long_data$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples in the desired order
construct_order <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924", "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976", "WT-DHFR", "mCherry")

long_data$mutID <- factor(long_data$mutID, levels = construct_order)

# Create a named vector for facet colors
facet_colors <- setNames(c(rep("lightgray", 10), "blue", "red"), construct_order)

# Custom function to reorder facets
reorder_facets <- function(x) factor(x, levels = construct_order)

# Calculate the overall y-axis range
y_min <- min(long_data$fitness, na.rm = TRUE)
y_max <- max(long_data$fitness, na.rm = TRUE)

Plot the fitness values across the TMP gradient for each dialout variant:

# Fitness plot
fitness.plot.v4 <- ggplot(long_data, aes(x = Media, y = fitness, group = mutID)) +
  geom_hline(yintercept = -1, linetype = "dashed", color = "red", size = 0.5) +
  geom_point() +
  geom_line() +
  labs(title = "Pooled Fitness Assay for DHFR Dial-Out Variants") +
  facet_wrap(~ reorder_facets(mutID), scales = "fixed", ncol = 4) +
  theme_cowplot(16) +
  theme(
    title = element_text(color = "black", face = "bold", size=14),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=10),
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size=10),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 12),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  ylab("Median Fitness (LogFC)") +
  xlab("Media") +
  coord_cartesian(ylim = c(y_min, y_max))

# Add colored backgrounds for all facets
fitness.plot.v4 <- fitness.plot.v4 +
  geom_rect(data = data.frame(mutID = construct_order),
            aes(fill = mutID),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Remove y-axis labels for all but the left-most facets
fitness.plot.v4 <- fitness.plot.v4 +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y.left = element_text(size = 10),
    axis.ticks.y.left = element_line(),
    strip.text.x = element_text(size = 10, margin = margin(b = 5, t = 5))
  )

# Print the plot
print(fitness.plot.v4)

Side-by-Side Plots

Plot Fitness and Growth Rates side-by-side

patch1 <- fitness.plot.v4 | fitness.plot.v3
patch1

Combined Datasets for Plotting Together

# Prepare long_data (Fitness data)
long_data_renamed <- long_data %>%
  rename(Construct = mutID, value = fitness) %>%
  mutate(assay_type = "Fitness")

# Check the range of Fitness values
print("Range of Fitness values:")
[1] "Range of Fitness values:"
print(range(long_data_renamed$value, na.rm = TRUE))
[1] -9.817412  6.159919
# Prepare grwell_mean data (Growth Rate data)
grwell_mean_prepared <- grwell_mean %>%
  rename(value = mean_m) %>%
  mutate(
    Type = case_when(
      Construct %in% c("WT-DHFR", "mCherry") ~ "Control",
      TRUE ~ "Variant"
    ),
    assay_type = "Growth Rate"
  )
# Combine the datasets
combined_data <- bind_rows(long_data_renamed, grwell_mean_prepared)

# Verify the data
print("Combined data summary:")
[1] "Combined data summary:"
print(table(combined_data$assay_type, combined_data$Type, useNA = "ifany"))
             
              Control Variant <NA>
  Fitness           0       0   96
  Growth Rate      16      80    0
# Check for any extreme values or outliers
print("Summary of 'value' column:")
[1] "Summary of 'value' column:"
print(summary(combined_data$value))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
-9.817412 -0.508248  0.003403 -0.742356  0.005007  6.159919         2 
# Create the combined plot
combined_plot <- ggplot(combined_data, aes(x = Media, y = value, group = interaction(Construct, assay_type), color = assay_type)) +
  geom_point() +
  geom_line() +
  geom_hline(data = filter(combined_data, assay_type == "Fitness"), 
             aes(yintercept = -1), linetype = "dashed", color = "red", size = 0.5) +
  facet_grid(assay_type ~ reorder_facets(Construct), scales = "free_y") +
  theme_cowplot(16) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size = 10),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 10),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  labs(title = "Growth Rate and Pooled Fitness Assays for DHFR Dial-Out Variants",
       x = "Media",
       y = "Value") +
  scale_color_manual(values = c("Fitness" = "blue", "Growth Rate" = "red"))

# Add colored backgrounds for all facets
combined_plot <- combined_plot +
  geom_rect(data = data.frame(Construct = construct_order),
            aes(fill = Construct),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Print the plot
print(combined_plot)

Combined Plots

All Dial-outs

Plot data together with separate y-axes

# Function to create a dual-axis plot for each Construct
create_dual_axis_plot <- function(data, construct) {
  # Remove M9-Supp media for mCherry and WT-DHFR
  if (construct %in% c("mCherry", "WT-DHFR")) {
    data <- data %>% filter(!(Media == "M9-Supp"))
  }
  
  fitness_data <- filter(data, assay_type == "Fitness")
  growth_rate_data <- filter(data, assay_type == "Growth Rate")
  
  # Calculate the ratio for the secondary axis
  ratio <- diff(range(fitness_data$value)) / diff(range(growth_rate_data$value))
  
  ggplot() +
    # Fitness data
    geom_line(data = fitness_data, aes(x = Media, y = value, group = 1), color = "blue") +
    geom_point(data = fitness_data, aes(x = Media, y = value), color = "blue") +
    # Growth Rate data
    geom_line(data = growth_rate_data, aes(x = Media, y = value * ratio, group = 1), color = "red") +
    geom_point(data = growth_rate_data, aes(x = Media, y = value * ratio), color = "red") +
    # Horizontal line at y = -1 for Fitness
    geom_hline(yintercept = -1, linetype = "dashed", color = "gray", size = 0.5) +
    # Axes and labels
    scale_y_continuous(
      name = "Fitness",
      sec.axis = sec_axis(~./ratio, name = "Growth Rate")
    ) +
    theme_classic() +
    theme(
      axis.line.y.right = element_line(color = "red"),
      axis.ticks.y.right = element_line(color = "red"),
      axis.text.y.right = element_text(color = "red"),
      axis.title.y.right = element_text(color = "red"),
      axis.line.y.left = element_line(color = "blue"),
      axis.ticks.y.left = element_line(color = "blue"),
      axis.text.y.left = element_text(color = "blue"),
      axis.title.y.left = element_text(color = "blue"),
      axis.text.x = element_text(angle=45, hjust=1),
      axis.title.x = element_blank(),
      plot.title = element_text(size=10, face="bold", hjust=0.5)
    ) +
    labs(title=paste(construct),
         x="Media")
}

# Generate a list of plots, one for each Construct
plot_list <- combined_data %>%
  group_by(Construct) %>%
  group_map(~ create_dual_axis_plot(.x, .y$Construct))

# Arrange all plots in a grid
grid_plot <- do.call(grid.arrange, c(plot_list, ncol=3))


# Print the grid plot
print(grid_plot)
TableGrob (4 x 3) "arrange": 12 grobs

Pathogenic Dial-outs

# Function to create a dual-axis plot for each Construct
create_dual_axis_plot <- function(data, construct) {
  # Remove M9-Supp media for WT-DHFR
  if (construct == "WT-DHFR") {
    data <- data %>% filter(!(Media == "M9-Supp"))
  }
  
  fitness_data <- filter(data, assay_type == "Fitness")
  growth_rate_data <- filter(data, assay_type == "Growth Rate")
  
  # Calculate the ratio for the secondary axis
  ratio <- diff(range(fitness_data$value)) / diff(range(growth_rate_data$value))
  
  ggplot() +
    # Fitness data
    geom_line(data = fitness_data, aes(x = Media, y = value, group = 1), color = "blue") +
    geom_point(data = fitness_data, aes(x = Media, y = value), color = "blue") +
    # Growth Rate data
    geom_line(data = growth_rate_data, aes(x = Media, y = value * ratio, group = 1), color = "red") +
    geom_point(data = growth_rate_data, aes(x = Media, y = value * ratio), color = "red") +
    # Horizontal line at y = -1 for Fitness
    geom_hline(yintercept = -1, linetype = "dashed", color = "gray", size = 0.5) +
    # Axes and labels
    scale_y_continuous(
      name = "Fitness",
      sec.axis = sec_axis(~./ratio, name = "Growth Rate")
    ) +
    theme_classic() +
    theme(
      axis.line.y.right = element_line(color = "red"),
      axis.ticks.y.right = element_line(color = "red"),
      axis.text.y.right = element_text(color = "red"),
      axis.title.y.right = element_text(color = "red"),
      axis.line.y.left = element_line(color = "blue"),
      axis.ticks.y.left = element_line(color = "blue"),
      axis.text.y.left = element_text(color = "blue"),
      axis.title.y.left = element_text(color = "blue"),
      axis.text.x = element_text(angle=45, hjust=1),
      axis.title.x = element_blank(),
      plot.title = element_text(size=10, face="bold", hjust=0.5)
    ) +
    labs(title=paste(construct),
         x="Media")
}

# Filter combined_data for the specific constructs and rename
specific_constructs <- c("Bacillus cereus", "Streptococcus pneumoniae", "WT-DHFR")
filtered_data <- combined_data %>%
  filter(Construct %in% c("WP_000637209", "WP_000162453", "WT-DHFR")) %>%
  mutate(Construct = case_when(
    Construct == "WP_000637209" ~ "Bacillus cereus",
    Construct == "WP_000162453" ~ "Streptococcus pneumoniae",
    TRUE ~ Construct
  ))

# Generate a list of plots, one for each specified Construct
plot_list <- filtered_data %>%
  group_by(Construct) %>%
  group_map(~ create_dual_axis_plot(.x, .y$Construct))

# Arrange all plots in a grid
grid_plot <- do.call(grid.arrange, c(plot_list, ncol=3))


# Print the grid plot
print(grid_plot)
TableGrob (1 x 3) "arrange": 3 grobs

Correlations

# First, let's reshape the data to have Fitness and Growth Rate side by side
wide_data <- combined_data %>%
  select(Construct, Media, assay_type, value) %>%
  pivot_wider(names_from = assay_type, values_from = value) %>%
  filter(!is.na(Fitness) & !is.na(`Growth Rate`))

# Function to calculate Spearman correlation and p-value
spearman_cor <- function(data) {
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  tibble(
    correlation = cor_test$estimate,
    p_value = cor_test$p.value
  )
}

# Calculate correlations for each Media type
correlations <- wide_data %>%
  group_by(Media) %>%
  do(spearman_cor(.)) %>%
  ungroup()
Warning: Cannot compute exact p-value with tiesWarning: Cannot compute exact p-value with ties
# Print the results
print(correlations)

# Optionally, create a plot to visualize the correlations
correlation_plot <- ggplot(correlations, aes(x = Media, y = correlation)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = sprintf("%.3f", correlation), 
                y = ifelse(correlation > 0, correlation + 0.05, correlation - 0.05)),
            vjust = ifelse(correlations$correlation > 0, 0, 1)) +
  geom_text(aes(label = sprintf("p = %.3f", p_value), y = 0), 
            vjust = ifelse(correlations$correlation > 0, 1.5, -0.5)) +
  theme_classic() +  # This removes the background grid and adds axis lines
  theme(
    axis.line = element_line(color = "black"),  # Ensure axis lines are black
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.grid.major = element_blank(),  # Remove major grid lines
    panel.grid.minor = element_blank()   # Remove minor grid lines
  ) +
  labs(title = "Spearman Correlations between Fitness and Growth Rate by Media",
       y = "Correlation Coefficient")

print(correlation_plot)

Correlation Plots

# Function to create a scatter plot for each media type
create_scatter_plot <- function(data, media_type) {
  # Remove rows where Construct is "mCherry"
  data <- data %>% filter(Construct != "mCherry")
  
  # Perform Spearman correlation test
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  
  ggplot(data, aes(x = `Growth Rate`, y = Fitness)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", color = "red", fill = "skyblue", se = TRUE) +  # Set se = TRUE to include confidence interval
    annotate("text", x = min(data$`Growth Rate`, na.rm = TRUE), y = max(data$Fitness, na.rm = TRUE),
             label = sprintf("Correlation = %.2f\np-value = %.3f", cor_test$estimate, cor_test$p.value),
             hjust = 0, vjust = 1) +
    theme_classic() +  # This removes the background grid and adds axis lines
    theme(
      axis.line = element_line(color = "black"),
      plot.title = element_text(size = 14, hjust = 0.5),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10),
      plot.margin = margin(10, 30, 10, 10)  # Add right margin for x-axis labels
    ) +
    labs(
      title = NULL,  # Remove the title
      x = paste("Growth Rate in", media_type),  # Set x-axis title
      y = paste("Fitness in", media_type)        # Set y-axis title
    ) +
    scale_x_continuous(expand = expansion(mult = c(0.05, 0.05)))  # Add padding to x-axis limits
}

# Generate a list of plots, one for each media type
corr_plot_list <- wide_data %>%
  group_by(Media) %>%
  group_map(~ create_scatter_plot(.x, .y$Media))
Warning: Cannot compute exact p-value with tiesWarning: Cannot compute exact p-value with ties
# Arrange all plots in a grid
corr_grid_plot <- do.call(grid.arrange, c(corr_plot_list, ncol = 2))


# Print the grid plot
print(corr_grid_plot)
TableGrob (4 x 2) "arrange": 8 grobs
# Function to create a scatter plot for each media type
create_scatter_plot <- function(data, media_type) {
  # Remove rows where Construct is "mCherry"
  data <- data %>% filter(Construct != "mCherry")
  
  # Perform Spearman correlation test (if needed for other purposes)
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  
  ggplot(data, aes(x = `Growth Rate`, y = Fitness)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", color = "red", fill = "skyblue", se = TRUE) +  # Set fill color for confidence interval to blue
    theme_classic() + 
    theme(
      axis.line = element_line(color = "black"),
      plot.title = element_text(size = 14, hjust = 0.5),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10),
      plot.margin = margin(10, 30, 10, 10)
    ) +
    labs(
      title = NULL,
      x = paste("Growth Rate in", media_type),  # Set x-axis title
      y = paste("Fitness in", media_type)        # Set y-axis title
    ) +
    scale_x_continuous(expand = expansion(mult = c(0.05, 0.05)))  
}

# Generate a list of plots, one for each media type
corr_plot_list <- wide_data %>%
  group_by(Media) %>%
  group_map(~ create_scatter_plot(.x, .y$Media))
Warning: Cannot compute exact p-value with tiesWarning: Cannot compute exact p-value with ties
# Arrange all plots in a grid
corr_grid_plot <- do.call(grid.arrange, c(corr_plot_list, ncol = 2))


# Print the grid plot
print(corr_grid_plot)
TableGrob (4 x 2) "arrange": 8 grobs

Reproducibility

The session information is provided for full reproducibility.

devtools::session_info()
─ Session info ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.2 (2023-10-31)
 os       macOS 15.2
 system   aarch64, darwin20
 ui       RStudio
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Los_Angeles
 date     2025-01-23
 rstudio  2024.09.0+375 Cranberry Hibiscus (desktop)
 pandoc   3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)

─ Packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 package          * version    date (UTC) lib source
 ade4               1.7-22     2023-02-06 [1] CRAN (R 4.3.0)
 ape              * 5.8        2024-04-11 [1] CRAN (R 4.3.1)
 aplot              0.2.2      2023-10-06 [1] CRAN (R 4.3.1)
 backports          1.4.1      2021-12-13 [1] CRAN (R 4.3.0)
 bio3d            * 2.4-5      2024-10-29 [1] CRAN (R 4.3.3)
 BiocGenerics     * 0.46.0     2023-06-04 [1] Bioconductor
 Biostrings       * 2.68.1     2023-05-21 [1] Bioconductor
 bitops             1.0-7      2021-04-24 [1] CRAN (R 4.3.0)
 broom            * 1.0.7      2024-09-26 [1] CRAN (R 4.3.3)
 cachem             1.0.8      2023-05-01 [1] CRAN (R 4.3.0)
 castor           * 1.8.0      2024-01-09 [1] CRAN (R 4.3.1)
 cellranger         1.1.0      2016-07-27 [1] CRAN (R 4.3.0)
 cli                3.6.2      2023-12-11 [1] CRAN (R 4.3.1)
 codetools          0.2-20     2024-03-31 [1] CRAN (R 4.3.1)
 colorspace         2.1-0      2023-01-23 [1] CRAN (R 4.3.0)
 cowplot          * 1.1.3      2024-01-22 [1] CRAN (R 4.3.1)
 crayon             1.5.2      2022-09-29 [1] CRAN (R 4.3.0)
 devtools         * 2.4.5      2022-10-11 [1] CRAN (R 4.3.0)
 digest             0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 dplyr            * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 ellipsis           0.3.2      2021-04-29 [1] CRAN (R 4.3.0)
 evaluate           0.23       2023-11-01 [1] CRAN (R 4.3.1)
 fansi              1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver             2.1.1      2022-07-06 [1] CRAN (R 4.3.0)
 fastmap            1.1.1      2023-02-24 [1] CRAN (R 4.3.0)
 foreach            1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 fs                 1.6.3      2023-07-20 [1] CRAN (R 4.3.0)
 generics           0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 GenomeInfoDb     * 1.36.4     2023-10-08 [1] Bioconductor
 GenomeInfoDbData   1.2.10     2023-09-13 [1] Bioconductor
 ggExtra          * 0.10.1     2023-08-21 [1] CRAN (R 4.3.0)
 ggfun              0.1.4      2024-01-19 [1] CRAN (R 4.3.1)
 ggnewscale       * 0.4.10     2024-02-08 [1] CRAN (R 4.3.1)
 ggplot2          * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 ggplotify          0.1.2      2023-08-09 [1] CRAN (R 4.3.0)
 ggridges         * 0.5.6      2024-01-23 [1] CRAN (R 4.3.1)
 ggtree           * 3.8.2      2023-07-30 [1] Bioconductor
 ggtreeExtra      * 1.10.0     2023-04-25 [1] Bioconductor
 glmnet           * 4.1-8      2023-08-22 [1] CRAN (R 4.3.0)
 glue               1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gridExtra        * 2.3        2017-09-09 [1] CRAN (R 4.3.0)
 gridGraphics       0.5-1      2020-12-13 [1] CRAN (R 4.3.0)
 gtable             0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 htmltools          0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets        1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 httpuv             1.6.15     2024-03-26 [1] CRAN (R 4.3.1)
 igraph           * 2.0.3      2024-03-13 [1] CRAN (R 4.3.1)
 IRanges          * 2.34.1     2023-07-02 [1] Bioconductor
 iterators          1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite           1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr            * 1.45       2023-10-30 [1] CRAN (R 4.3.1)
 labeling           0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 later              1.3.2      2023-12-06 [1] CRAN (R 4.3.1)
 lattice            0.22-6     2024-03-20 [1] CRAN (R 4.3.1)
 lazyeval           0.2.2      2019-03-15 [1] CRAN (R 4.3.0)
 lifecycle          1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 lubridate        * 1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr           2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS               7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
 Matrix           * 1.6-5      2024-01-11 [1] CRAN (R 4.3.1)
 matrixStats      * 1.3.0      2024-04-11 [1] CRAN (R 4.3.1)
 memoise            2.0.1      2021-11-26 [1] CRAN (R 4.3.0)
 mgcv               1.9-1      2023-12-21 [1] CRAN (R 4.3.1)
 mime               0.12       2021-09-28 [1] CRAN (R 4.3.0)
 miniUI             0.1.1.1    2018-05-18 [1] CRAN (R 4.3.0)
 munsell            0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 naturalsort        0.1.3      2016-08-30 [1] CRAN (R 4.3.0)
 nlme               3.1-164    2023-11-27 [1] CRAN (R 4.3.1)
 patchwork        * 1.2.0      2024-01-08 [1] CRAN (R 4.3.1)
 pheatmap         * 1.0.12     2019-01-04 [1] CRAN (R 4.3.0)
 pillar             1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgbuild           1.4.4      2024-03-17 [1] CRAN (R 4.3.1)
 pkgconfig          2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 pkgload            1.3.4      2024-01-16 [1] CRAN (R 4.3.1)
 plyr               1.8.9      2023-10-02 [1] CRAN (R 4.3.1)
 png                0.1-8      2022-11-29 [1] CRAN (R 4.3.0)
 profvis            0.3.8      2023-05-02 [1] CRAN (R 4.3.0)
 promises           1.3.0      2024-04-05 [1] CRAN (R 4.3.1)
 pscl             * 1.5.9      2024-01-31 [1] CRAN (R 4.3.1)
 purrr            * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6                 2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 ragg               1.3.0      2024-03-13 [1] CRAN (R 4.3.1)
 RColorBrewer     * 1.1-3      2022-04-03 [1] CRAN (R 4.3.0)
 Rcpp             * 1.0.13     2024-07-17 [1] CRAN (R 4.3.3)
 RCurl              1.98-1.14  2024-01-09 [1] CRAN (R 4.3.1)
 readxl           * 1.4.3      2023-07-06 [1] CRAN (R 4.3.0)
 rematch            2.0.0      2023-08-30 [1] CRAN (R 4.3.0)
 remotes            2.5.0      2024-03-17 [1] CRAN (R 4.3.1)
 reshape          * 0.8.9      2022-04-12 [1] CRAN (R 4.3.0)
 reshape2         * 1.4.4      2020-04-09 [1] CRAN (R 4.3.0)
 reticulate       * 1.36.1     2024-04-22 [1] CRAN (R 4.3.1)
 rlang              1.1.3      2024-01-10 [1] CRAN (R 4.3.1)
 rmarkdown          2.26       2024-03-05 [1] CRAN (R 4.3.1)
 ROCR             * 1.0-11     2020-05-02 [1] CRAN (R 4.3.0)
 RSpectra           0.16-1     2022-04-24 [1] CRAN (R 4.3.0)
 rstudioapi         0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 S4Vectors        * 0.38.2     2023-09-24 [1] Bioconductor
 scales           * 1.3.0      2023-11-28 [1] CRAN (R 4.3.1)
 seqinr           * 4.2-36     2023-12-08 [1] CRAN (R 4.3.1)
 sessioninfo        1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 shape              1.4.6.1    2024-02-23 [1] CRAN (R 4.3.1)
 shiny              1.8.1.1    2024-04-02 [1] CRAN (R 4.3.1)
 stringi          * 1.8.3      2023-12-11 [1] CRAN (R 4.3.1)
 stringr          * 1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
 survival           3.5-8      2024-02-14 [1] CRAN (R 4.3.1)
 systemfonts        1.0.6      2024-03-07 [1] CRAN (R 4.3.1)
 textshaping        0.3.7      2023-10-09 [1] CRAN (R 4.3.1)
 tibble             3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidyr            * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect         1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 tidytree         * 0.4.6      2023-12-12 [1] CRAN (R 4.3.1)
 timechange         0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 treeio             1.24.3     2023-07-30 [1] Bioconductor
 urlchecker         1.0.1      2021-11-30 [1] CRAN (R 4.3.0)
 usethis          * 2.2.3      2024-02-19 [1] CRAN (R 4.3.1)
 utf8               1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs              0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 viridis          * 0.6.5      2024-01-29 [1] CRAN (R 4.3.1)
 viridisLite      * 0.4.2      2023-05-02 [1] CRAN (R 4.3.0)
 withr              3.0.0      2024-01-16 [1] CRAN (R 4.3.1)
 xfun               0.43       2024-03-25 [1] CRAN (R 4.3.1)
 xtable             1.8-4      2019-04-21 [1] CRAN (R 4.3.0)
 XVector          * 0.40.0     2023-05-08 [1] Bioconductor
 yaml               2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yulab.utils        0.1.4      2024-01-28 [1] CRAN (R 4.3.1)
 zlibbioc           1.46.0     2023-05-08 [1] Bioconductor

 [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library

─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
---
title: "Dialout Variant Fitness Analysis"
author: 'Authors: [Karl J. Romanowicz](https://kromanowicz.github.io/), Carmen Resnick, Samuel R. Hinton, Calin Plesa'
output:
  html_notebook:
    theme: spacelab
    toc: yes
    toc_depth: 5
    toc_float:
      collapsed: yes
      smooth_scroll: yes
  html_document:
    toc: yes
    toc_depth: '5'
    df_print: paged
  pdf_document:
    toc: yes
    toc_depth: '5'
---

**R Notebook:** <font color="green">Provides reproducible analysis for **Dial-Out Variant Fitness** in the following manuscript:</font>

**Citation:** Romanowicz KJ, Resnick C, Hinton SR, Plesa C. (2025) Exploring antibiotic resistance in diverse homologs of the dihydrofolate reductase protein family through broad mutational scanning. ***bioRxiv***. []()

**GitHub Repository:** [https://github.com/PlesaLab/DHFR](https://github.com/PlesaLab/DHFR)

**NCBI BioProject:** [https://www.ncbi.nlm.nih.gov/bioproject/1189478](https://www.ncbi.nlm.nih.gov/bioproject/1189478)

# Experiment

This pipeline processes a library of 1,536 DHFR homologs and their associated mutants, with two-fold redundancy (two codon variants per sequence). Fitness scores are derived from a multiplexed in-vivo assay using a trimethoprim concentration gradient, assessing the ability of these homologs and their mutants to complement functionality in an *E. coli* knockout strain and their tolerance to trimethoprim treatment. This analysis provides insights into how antibiotic resistance evolves across a range of evolutionary starting points. Sequence data were generated using the Illumina NovaSeq platform with 100 bp paired-end sequencing of amplicons.

![Methods overview to achieve a broad-mutational scan for DHFR homologs.](Images/DHFR.Diagram.png)

```{css}
.badCode {
background-color: lightpink;
font-weight: bold;
}

.goodCode {
background-color: lightgreen;
font-weight: bold;
}

.sharedCode {
background-color: lightblue;
font-weight: bold;
}

table {
  margin: auto;
  border-top: 1px solid #666;
  border-bottom: 1px solid #666;
}
table thead th { border-bottom: 1px solid #ddd; }
th, td { padding: 5px; }
thead, tfoot, tr:nth-child(even) { background: #eee; }
```

```{r setup, include=FALSE}
# Set global options for notebook
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = TRUE, message = TRUE)
knitr::opts_chunk$set(echo = TRUE, class.source = "bg-success")

# Getting the path of your current open file and set as wd
current_path = rstudioapi::getActiveDocumentContext()$path 
setwd(dirname(current_path))
print(getwd())
```

# Packages
The following R packages must be installed prior to loading into the R session. See the **Reproducibility** tab for a complete list of packages and their versions used in this workflow.
```{r message=FALSE, warning=FALSE, results='hide'}
# Load the latest version of python (3.10.14) for downstream use:
library(reticulate)
use_python("/Users/krom/miniforge3/bin/python3")

# Make a vector of required packages
required.packages <- c("ape", "bio3d", "Biostrings", "broom", "castor", "cowplot", "devtools", "dplyr", "ggExtra", "ggnewscale", "ggplot2", "ggridges", "ggtree", "ggtreeExtra", "glmnet", "gridExtra","igraph", "knitr", "lubridate", "matrixStats", "patchwork", "pheatmap", "purrr", "pscl", "RColorBrewer", "readxl", "reshape","reshape2", "ROCR", "seqinr", "scales", "stringr", "stringi", "tidyr", "tidytree", "viridis")

# Load required packages with error handling
loaded.packages <- lapply(required.packages, function(package) {
  if (!require(package, character.only = TRUE)) {
    install.packages(package, dependencies = TRUE)
    if (!require(package, character.only = TRUE)) {
      message("Package ", package, " could not be installed and loaded.")
      return(NULL)
    }
  }
  return(package)
})

# Remove NULL entries from loaded packages
loaded.packages <- loaded.packages[!sapply(loaded.packages, is.null)]
```

```{r class.output="sharedCode", echo=FALSE}
# Print loaded packages
cat("Loaded packages:", paste(loaded.packages, collapse = ", "), "\n")
```

```{r include=FALSE}
# set.seed is used to fix the random number generation to make the results repeatable
set.seed(123)
```

# Import Data Files

Import **COUNT** files generated from [DHFR.2.Counts.RMD](https://github.com/PlesaLab/DHFR) relevant for downstream analysis.
```{r}
# BCcontrols_15_median
BCcontrols_15_median <- read.csv("Count/count_files_formatted/BCcontrols_15_median.csv", 
                         header = TRUE, stringsAsFactors = FALSE)
```

Import **PERFECTS** files generated from [DHFR.3.Perfects.RMD](https://github.com/PlesaLab/DHFR) relevant for downstream analysis.
```{r}
# perfects15_5BCs
perfects15_5BCs <- read.csv("Perfects/perfects_files_formatted/perfects15_5BCs.csv", 
                         header = TRUE, stringsAsFactors = FALSE)
```

# Dialout Analysis

## Import Excel Files

This is the file with the plate reader data:
```{r}
# REPLACE THIS FILE NAME FOR EACH ANALYSIS
filename_of_plate_reader_data = "Dialout/RAW/02-08-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.Rnd2.xlsx"

#M9-Full Plate 1: "01-24-24_KR_DHFR_Fitness_Assay_M9_Full_Supp.Plate1.xlsx"
#M9-TMP Plate 2: "01-31-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate2.xlsx"
#M9-TMP Plate 3: "02-01-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate3.xlsx"
#M9-TMP Plate 4: "02-02-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.xlsx"

#Re-do of M9-TMP Plates:
#M9-TMP Plate 2: "02-06-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate2.Rnd2.xlsx"
#M9-TMP Plate 3: "02-07-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate3.Rnd2.xlsx"
#M9-TMP Plate 4: "02-08-24_KR_DHFR_Fitness_Assay_M9_TMP.Plate4.Rnd2.xlsx"
```

This is the range of data in the excel file to import
```{r}
# UPDATE THIS DATA RANGE FOR EACH EXCEL FILE YOU IMPORT
range_od = "B41:CI282"

#M9-Full Plate 1 OD-range: B41:AM282
#M9-TMP Plate 2 OD-range: B41:CI276
#M9-TMP Plate 3 OD-range: B41:CI279
#M9-TMP Plate 4 OD-range: B41:CI282

#Re-do of M9-TMP Plates:
#M9-TMP Plate 2 OD-range: B41:CI282
#M9-TMP Plate 3 OD-range: B41:CI276
#M9-TMP Plate 4 OD-range: B41:CI282
```

```{r}
#get the data
od_data1 <- as.data.frame(read_excel(filename_of_plate_reader_data, sheet = 1, range = range_od))
```

```{r}
#fix the time (makes minutes from start)
od_data <- od_data1 %>%
  mutate(time_int=as.duration(interval(start = od_data1$Time[1], end=Time))) %>%
  mutate(round_time=ceiling(as.numeric(time_int)/60)) %>%
  subset(select = -Time) %>%
  subset(select = -time_int) %>%
  relocate(round_time) %>%
  dplyr::rename(Time=round_time) %>%
  t() %>%
  as.data.frame(.)
```

```{r}
#make vector variables for plotting
M = nrow(od_data)
N = ncol(od_data)
```

```{r}
#make a vector of your time points
ODtime <- od_data[1,1:N]
```

## Sample Growth Rate Plots

This command loads the script (find_gr.R) that contains the analysis functions. This file needs to be in the present working directory.
```{r}
source("Dialout/Scripts/find_gr.R")
```

```{r}
#define initial growth rates
growth.rates = NULL
```

```{r}
#this variable will store all the data
gr_table <- data.frame(sample_name=character(), 
                       doubling.time=double(), 
                       m=double(),
                       r2=double(),
                       lag.t=double())
```

```{r}
#convert time_vector to numeric variable
time_vector = as.numeric(ODtime[1,])
```

Generate growth rate plots from each sampling well and save as .png files:
```{r}
#run through all samples and fit
for (i in 2:M) {
  print(i)
  sample_nm <- row.names(od_data)[i]
  
  # Clean the sample name to remove or replace problematic characters
  clean_sample_nm <- gsub("[^[:alnum:]_-]", "_", sample_nm)
  
  # Open a PNG device with the cleaned filename
  png_filename <- file.path("Dialout/PLOTS", paste0(clean_sample_nm, "_growth_rate_plot.png"))
  png(filename = png_filename, width = 800, height = 600)
  
  #this is where the fit happens:
  gr <- findgr(od_data[i, 1:N], time_vector, sample_nm, int=30, r2=0.6) 
  
  de <- list(sample_name=sample_nm, 
             doubling.time=log2(2)/gr["m"], 
             m=gr["m"],
             r2=gr["r2"],
             lag.t=gr["lag.t"])
  
  #add this to previously calculated wells
  gr_table <- rbind(gr_table, de, stringsAsFactors=FALSE)
  growth.rates <- rbind(growth.rates, gr)
  
  # Close the PNG device
  dev.off()
}
```

Display the plots in the RMD:
```{r class.output="goodCode"}
#run through all samples and fit
for (i in 2:M) {
  print(i)
  sample_nm=row.names(od_data)[i]
  
  #this is where the fit happens:
  gr <- findgr(od_data[i, 1:N], time_vector, sample_nm, int=30, r2=0.6) 
  #1 in od_data[i, 1:N] is the column number where the data starts;  
  #int is number of points taken at one time as an interval to find the highest slope; 
  #vary (i.e. lower) r2, i.e. rsquared as needed, blanks can be a problem here
  
  #print(gr["m"])
  de <- list(sample_name=sample_nm, 
             doubling.time=log2(2)/gr["m"], 
             m=gr["m"],
             r2=gr["r2"],
             lag.t=gr["lag.t"])
  #print(de)
  
  #add this to previously calculated wells
  gr_table = rbind(gr_table,de, stringsAsFactors=FALSE)
  growth.rates <- rbind(growth.rates, gr)
}

dev.off()
```

Write the data to a file
```{r}
# UPDATE THE TABLE NAME IF YOU RE-RUN MULTIPLE EXCEL FILES

# Substitute your desired file name for 'growth_rates'
write.table(gr_table, "Dialout/OUTPUT/growth_rates_plate4.txt", sep="\t", quote=F, row.names=F) 
```

## Full Growth Rate Plot

Load the final fitness data for all 12 Dialout samples based on calculated growth rates:
```{r}
# For 12 samples (including WT and mCherry controls)
grwell <- read.table(file = 'Dialout/Raw/growth_rates_all.v2.csv', sep =',', header = TRUE)
```

Reshape the data for plotting:
```{r}
# Convert Construct to a factor with desired levels
grwell$Media <- factor(grwell$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples
grwell$Construct <- factor(grwell$Construct, levels = c("NP_267306", "WP_000637209", "WP_008976421",
                                                        "WP_003776922", "WP_008578924", "WP_002897636",
                                                        "WP_000162453", "WP_003012456", "WP_000162462",
                                                        "WP_003027976", "WT-DHFR", "mCherry"))
```

Growth Rate Plot for DHFR Dialout Variants:
```{r}
# Convert Media to a factor with desired levels
grwell$Media <- factor(grwell$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples in the desired order
construct_order <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924", "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976", "WT-DHFR", "mCherry")

grwell$Construct <- factor(grwell$Construct, levels = construct_order)

# Create a named vector for facet colors
facet_colors <- setNames(c(rep("lightgray", 10), "blue", "red"), construct_order)

# Custom function to reorder facets
reorder_facets <- function(x) factor(x, levels = construct_order)

# Calculate the overall y-axis range
y_min <- min(grwell$m, na.rm = TRUE)
y_max <- max(grwell$m, na.rm = TRUE)

# Calculate mean growth rate for each Construct-Media combination
grwell_mean <- grwell %>%
  group_by(Construct, Media) %>%
  summarize(mean_m = mean(m, na.rm = TRUE), .groups = "drop")
```

Growth Rate Plot
```{r}
# Growth rate plot
fitness.plot.v3 <- ggplot(grwell_mean, aes(x = Media, y = mean_m, group = Construct)) +
  geom_point() +
  geom_line() +
  labs(title = "Growth Rate Assay for DHFR Dial-Out Variants") +
  facet_wrap(~ reorder_facets(Construct), scales = "fixed", ncol = 4) +
  theme_cowplot(16) +
  theme(
    title = element_text(color = "black", face = "bold", size=14),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=10),
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size=10),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 12),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  ylab(expression("Mean Growth Rate (min"^"-1"*")")) +
  xlab("Media") +
  coord_cartesian(ylim = c(y_min, y_max))

# Add colored backgrounds for all facets
fitness.plot.v3 <- fitness.plot.v3 +
  geom_rect(data = data.frame(Construct = construct_order),
            aes(fill = Construct),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Remove y-axis labels for all but the left-most facets
fitness.plot.v3 <- fitness.plot.v3 +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y.left = element_text(size = 10),
    axis.ticks.y.left = element_line(),
    strip.text.x = element_text(size = 10, margin = margin(b = 5, t = 5))
  )

# Print the plot
print(fitness.plot.v3)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Growth_Rates.pdf", 
       plot = fitness.plot.v3, 
       dpi=600, width = 6, height = 6, units = "in")
```

Calculate a Mean growth rate value at each TMP concentration for each dialout variant:
```{r class.output="goodCode"}
# Create the summary table
grwell_summary <- grwell_mean %>%
  pivot_wider(
    names_from = Media,
    values_from = mean_m
  ) %>%
  select(Construct, `M9-Supp`, `0-tmp`, `0.058-tmp`, `0.5-tmp`, `1.0-tmp`, `10-tmp`, `50-tmp`, `200-tmp`) %>%
  arrange(factor(Construct, levels = construct_order))

# Round all numeric columns to 6 decimal places
grwell_summary <- grwell_summary %>%
  mutate(across(where(is.numeric), ~round(., 6)))

# Display the table
kable(grwell_summary, format = "markdown", caption = "Summary of Mean Growth Rates (1/min) for DHFR Variants")
```

## Dialout Variant Fitness

```{r echo=FALSE}
# Reload BCcontrols_15_median
BCcontrols_15_median <- read.csv("Count/count_files_formatted/BCcontrols_15_median.csv", 
                         header = TRUE, stringsAsFactors = FALSE)

# Re-load perfects15_5BCs
perfects15_5BCs <- read.csv("Perfects/perfects_files_formatted/perfects15_5BCs.csv", 
                         header = TRUE, stringsAsFactors = FALSE)
```

Start by pulling out the Dialout variants from the perfects15_5BCs dataset:
```{r}
# Specify the mutID values you want to keep
dialout_mutIDs <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924",
                    "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976")

# Subset the dataframe, keeping only columns 1:9
dialout_perfects15_5BCs <- perfects15_5BCs[perfects15_5BCs$mutID %in% dialout_mutIDs, 1:9]

# Rename the columns
colnames(dialout_perfects15_5BCs) <- c("mutID","M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")
```

Add the Controls fitness to the Dialout fitness dataset:
```{r}
# Rename controls columns
colnames(BCcontrols_15_median) <- c("mutID", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")

# First, add the M9-Supp column with NA values to BCcontrols_15_median
BCcontrols_15_median <- BCcontrols_15_median %>%
  mutate(`M9-Supp` = NA)

# Now merge the two dataframes
dialout_ctrls_fitness <- bind_rows(
  dialout_perfects15_5BCs,
  BCcontrols_15_median)

# Reorder columns to match the original order
column_order <- c("mutID", "M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp")
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  select(all_of(column_order))

# Remove the row where mutID is "D27N"
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  filter(mutID != "D27N")

# Rename "WT" to "WT-DHFR"
dialout_ctrls_fitness <- dialout_ctrls_fitness %>%
  mutate(mutID = ifelse(mutID == "WT", "WT-DHFR", mutID))

# Round all numeric columns to 6 decimal places
dialout_ctrls_fitness_summary <- dialout_ctrls_fitness %>%
  mutate(across(where(is.numeric), ~round(., 6)))

# Display the table
kable(dialout_ctrls_fitness_summary, format = "markdown", caption = "Summary of Median Fitness for Pooled DHFR Variants")
```

### Fitness Plot

Reshape data prior to plotting
```{r}
# Reshape the data from wide to long format
long_data <- dialout_ctrls_fitness %>%
  pivot_longer(
    cols = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"),
    names_to = "Media",
    values_to = "fitness"
  )

# Convert Media to a factor with desired levels
long_data$Media <- factor(long_data$Media, levels = c("M9-Supp", "0-tmp", "0.058-tmp", "0.5-tmp", "1.0-tmp", "10-tmp", "50-tmp", "200-tmp"))

# Define the 12 samples in the desired order
construct_order <- c("NP_267306", "WP_000637209", "WP_008976421", "WP_003776922", "WP_008578924", "WP_002897636", "WP_000162453", "WP_003012456", "WP_000162462", "WP_003027976", "WT-DHFR", "mCherry")

long_data$mutID <- factor(long_data$mutID, levels = construct_order)

# Create a named vector for facet colors
facet_colors <- setNames(c(rep("lightgray", 10), "blue", "red"), construct_order)

# Custom function to reorder facets
reorder_facets <- function(x) factor(x, levels = construct_order)

# Calculate the overall y-axis range
y_min <- min(long_data$fitness, na.rm = TRUE)
y_max <- max(long_data$fitness, na.rm = TRUE)
```

Plot the fitness values across the TMP gradient for each dialout variant:
```{r warning=FALSE}
# Fitness plot
fitness.plot.v4 <- ggplot(long_data, aes(x = Media, y = fitness, group = mutID)) +
  geom_hline(yintercept = -1, linetype = "dashed", color = "red", size = 0.5) +
  geom_point() +
  geom_line() +
  labs(title = "Pooled Fitness Assay for DHFR Dial-Out Variants") +
  facet_wrap(~ reorder_facets(mutID), scales = "fixed", ncol = 4) +
  theme_cowplot(16) +
  theme(
    title = element_text(color = "black", face = "bold", size=14),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=10),
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size=10),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 12),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  ylab("Median Fitness (LogFC)") +
  xlab("Media") +
  coord_cartesian(ylim = c(y_min, y_max))

# Add colored backgrounds for all facets
fitness.plot.v4 <- fitness.plot.v4 +
  geom_rect(data = data.frame(mutID = construct_order),
            aes(fill = mutID),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Remove y-axis labels for all but the left-most facets
fitness.plot.v4 <- fitness.plot.v4 +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y.left = element_text(size = 10),
    axis.ticks.y.left = element_line(),
    strip.text.x = element_text(size = 10, margin = margin(b = 5, t = 5))
  )

# Print the plot
print(fitness.plot.v4)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Median_Fitness.pdf",
       plot = fitness.plot.v4, 
       dpi=600, width = 6, height = 6, units = "in")
```

## Side-by-Side Plots

Plot Fitness and Growth Rates side-by-side
```{r warning=FALSE}
patch1 <- fitness.plot.v4 | fitness.plot.v3
patch1
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Growth_Rate_Fitness_Comparison.pdf",
       plot = patch1, 
       dpi=600, width = 12, height = 6, units = "in")
```

Combined Datasets for Plotting Together
```{r class.output="goodCode"}
# Prepare long_data (Fitness data)
long_data_renamed <- long_data %>%
  rename(Construct = mutID, value = fitness) %>%
  mutate(assay_type = "Fitness")

# Check the range of Fitness values
print("Range of Fitness values:")
print(range(long_data_renamed$value, na.rm = TRUE))

# Prepare grwell_mean data (Growth Rate data)
grwell_mean_prepared <- grwell_mean %>%
  rename(value = mean_m) %>%
  mutate(
    Type = case_when(
      Construct %in% c("WT-DHFR", "mCherry") ~ "Control",
      TRUE ~ "Variant"
    ),
    assay_type = "Growth Rate"
  )
```

```{r class.output="goodCode"}
# Combine the datasets
combined_data <- bind_rows(long_data_renamed, grwell_mean_prepared)

# Verify the data
print("Combined data summary:")
print(table(combined_data$assay_type, combined_data$Type, useNA = "ifany"))

# Check for any extreme values or outliers
print("Summary of 'value' column:")
print(summary(combined_data$value))

# Create the combined plot
combined_plot <- ggplot(combined_data, aes(x = Media, y = value, group = interaction(Construct, assay_type), color = assay_type)) +
  geom_point() +
  geom_line() +
  geom_hline(data = filter(combined_data, assay_type == "Fitness"), 
             aes(yintercept = -1), linetype = "dashed", color = "red", size = 0.5) +
  facet_grid(assay_type ~ reorder_facets(Construct), scales = "free_y") +
  theme_cowplot(16) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "black", face = "bold", size = 10),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
    panel.spacing = unit(0.5, "lines"),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 10),
    panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    legend.position = "none"
  ) +
  labs(title = "Growth Rate and Pooled Fitness Assays for DHFR Dial-Out Variants",
       x = "Media",
       y = "Value") +
  scale_color_manual(values = c("Fitness" = "blue", "Growth Rate" = "red"))

# Add colored backgrounds for all facets
combined_plot <- combined_plot +
  geom_rect(data = data.frame(Construct = construct_order),
            aes(fill = Construct),
            xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
            alpha = 0.2, inherit.aes = FALSE) +
  scale_fill_manual(values = facet_colors, guide = "none")

# Print the plot
print(combined_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Fitness_Growth_Stacked_Plots.pdf",
       plot = combined_plot, 
       dpi=600, width = 16, height = 4, units = "in")
```

## Combined Plots

### All Dial-outs

Plot data together with separate y-axes
```{r class.output="goodCode"}
# Function to create a dual-axis plot for each Construct
create_dual_axis_plot <- function(data, construct) {
  # Remove M9-Supp media for mCherry and WT-DHFR
  if (construct %in% c("mCherry", "WT-DHFR")) {
    data <- data %>% filter(!(Media == "M9-Supp"))
  }
  
  fitness_data <- filter(data, assay_type == "Fitness")
  growth_rate_data <- filter(data, assay_type == "Growth Rate")
  
  # Calculate the ratio for the secondary axis
  ratio <- diff(range(fitness_data$value)) / diff(range(growth_rate_data$value))
  
  ggplot() +
    # Fitness data
    geom_line(data = fitness_data, aes(x = Media, y = value, group = 1), color = "blue") +
    geom_point(data = fitness_data, aes(x = Media, y = value), color = "blue") +
    # Growth Rate data
    geom_line(data = growth_rate_data, aes(x = Media, y = value * ratio, group = 1), color = "red") +
    geom_point(data = growth_rate_data, aes(x = Media, y = value * ratio), color = "red") +
    # Horizontal line at y = -1 for Fitness
    geom_hline(yintercept = -1, linetype = "dashed", color = "gray", size = 0.5) +
    # Axes and labels
    scale_y_continuous(
      name = "Fitness",
      sec.axis = sec_axis(~./ratio, name = "Growth Rate")
    ) +
    theme_classic() +
    theme(
      axis.line.y.right = element_line(color = "red"),
      axis.ticks.y.right = element_line(color = "red"),
      axis.text.y.right = element_text(color = "red"),
      axis.title.y.right = element_text(color = "red"),
      axis.line.y.left = element_line(color = "blue"),
      axis.ticks.y.left = element_line(color = "blue"),
      axis.text.y.left = element_text(color = "blue"),
      axis.title.y.left = element_text(color = "blue"),
      axis.text.x = element_text(angle=45, hjust=1),
      axis.title.x = element_blank(),
      plot.title = element_text(size=10, face="bold", hjust=0.5)
    ) +
    labs(title=paste(construct),
         x="Media")
}

# Generate a list of plots, one for each Construct
plot_list <- combined_data %>%
  group_by(Construct) %>%
  group_map(~ create_dual_axis_plot(.x, .y$Construct))

# Arrange all plots in a grid
grid_plot <- do.call(grid.arrange, c(plot_list, ncol=3))

# Print the grid plot
print(grid_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Fitness_Growth_Combined_Plots.pdf",
       plot = grid_plot, 
       dpi=600, width = 10, height = 10, units = "in")
```

### Pathogenic Dial-outs

```{r}
# Function to create a dual-axis plot for each Construct
create_dual_axis_plot <- function(data, construct) {
  # Remove M9-Supp media for WT-DHFR
  if (construct == "WT-DHFR") {
    data <- data %>% filter(!(Media == "M9-Supp"))
  }
  
  fitness_data <- filter(data, assay_type == "Fitness")
  growth_rate_data <- filter(data, assay_type == "Growth Rate")
  
  # Calculate the ratio for the secondary axis
  ratio <- diff(range(fitness_data$value)) / diff(range(growth_rate_data$value))
  
  ggplot() +
    # Fitness data
    geom_line(data = fitness_data, aes(x = Media, y = value, group = 1), color = "blue") +
    geom_point(data = fitness_data, aes(x = Media, y = value), color = "blue") +
    # Growth Rate data
    geom_line(data = growth_rate_data, aes(x = Media, y = value * ratio, group = 1), color = "red") +
    geom_point(data = growth_rate_data, aes(x = Media, y = value * ratio), color = "red") +
    # Horizontal line at y = -1 for Fitness
    geom_hline(yintercept = -1, linetype = "dashed", color = "gray", size = 0.5) +
    # Axes and labels
    scale_y_continuous(
      name = "Fitness",
      sec.axis = sec_axis(~./ratio, name = "Growth Rate")
    ) +
    theme_classic() +
    theme(
      axis.line.y.right = element_line(color = "red"),
      axis.ticks.y.right = element_line(color = "red"),
      axis.text.y.right = element_text(color = "red"),
      axis.title.y.right = element_text(color = "red"),
      axis.line.y.left = element_line(color = "blue"),
      axis.ticks.y.left = element_line(color = "blue"),
      axis.text.y.left = element_text(color = "blue"),
      axis.title.y.left = element_text(color = "blue"),
      axis.text.x = element_text(angle=45, hjust=1),
      axis.title.x = element_blank(),
      plot.title = element_text(size=10, face="bold", hjust=0.5)
    ) +
    labs(title=paste(construct),
         x="Media")
}

# Filter combined_data for the specific constructs and rename
specific_constructs <- c("Bacillus cereus", "Streptococcus pneumoniae", "WT-DHFR")
filtered_data <- combined_data %>%
  filter(Construct %in% c("WP_000637209", "WP_000162453", "WT-DHFR")) %>%
  mutate(Construct = case_when(
    Construct == "WP_000637209" ~ "Bacillus cereus",
    Construct == "WP_000162453" ~ "Streptococcus pneumoniae",
    TRUE ~ Construct
  ))

# Generate a list of plots, one for each specified Construct
plot_list <- filtered_data %>%
  group_by(Construct) %>%
  group_map(~ create_dual_axis_plot(.x, .y$Construct))

# Arrange all plots in a grid
grid_plot <- do.call(grid.arrange, c(plot_list, ncol=3))

# Print the grid plot
print(grid_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Pathogenic_Dialout_Fitness_Growth_Combined_Plots.pdf",
       plot = grid_plot, 
       dpi=600, width = 10, height = 3, units = "in")
```

## Correlations

```{r class.output="goodCode"}
# First, let's reshape the data to have Fitness and Growth Rate side by side
wide_data <- combined_data %>%
  select(Construct, Media, assay_type, value) %>%
  pivot_wider(names_from = assay_type, values_from = value) %>%
  filter(!is.na(Fitness) & !is.na(`Growth Rate`))

# Function to calculate Spearman correlation and p-value
spearman_cor <- function(data) {
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  tibble(
    correlation = cor_test$estimate,
    p_value = cor_test$p.value
  )
}

# Calculate correlations for each Media type
correlations <- wide_data %>%
  group_by(Media) %>%
  do(spearman_cor(.)) %>%
  ungroup()

# Print the results
print(correlations)

# Optionally, create a plot to visualize the correlations
correlation_plot <- ggplot(correlations, aes(x = Media, y = correlation)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = sprintf("%.3f", correlation), 
                y = ifelse(correlation > 0, correlation + 0.05, correlation - 0.05)),
            vjust = ifelse(correlations$correlation > 0, 0, 1)) +
  geom_text(aes(label = sprintf("p = %.3f", p_value), y = 0), 
            vjust = ifelse(correlations$correlation > 0, 1.5, -0.5)) +
  theme_classic() +  # This removes the background grid and adds axis lines
  theme(
    axis.line = element_line(color = "black"),  # Ensure axis lines are black
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.grid.major = element_blank(),  # Remove major grid lines
    panel.grid.minor = element_blank()   # Remove minor grid lines
  ) +
  labs(title = "Spearman Correlations between Fitness and Growth Rate by Media",
       y = "Correlation Coefficient")

print(correlation_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Spearman_Correlations_Barplot.pdf",
       plot = correlation_plot, 
       dpi=600, width = 8, height = 4, units = "in")
```

### Correlation Plots

```{r class.output="goodCode"}
# Function to create a scatter plot for each media type
create_scatter_plot <- function(data, media_type) {
  # Remove rows where Construct is "mCherry"
  data <- data %>% filter(Construct != "mCherry")
  
  # Perform Spearman correlation test
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  
  ggplot(data, aes(x = `Growth Rate`, y = Fitness)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", color = "red", fill = "skyblue", se = TRUE) +  # Set se = TRUE to include confidence interval
    annotate("text", x = min(data$`Growth Rate`, na.rm = TRUE), y = max(data$Fitness, na.rm = TRUE),
             label = sprintf("Correlation = %.2f\np-value = %.3f", cor_test$estimate, cor_test$p.value),
             hjust = 0, vjust = 1) +
    theme_classic() +  # This removes the background grid and adds axis lines
    theme(
      axis.line = element_line(color = "black"),
      plot.title = element_text(size = 14, hjust = 0.5),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10),
      plot.margin = margin(10, 30, 10, 10)  # Add right margin for x-axis labels
    ) +
    labs(
      title = NULL,  # Remove the title
      x = paste("Growth Rate in", media_type),  # Set x-axis title
      y = paste("Fitness in", media_type)        # Set y-axis title
    ) +
    scale_x_continuous(expand = expansion(mult = c(0.05, 0.05)))  # Add padding to x-axis limits
}

# Generate a list of plots, one for each media type
corr_plot_list <- wide_data %>%
  group_by(Media) %>%
  group_map(~ create_scatter_plot(.x, .y$Media))

# Arrange all plots in a grid
corr_grid_plot <- do.call(grid.arrange, c(corr_plot_list, ncol = 2))

# Print the grid plot
print(corr_grid_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Spearman_Correlations.pdf",
       plot = corr_grid_plot, 
       dpi=600, width = 8, height = 10, units = "in")
```

```{r class.output="goodCode"}
# Function to create a scatter plot for each media type
create_scatter_plot <- function(data, media_type) {
  # Remove rows where Construct is "mCherry"
  data <- data %>% filter(Construct != "mCherry")
  
  # Perform Spearman correlation test (if needed for other purposes)
  cor_test <- cor.test(data$Fitness, data$`Growth Rate`, method = "spearman")
  
  ggplot(data, aes(x = `Growth Rate`, y = Fitness)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", color = "red", fill = "skyblue", se = TRUE) +  # Set fill color for confidence interval to blue
    theme_classic() + 
    theme(
      axis.line = element_line(color = "black"),
      plot.title = element_text(size = 14, hjust = 0.5),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10),
      plot.margin = margin(10, 30, 10, 10)
    ) +
    labs(
      title = NULL,
      x = paste("Growth Rate in", media_type),  # Set x-axis title
      y = paste("Fitness in", media_type)        # Set y-axis title
    ) +
    scale_x_continuous(expand = expansion(mult = c(0.05, 0.05)))  
}

# Generate a list of plots, one for each media type
corr_plot_list <- wide_data %>%
  group_by(Media) %>%
  group_map(~ create_scatter_plot(.x, .y$Media))

# Arrange all plots in a grid
corr_grid_plot <- do.call(grid.arrange, c(corr_plot_list, ncol = 2))

# Print the grid plot
print(corr_grid_plot)
```

```{r echo=FALSE}
#save plot
# For 12 samples
ggsave(file="Dialout/DHFR_Dialout_Spearman_Correlations.no.labels.pdf",
       plot = corr_grid_plot, 
       dpi=600, width = 8, height = 9, units = "in")
```

# Reproducibility

The session information is provided for full reproducibility.
```{r}
devtools::session_info()
```