R Notebook: Provides reproducible analysis for Association of mutant sequence, barcode, and inferred fluorescence phenotype in the following manuscript:
Citation: Lippert LB, Hinton SR, Holston A, Romanowicz KJ, Plesa C. Characterizing Sequence-Function Relationships in Chimeric DcuS/EnvZ Histidine Kinases. In Prep. 2026.
GitHub Repository: https://github.com/PlesaLab/DcuSEnvZ
This pipeline processes all barcode-sequence pairs and their associated median bin scores for all three conditions. Here, previously generated files for all three conditions (containing barcodes, their associated nucleotide and amino acid sequences, activation (“median bin”) scores, the lower and upper indices of their activation bins) are merged. Median bin scores are then converted to molecules of equivalent fluorescein (MEFL)
The following R packages must be installed prior to loading into the R session. See the Reproducibility tab for a complete list of packages and their versions used in this workflow.
# Make a vector of required packages
required.packages <- c("devtools", "knitr", "patchwork", "tidyverse", "ggplot2", "dplyr", "tidyr", "magrittr", "stringr", "seqinr", "ggridges")
# Load required packages
lapply(required.packages, library, character.only = TRUE)
This section is based on the R file: “Merge_and_.R”. It describes how merge the data from all three conditions, convert from ‘median bin’ to MEFL, and calculate phenotype scores for all barcode-sequence pairs observed in the dataset. The end result is a .CSV file containing the total set of observed barcode-sequence pairs and nine values which describe the associated phenotype
NoLig_all <- read.csv(file="./output_files/DcuS_NoLigand_bin_distribution-byBC.csv", head=T, sep=",")
Fum_all <- read.csv(file="./output_files/DcuS_Fumarate_bin_distribution-byBC.csv", head=T, sep=",")
Asp_all <- read.csv(file="./output_files/DcuS_Aspartate_bin_distribution-byBC.csv", head=T, sep=",")
Keep only barcodes with > 10 reads
NoLig_all <- NoLig_all %>%
filter(BC_SorTotReads>=10) %>%
dplyr::rename(AAseq=seq,NoLig_LowerIndex=lower_index, NoLig_UpperIndex=upper_index, NoLig_Median=median, NoLig_Reads=BC_SorTotReads)
Fum_all <- Fum_all %>%
filter(BC_SorTotReads>=10) %>%
dplyr::rename(AAseq=seq,Fum_LowerIndex=lower_index, Fum_UpperIndex=upper_index, Fum_Median=median, Fum_Reads=BC_SorTotReads)
Asp_all <- Asp_all %>%
filter(BC_SorTotReads>=10) %>%
dplyr::rename(AAseq=seq,Asp_LowerIndex=lower_index, Asp_UpperIndex=upper_index, Asp_Median=median, Asp_Reads=BC_SorTotReads)
Collapse duplicated rows
# Check if duplicated rows are identical or if there are differences that need to be resolved - this process was repeated separately for all conditions elsewhere
A_dupes <- Asp_all %>%
group_by(barcode) %>%
filter(n() > 1) %>%
ungroup()
# For each barcode, check if all rows are identical
A_dupes %>%
group_by(barcode) %>%
mutate(
# Check if amino acid sequences differ within this barcode
seq_differs = n_distinct(AAseq) > 1,
# Check if phenotype differs within this barcode
phenotype_differs = n_distinct(Asp_Median) > 1
) %>%
ungroup()
## # A tibble: 8,708 × 11
## barcode NTseq AAseq Asp_LowerIndex Asp_UpperIndex Asp_Median Asp_Reads
## <chr> <chr> <chr> <dbl> <int> <dbl> <dbl>
## 1 AACCAACCGATTT… AGAC… RHSL… 6 7 6.34 2589.
## 2 AACCAACCGATTT… AGAC… RHSL… 6 7 6.34 2589.
## 3 AACCAACCGGCTA… AGAC… RHSL… 6 7 6.45 6694.
## 4 AACCAACCGGCTA… AGAC… RHSL… 6 7 6.45 6694.
## 5 AACCAACGGAATA… AGAC… RHSL… 6 7 6.37 1461.
## 6 AACCAACGGAATA… AGAC… RHSL… 6 7 6.37 1461.
## 7 AACCAATCAAAAA… AGAC… RHSL… -Inf 1 1 10296.
## 8 AACCAATCAAAAA… AGAC… RHSL… -Inf 1 1 10296.
## 9 AACCAATCAATAT… AGAC… RHSL… 6 7 6.55 1005.
## 10 AACCAATCAATAT… AGAC… RHSL… 6 7 6.55 1005.
## # ℹ 8,698 more rows
## # ℹ 4 more variables: presort1reads <dbl>, presort2reads <dbl>,
## # seq_differs <lgl>, phenotype_differs <lgl>
# Find barcodes where the duplicates don't match
A_dupes %>%
group_by(barcode) %>%
filter(
n_distinct(AAseq) > 1 | n_distinct(Asp_Median) > 1
) %>%
arrange(barcode) %>%
ungroup()
## # A tibble: 0 × 9
## # ℹ 9 variables: barcode <chr>, NTseq <chr>, AAseq <chr>, Asp_LowerIndex <dbl>,
## # Asp_UpperIndex <int>, Asp_Median <dbl>, Asp_Reads <dbl>,
## # presort1reads <dbl>, presort2reads <dbl>
A_dupes %>%
group_by(barcode) %>%
summarise(
n_copies = n(),
seq_matches = n_distinct(AAseq) == 1,
phenotype_matches = n_distinct(Asp_Median) == 1,
all_identical = seq_matches & phenotype_matches
) %>%
filter(!all_identical) # Show only problematic barcodes
## # A tibble: 0 × 5
## # ℹ 5 variables: barcode <chr>, n_copies <int>, seq_matches <lgl>,
## # phenotype_matches <lgl>, all_identical <lgl>
# Empty tibble means that all duplicated rows are identical, so will merge based on firsr appearance below
# Collapse all duplicated rows
NoLig_all_mod <- NoLig_all %>%
select(barcode,NTseq,AAseq,NoLig_LowerIndex,NoLig_UpperIndex,NoLig_Median,NoLig_Reads,presort1reads,presort2reads) %>%
group_by(barcode) %>%
summarize(
NTseq = dplyr::first(NTseq),
AAseq = dplyr::first(AAseq),
NoLig_Reads = dplyr::first(NoLig_Reads),
presort1reads=dplyr::first(presort1reads),
presort2reads=dplyr::first(presort2reads),
across(c(NoLig_LowerIndex,NoLig_UpperIndex,NoLig_Median), ~mean(.x,na.rm=TRUE))
)
Fum_all_mod <- Fum_all %>%
select(barcode,NTseq,AAseq,Fum_LowerIndex,Fum_UpperIndex,Fum_Median,Fum_Reads,presort1reads,presort2reads) %>%
group_by(barcode) %>%
summarize(
NTseq = dplyr::first(NTseq),
AAseq = dplyr::first(AAseq),
Fum_Reads = dplyr::first(Fum_Reads),
presort1reads=dplyr::first(presort1reads),
presort2reads=dplyr::first(presort2reads),
across(c(Fum_LowerIndex,Fum_UpperIndex,Fum_Median), ~mean(.x,na.rm=TRUE))
)
Asp_all_mod <- Asp_all %>%
select(barcode,NTseq,AAseq,Asp_LowerIndex,Asp_UpperIndex,Asp_Median,Asp_Reads,presort1reads,presort2reads) %>%
group_by(barcode) %>%
summarize(
NTseq = dplyr::first(NTseq),
AAseq = dplyr::first(AAseq),
Asp_Reads = dplyr::first(Asp_Reads),
presort1reads=dplyr::first(presort1reads),
presort2reads=dplyr::first(presort2reads),
across(c(Asp_LowerIndex,Asp_UpperIndex,Asp_Median), ~mean(.x,na.rm=TRUE))
)
merged_BC <- merge(merge(NoLig_all_mod, Fum_all_mod, by = "barcode"), Asp_all_mod, by = "barcode")
# Keep only barcode, seq, lower index, uppder index, and score/bin for each condition; the other columns are no longer needed at this point or are repeated
condensed_merged_BC <- merged_BC %>%
select(barcode,NTseq.x,AAseq.x,NoLig_LowerIndex,NoLig_UpperIndex,NoLig_Median,Fum_LowerIndex,Fum_UpperIndex,Fum_Median,Asp_LowerIndex,Asp_UpperIndex,Asp_Median,presort1reads.x,presort2reads.x,NoLig_Reads,Fum_Reads,Asp_Reads)
# remove negative infinite values from dataframe, push to 0; this occurred as an error in the cumulative sum function in previous scripts
condensed_merged_BC <- condensed_merged_BC %>%
mutate(across(where(is.numeric), ~ ifelse(is.infinite(.), 0, .)))
Load in the bin to MEFL conversion “lookup” tables. There are separate ones for No ligand and Fumarate+Aspartate because those conditions were sorted on different days.
# No Ligand MEFL lookup table
bin_to_MEFL_NL <- read.csv(file="./input_files/bins_to_MEFL_NoLigand.tsv",head=TRUE,sep="\t")
bin_to_MEFL_NL <- bin_to_MEFL_NL %>%
select(X,X488.1..530_30.A) %>%
dplyr::rename(bin=X, MEFL=X488.1..530_30.A) %>%
add_row(bin = 12, MEFL = 500000, .after = 12) %>%
mutate(logMEFL = log10(MEFL))
# Fumarate and Aspartate MEFL lookup table
bin_to_MEFL_FA <- read.csv(file="./input_files/bins_to_MEFL_Fumarate-Aspartate.tsv",head=TRUE,sep="\t")
bin_to_MEFL_FA <- bin_to_MEFL_FA %>%
select(X,X488.1..530_30.A) %>%
dplyr::rename(bin=X, MEFL=X488.1..530_30.A) %>%
add_row(bin = 12, MEFL = 500000, .after = 12) %>%
mutate(logMEFL = log10(MEFL))
Create an interpolation function to convert nondiscreet bin values to MEFL
bin_to_MEFL_inter_NL <- approxfun(bin_to_MEFL_NL$bin, bin_to_MEFL_NL$logMEFL, rule = 2)
bin_to_MEFL_inter_FA <- approxfun(bin_to_MEFL_FA$bin, bin_to_MEFL_FA$logMEFL, rule = 2)
Convert from units of “median bin” to MEFL
MEFL_merged_BC <- condensed_merged_BC %>%
mutate(
NoLig_LowerIndex_logMEFL = bin_to_MEFL_inter_NL(NoLig_LowerIndex),
NoLig_UpperIndex_logMEFL = bin_to_MEFL_inter_NL(NoLig_UpperIndex),
NoLig_Median_logMEFL = bin_to_MEFL_inter_NL(NoLig_Median),
Fum_LowerIndex_logMEFL = bin_to_MEFL_inter_FA(Fum_LowerIndex),
Fum_UpperIndex_logMEFL = bin_to_MEFL_inter_FA(Fum_UpperIndex),
Fum_Median_logMEFL = bin_to_MEFL_inter_FA(Fum_Median),
Asp_LowerIndex_logMEFL = bin_to_MEFL_inter_FA(Asp_LowerIndex),
Asp_UpperIndex_logMEFL = bin_to_MEFL_inter_FA(Asp_UpperIndex),
Asp_Median_logMEFL = bin_to_MEFL_inter_FA(Asp_Median)
) %>%
mutate(NoLig_LowerIndex_logMEFL = 10^NoLig_LowerIndex_logMEFL,
NoLig_UpperIndex_logMEFL = 10^NoLig_UpperIndex_logMEFL,
NoLig_Median_MEFL = 10^NoLig_Median_logMEFL,
Fum_LowerIndex_logMEFL = 10^Fum_LowerIndex_logMEFL,
Fum_UpperIndex_logMEFL = 10^Fum_UpperIndex_logMEFL,
Fum_Median_MEFL = 10^Fum_Median_logMEFL,
Asp_LowerIndex_logMEFL = 10^Asp_LowerIndex_logMEFL,
Asp_UpperIndex_logMEFL = 10^Asp_UpperIndex_logMEFL,
Asp_Median_MEFL = 10^Asp_Median_logMEFL)
Keep only MEFL values for each barcode
merged_BC_MEFL_compare <- MEFL_merged_BC %>%
select(barcode,NTseq.x,AAseq.x,presort1reads.x,presort2reads.x,NoLig_Reads,Fum_Reads,Asp_Reads,NoLig_Median_MEFL,Fum_Median_MEFL,Asp_Median_MEFL) %>%
dplyr::rename(NTseq=NTseq.x, AAseq=AAseq.x, presort1reads=presort1reads.x,presort2reads=presort2reads.x)
These include: no ligand fold-change, fumarate fold-change, aspartate fold-change, fumarate dynamic range, aspartate dynamic range, and ligand specificity scores.
# To calculate the fold-change score, which is based on how the mutant's fluorescence changed relative to wildtype, we need to calculate the wildtype median fluorescence in the dataset.
# define a string identifier for the wildtype DcuS/EnvZ chimera
wt_seq = "RHSLPYRMLRKRPMKLSTTVILMVSAVLFSVLLVVHLIYFSQISDMTRDGLANKALAVARTLADSPEIRQGLQKKPQESGIQAIAEAVRKRNDLLFIVVTDMQSLRYSHPEAQRIGQPFKGDDILKALNGEENVAINRGFLAQALRVFTPIYDENHKQIGVVAIGLELSRVTQQINDSRWSLQMAAGVKQLA"
# filter for the wildtype data
wt_data <- merged_BC_MEFL_compare %>%
filter(AAseq==wt_seq)
# Calculate wildtype median MEFL values
NoLig_WTmed = median(wt_data$NoLig_Median_MEFL)
Fum_WTmed = median(wt_data$Fum_Median_MEFL)
Asp_WTmed = median(wt_data$Asp_Median_MEFL)
print("Wildtype DcuS/EnvZ Chimera Inferred MEFL")
## [1] "Wildtype DcuS/EnvZ Chimera Inferred MEFL"
print(paste0("No Ligand: ", round(NoLig_WTmed,2), " MEFL"))
## [1] "No Ligand: 13914.48 MEFL"
print(paste0("Fumarate: ", round(Fum_WTmed,2), " MEFL"))
## [1] "Fumarate: 24888.15 MEFL"
print(paste0("Aspartate: ", round(Asp_WTmed,2), " MEFL"))
## [1] "Aspartate: 15970.25 MEFL"
merged_BC_MEFL_norm <- merged_BC_MEFL_compare %>%
mutate(
NoLig_fc = log2(NoLig_Median_MEFL / NoLig_WTmed),
Fum_fc = log2(Fum_Median_MEFL / Fum_WTmed),
Asp_fc = log2(Asp_Median_MEFL / Asp_WTmed),
Fum_DNR = log2(Fum_Median_MEFL / NoLig_Median_MEFL),
Asp_DNR = log2(Asp_Median_MEFL / NoLig_Median_MEFL),
Lig_spec = log2(Asp_Median_MEFL / Fum_Median_MEFL),
AAlen = str_length(AAseq)
)
write.csv(merged_BC_MEFL_norm,
"./output_files/All_DcuS_Data-medianMEFL.csv", row.names = FALSE)
The session information is provided for full reproducibility.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.4.1 (2024-06-14)
## os macOS 15.7.3
## system x86_64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Los_Angeles
## date 2026-05-20
## pandoc 3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/ (via rmarkdown)
## quarto 1.7.32 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.4.1)
## bslib 0.10.0 2026-01-26 [1] CRAN (R 4.4.1)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.4.0)
## cli 3.6.5 2025-04-23 [1] CRAN (R 4.4.1)
## devtools * 2.4.6 2025-10-03 [1] CRAN (R 4.4.1)
## dichromat 2.0-0.1 2022-05-02 [1] CRAN (R 4.4.0)
## digest 0.6.39 2025-11-19 [1] CRAN (R 4.4.1)
## dplyr * 1.2.0 2026-02-03 [1] CRAN (R 4.4.1)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.4.0)
## evaluate 1.0.5 2025-08-27 [1] CRAN (R 4.4.1)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.4.0)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
## forcats * 1.0.1 2025-09-25 [1] CRAN (R 4.4.1)
## fs 1.6.6 2025-04-12 [1] CRAN (R 4.4.1)
## generics 0.1.4 2025-05-09 [1] CRAN (R 4.4.1)
## ggplot2 * 4.0.2 2026-02-03 [1] CRAN (R 4.4.1)
## ggridges * 0.5.7 2025-08-27 [1] CRAN (R 4.4.1)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.4.1)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.4.1)
## hms 1.1.4 2025-10-17 [1] CRAN (R 4.4.1)
## htmltools 0.5.9 2025-12-04 [1] CRAN (R 4.4.1)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.4.0)
## jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.4.1)
## knitr * 1.51 2025-12-20 [1] CRAN (R 4.4.1)
## lifecycle 1.0.5 2026-01-08 [1] CRAN (R 4.4.1)
## lubridate * 1.9.5 2026-02-04 [1] CRAN (R 4.4.1)
## magrittr * 2.0.4 2025-09-12 [1] CRAN (R 4.4.1)
## MASS 7.3-65 2025-02-28 [1] CRAN (R 4.4.1)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.4.0)
## otel 0.2.0 2025-08-29 [1] CRAN (R 4.4.1)
## patchwork * 1.3.2 2025-08-25 [1] CRAN (R 4.4.1)
## pillar 1.11.1 2025-09-17 [1] CRAN (R 4.4.1)
## pkgbuild 1.4.8 2025-05-26 [1] CRAN (R 4.4.1)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
## pkgload 1.5.0 2026-02-03 [1] CRAN (R 4.4.1)
## purrr * 1.2.1 2026-01-09 [1] CRAN (R 4.4.1)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.4.1)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.4.0)
## Rcpp 1.1.1 2026-01-10 [1] CRAN (R 4.4.1)
## readr * 2.1.6 2025-11-14 [1] CRAN (R 4.4.1)
## remotes 2.5.0 2024-03-17 [1] CRAN (R 4.4.0)
## rlang 1.1.7 2026-01-09 [1] CRAN (R 4.4.1)
## rmarkdown 2.30 2025-09-28 [1] CRAN (R 4.4.1)
## rstudioapi 0.18.0 2026-01-16 [1] CRAN (R 4.4.1)
## S7 0.2.1 2025-11-14 [1] CRAN (R 4.4.1)
## sass 0.4.10 2025-04-11 [1] CRAN (R 4.4.1)
## scales 1.4.0 2025-04-24 [1] CRAN (R 4.4.1)
## seqinr * 4.2-36 2023-12-08 [1] CRAN (R 4.4.0)
## sessioninfo 1.2.3 2025-02-05 [1] CRAN (R 4.4.1)
## stringi 1.8.7 2025-03-27 [1] CRAN (R 4.4.1)
## stringr * 1.6.0 2025-11-04 [1] CRAN (R 4.4.1)
## tibble * 3.3.1 2026-01-11 [1] CRAN (R 4.4.1)
## tidyr * 1.3.2 2025-12-19 [1] CRAN (R 4.4.1)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
## timechange 0.4.0 2026-01-29 [1] CRAN (R 4.4.1)
## tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.4.1)
## usethis * 3.2.1 2025-09-06 [1] CRAN (R 4.4.1)
## utf8 1.2.6 2025-06-08 [1] CRAN (R 4.4.1)
## vctrs 0.7.1 2026-01-23 [1] CRAN (R 4.4.1)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.4.1)
## xfun 0.56 2026-01-18 [1] CRAN (R 4.4.1)
## yaml 2.3.12 2025-12-10 [1] CRAN (R 4.4.1)
##
## [1] /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library
## * ── Packages attached to the search path.
##
## ──────────────────────────────────────────────────────────────────────────────