R Notebook: Provides reproducible analysis for Statistical analysis of multi-barcoded mutant sequences in the following manuscript:
Citation: Lippert LB, Hinton SR, Holston A, Romanowicz KJ, Plesa C. Characterizing Sequence-Function Relationships in Chimeric DcuS/EnvZ Histidine Kinases. In Prep. 2026.
GitHub Repository: https://github.com/PlesaLab/DcuSEnvZ
This pipeline processes barcode-sequence-phenotype data previously generated in Merge_and_MEFL_Convert.Rmd and determines which multi-barcoded mutnts have significant differences between their fold-change scores.
The following R packages must be installed prior to loading into the R session. See the Reproducibility tab for a complete list of packages and their versions used in this workflow.
# Make a vector of required packages
required.packages <- c("devtools", "knitr", "patchwork", "tidyverse", "ggplot2", "dplyr", "tidyr", "magrittr", "stringr", "seqinr", "purrr", "FSA", "rstatix")
# Load required packages
lapply(required.packages, library, character.only = TRUE)
This section is based on the R file: “Dial_Out_Mutant_Selection.R”. It describes how to load all of the pre-existing barcode-sequence-phenotype data necessary for downstream analysis. Here, we use the Kruskal-Walllis (a nonparametric ANOVA) and post-hoc Dunn statistical tests to determine if there are significant differences between the fold-change scores observed for all multi-barcoded mutants. These mutants determined to have significant differences between their phenotype scores were selected from for dial-out PCR and individual characterization. The end result of this script is two .CSV files containing a set of mutant amino acid sequences, their associated barcodes, and phenotype scores.
merged_BC_MEFL_norm <- read.csv(file="./output_files/All_DcuS_Data-medianMEFL.csv", head=T, sep=",")
…the data to associate phenotype with the specific amino acid mutation found in mutant sequences.
For each mutant DcuS sequence, align it to wild-type sequence and determine which amino acids differ. Record three columns: amino acid, position, mutation
# Filter to keep only mutants that are the correct length (192 amino acids) and are associated with more than one barcode
multiBC_data <- merged_BC_MEFL_norm %>%
filter(str_length(AAseq)==192) %>%
group_by(AAseq) %>%
filter(n_distinct(barcode) > 1) %>%
ungroup()
# make a vector of the wild-type sequence to compare mutant sequences to
wt_seq = "RHSLPYRMLRKRPMKLSTTVILMVSAVLFSVLLVVHLIYFSQISDMTRDGLANKALAVARTLADSPEIRQGLQKKPQESGIQAIAEAVRKRNDLLFIVVTDMQSLRYSHPEAQRIGQPFKGDDILKALNGEENVAINRGFLAQALRVFTPIYDENHKQIGVVAIGLELSRVTQQINDSRWSLQMAAGVKQLA"
# WT sequence as a vector
wt_seq_vec <- str_split(wt_seq, "")[[1]]
# custom mutation parser
find_mutations <- function(mut_seq) {
mut_vec <- str_split(mut_seq, "")[[1]]
# safety check
if (length(mut_vec) != length(wt_seq_vec)) {
return(NA_character_)
}
diff_pos <- which(mut_vec != wt_seq_vec)
# no mutations
if (length(diff_pos) == 0) {
return(NA_character_)
}
# output of the function:
paste0(
wt_seq_vec[diff_pos],
diff_pos +1,
mut_vec[diff_pos],
collapse = "_"
)
}
multiBC_data <- multiBC_data %>%
# Create the mutation identifier string
mutate(mutations = sapply(AAseq, find_mutations))
Define a function to perform the Kruskal-Wallis and Dunn tests for each observed multi-barcoded mutant
analyze_mutant_wide_FoldChange <- function(df_mutant) {
# number of barcodes for each mutant
n_barcodes = nrow(df_mutant)
seq = df_mutant$AAseq
# Pivot only inside the function for stats
df_long <- df_mutant %>%
pivot_longer(
cols = c(NoLig_fc, Fum_fc, Asp_fc),
names_to = "Condition",
values_to = "MEFL"
)
# Kruskal–Wallis
kw <- kruskal.test(MEFL ~ Condition, data = df_long)
# Dunn post-hoc (safe)
dunn <- tryCatch({
dunnTest(MEFL ~ Condition, data = df_long, method = "bh")$res
}, error = function(e) NULL)
# Medians computed directly from wide-format columns
med_NoLig <- median(df_mutant$NoLig_fc, na.rm = TRUE)
med_Fum <- median(df_mutant$Fum_fc, na.rm = TRUE)
med_Asp <- median(df_mutant$Asp_fc, na.rm = TRUE)
# Create a tibble of medians and deltas
medians <- tibble(
Condition = c("NoLig_fc_med", "Fum_fc_med", "Asp_fc_med"),
median_MEFL = c(med_NoLig, med_Fum, med_Asp),
)
# Return everything
list(
seq = seq,
kw_p = kw$p.value,
dunn = dunn,
Medians = medians,
n_barcodes = n_barcodes
)
}
Run the statistical tests directly from multiBC_data
results_FoldChange <- multiBC_data %>%
group_by(AAseq) %>%
group_map(~ analyze_mutant_wide_FoldChange(.x), .keep = TRUE)
names(results_FoldChange) <- map_chr(results_FoldChange, ~ unique(.x$seq))
Create a summary table of the statistical test results
summary_table_FoldChange <- map_df(
names(results_FoldChange),
function(m) {
res <- results_FoldChange[[m]]
# Extract barcode lists and values for this mutant
mutant_df <- multiBC_data %>% filter(AAseq == m)
barcodes <- mutant_df$barcode %>% unique() %>% paste(collapse = ",")
NoLig_Median_MEFL_median = median(mutant_df$NoLig_Median_MEFL)
Fum_Median_MEFL_median = median(mutant_df$Fum_Median_MEFL)
Asp_Median_MEFL_median = median(mutant_df$Asp_Median_MEFL)
NoLig_fc_median = median(mutant_df$NoLig_fc)
Fum_fc_median = median(mutant_df$Fum_fc)
Asp_fc_median = median(mutant_df$Asp_fc)
Fum_DNR_median = median(mutant_df$Fum_DNR)
Asp_DNR_median = median(mutant_df$Asp_DNR)
Lig_spec_median = median(mutant_df$Lig_spec)
# Handle Dunn p-values safely
if (!is.null(res$dunn) && "P.adj" %in% colnames(res$dunn)) {
dmin <- suppressWarnings(min(res$dunn$P.adj, na.rm = TRUE))
if (is.infinite(dmin)) dmin <- NA
} else {
dmin <- NA
}
# Handle effect size safely
if (!is.null(res$Medians) && "delta_vs_NoLig" %in% colnames(res$Medians)) {
deltas <- suppressWarnings(as.numeric(res$Medians$delta_vs_NoLig))
max_delta <- if (all(is.na(deltas))) NA else max(abs(deltas), na.rm = TRUE)
} else {
max_delta <- NA
}
tibble(
seq = m,
barcodes = barcodes,
n_barcodes = str_count(barcodes, ",") + 1,
KW_p = res$kw_p,
Dunn_min_padj = dmin,
Max_abs_delta = max_delta,
NoLig_Median_MEFL_median = NoLig_Median_MEFL_median,
Fum_Median_MEFL_median = Fum_Median_MEFL_median,
Asp_Median_MEFL_median = Asp_Median_MEFL_median,
NoLig_fc_median = NoLig_fc_median,
Fum_fc_median = Fum_fc_median,
Asp_fc_median = Asp_fc_median,
Fum_DNR_median = Fum_DNR_median,
Asp_DNR_median = Asp_DNR_median,
Lig_spec_median = Lig_spec_median,
)
}
)
Filter to keep only mutants with a Kruskal-Wallis p-value less than 0.05
significant_mutants_FoldChange <- summary_table_FoldChange %>%
filter(KW_p < 0.05)
Export the dial-out mutant data
write.csv(significant_mutants_FoldChange, "./output_files/significant_mutants_FoldChange.csv", row.names = FALSE)
Define a function to perform the Kruskal-Wallis and Dunn tests for each observed multi-barcoded mutant
analyze_mutant_wide_MedianMEFL <- function(df_mutant) {
# number of barcodes for each mutant
n_barcodes = nrow(df_mutant)
seq = df_mutant$AAseq
# Pivot only inside the function for stats
df_long <- df_mutant %>%
pivot_longer(
cols = c(NoLig_Median_MEFL, Fum_Median_MEFL, Asp_Median_MEFL),
names_to = "Condition",
values_to = "MEFL"
)
# Kruskal–Wallis
kw <- kruskal.test(MEFL ~ Condition, data = df_long)
# Dunn post-hoc (safe)
dunn <- tryCatch({
dunnTest(MEFL ~ Condition, data = df_long, method = "bh")$res
}, error = function(e) NULL)
# Medians computed directly from wide-format columns
med_NoLig <- median(df_mutant$NoLig_Median_MEFL, na.rm = TRUE)
med_Fum <- median(df_mutant$Fum_Median_MEFL, na.rm = TRUE)
med_Asp <- median(df_mutant$Asp_Median_MEFL, na.rm = TRUE)
# Create a tibble of medians and deltas
medians <- tibble(
Condition = c("NoLig_Median_MEFL_med", "Fum_Median_MEFL_med", "Asp_Median_MEFL_med"),
median_MEFL = c(med_NoLig, med_Fum, med_Asp),
)
# Return everything
list(
seq = seq,
kw_p = kw$p.value,
dunn = dunn,
Medians = medians,
n_barcodes = n_barcodes
)
}
Run the statistical tests directly from multiBC_data
results_MedianMEFL <- multiBC_data %>%
group_by(AAseq) %>%
group_map(~ analyze_mutant_wide_MedianMEFL(.x), .keep = TRUE)
names(results_MedianMEFL) <- map_chr(results_MedianMEFL, ~ unique(.x$seq))
Create a summary table of the statistical test results
summary_table_MedianMEFL <- map_df(
names(results_MedianMEFL),
function(m) {
res <- results_MedianMEFL[[m]]
# Extract barcode lists and values for this mutant
mutant_df <- multiBC_data %>% filter(AAseq == m)
barcodes <- mutant_df$barcode %>% unique() %>% paste(collapse = ",")
NoLig_Median_MEFL_median = median(mutant_df$NoLig_Median_MEFL)
Fum_Median_MEFL_median = median(mutant_df$Fum_Median_MEFL)
Asp_Median_MEFL_median = median(mutant_df$Asp_Median_MEFL)
NoLig_fc_median = median(mutant_df$NoLig_fc)
Fum_fc_median = median(mutant_df$Fum_fc)
Asp_fc_median = median(mutant_df$Asp_fc)
Fum_DNR_median = median(mutant_df$Fum_DNR)
Asp_DNR_median = median(mutant_df$Asp_DNR)
Lig_spec_median = median(mutant_df$Lig_spec)
# Handle Dunn p-values safely
if (!is.null(res$dunn) && "P.adj" %in% colnames(res$dunn)) {
dmin <- suppressWarnings(min(res$dunn$P.adj, na.rm = TRUE))
if (is.infinite(dmin)) dmin <- NA
} else {
dmin <- NA
}
# Handle effect size safely
if (!is.null(res$Medians) && "delta_vs_NoLig" %in% colnames(res$Medians)) {
deltas <- suppressWarnings(as.numeric(res$Medians$delta_vs_NoLig))
max_delta <- if (all(is.na(deltas))) NA else max(abs(deltas), na.rm = TRUE)
} else {
max_delta <- NA
}
tibble(
seq = m,
barcodes = barcodes,
n_barcodes = str_count(barcodes, ",") + 1,
KW_p = res$kw_p,
Dunn_min_padj = dmin,
Max_abs_delta = max_delta,
NoLig_Median_MEFL_median = NoLig_Median_MEFL_median,
Fum_Median_MEFL_median = Fum_Median_MEFL_median,
Asp_Median_MEFL_median = Asp_Median_MEFL_median,
NoLig_fc_median = NoLig_fc_median,
Fum_fc_median = Fum_fc_median,
Asp_fc_median = Asp_fc_median,
Fum_DNR_median = Fum_DNR_median,
Asp_DNR_median = Asp_DNR_median,
Lig_spec_median = Lig_spec_median,
)
}
)
Filter to keep only mutants with a Kruskal-Wallis p-value less than 0.05
significant_mutants_MedianMEFL <- summary_table_MedianMEFL %>%
filter(KW_p < 0.05)
Export the dial-out mutant data
write.csv(significant_mutants_MedianMEFL, "./output_files/significant_mutants_MedianMEFL.csv", row.names = FALSE)
The session information is provided for full reproducibility.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.4.1 (2024-06-14)
## os macOS 15.7.3
## system x86_64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Los_Angeles
## date 2026-05-20
## pandoc 3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/ (via rmarkdown)
## quarto 1.7.32 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-8 2024-09-12 [1] CRAN (R 4.4.1)
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.4.1)
## backports 1.5.0 2024-05-23 [1] CRAN (R 4.4.0)
## broom 1.0.12 2026-01-27 [1] CRAN (R 4.4.1)
## bslib 0.10.0 2026-01-26 [1] CRAN (R 4.4.1)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.4.0)
## car 3.1-5 2026-02-03 [1] CRAN (R 4.4.1)
## carData 3.0-6 2026-01-30 [1] CRAN (R 4.4.1)
## cli 3.6.5 2025-04-23 [1] CRAN (R 4.4.1)
## devtools * 2.4.6 2025-10-03 [1] CRAN (R 4.4.1)
## dichromat 2.0-0.1 2022-05-02 [1] CRAN (R 4.4.0)
## digest 0.6.39 2025-11-19 [1] CRAN (R 4.4.1)
## dplyr * 1.2.0 2026-02-03 [1] CRAN (R 4.4.1)
## dunn.test 1.3.6 2024-04-12 [1] CRAN (R 4.4.0)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.4.0)
## evaluate 1.0.5 2025-08-27 [1] CRAN (R 4.4.1)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.4.0)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
## forcats * 1.0.1 2025-09-25 [1] CRAN (R 4.4.1)
## Formula 1.2-5 2023-02-24 [1] CRAN (R 4.4.0)
## fs 1.6.6 2025-04-12 [1] CRAN (R 4.4.1)
## FSA * 0.10.1 2026-01-10 [1] CRAN (R 4.4.1)
## generics 0.1.4 2025-05-09 [1] CRAN (R 4.4.1)
## ggplot2 * 4.0.2 2026-02-03 [1] CRAN (R 4.4.1)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.4.1)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.4.1)
## hms 1.1.4 2025-10-17 [1] CRAN (R 4.4.1)
## htmltools 0.5.9 2025-12-04 [1] CRAN (R 4.4.1)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.4.0)
## jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.4.1)
## knitr * 1.51 2025-12-20 [1] CRAN (R 4.4.1)
## lifecycle 1.0.5 2026-01-08 [1] CRAN (R 4.4.1)
## lubridate * 1.9.5 2026-02-04 [1] CRAN (R 4.4.1)
## magrittr * 2.0.4 2025-09-12 [1] CRAN (R 4.4.1)
## MASS 7.3-65 2025-02-28 [1] CRAN (R 4.4.1)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.4.0)
## otel 0.2.0 2025-08-29 [1] CRAN (R 4.4.1)
## patchwork * 1.3.2 2025-08-25 [1] CRAN (R 4.4.1)
## pillar 1.11.1 2025-09-17 [1] CRAN (R 4.4.1)
## pkgbuild 1.4.8 2025-05-26 [1] CRAN (R 4.4.1)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
## pkgload 1.5.0 2026-02-03 [1] CRAN (R 4.4.1)
## purrr * 1.2.1 2026-01-09 [1] CRAN (R 4.4.1)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.4.1)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.4.0)
## Rcpp 1.1.1 2026-01-10 [1] CRAN (R 4.4.1)
## readr * 2.1.6 2025-11-14 [1] CRAN (R 4.4.1)
## remotes 2.5.0 2024-03-17 [1] CRAN (R 4.4.0)
## rlang 1.1.7 2026-01-09 [1] CRAN (R 4.4.1)
## rmarkdown 2.30 2025-09-28 [1] CRAN (R 4.4.1)
## rstatix * 0.7.3 2025-10-18 [1] CRAN (R 4.4.1)
## rstudioapi 0.18.0 2026-01-16 [1] CRAN (R 4.4.1)
## S7 0.2.1 2025-11-14 [1] CRAN (R 4.4.1)
## sass 0.4.10 2025-04-11 [1] CRAN (R 4.4.1)
## scales 1.4.0 2025-04-24 [1] CRAN (R 4.4.1)
## seqinr * 4.2-36 2023-12-08 [1] CRAN (R 4.4.0)
## sessioninfo 1.2.3 2025-02-05 [1] CRAN (R 4.4.1)
## stringi 1.8.7 2025-03-27 [1] CRAN (R 4.4.1)
## stringr * 1.6.0 2025-11-04 [1] CRAN (R 4.4.1)
## tibble * 3.3.1 2026-01-11 [1] CRAN (R 4.4.1)
## tidyr * 1.3.2 2025-12-19 [1] CRAN (R 4.4.1)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
## timechange 0.4.0 2026-01-29 [1] CRAN (R 4.4.1)
## tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.4.1)
## usethis * 3.2.1 2025-09-06 [1] CRAN (R 4.4.1)
## vctrs 0.7.1 2026-01-23 [1] CRAN (R 4.4.1)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.4.1)
## xfun 0.56 2026-01-18 [1] CRAN (R 4.4.1)
## yaml 2.3.12 2025-12-10 [1] CRAN (R 4.4.1)
##
## [1] /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library
## * ── Packages attached to the search path.
##
## ──────────────────────────────────────────────────────────────────────────────