R Notebook: Provides reproducible analysis for Statistical analysis of multi-barcoded mutant sequences in the following manuscript:

Citation: Lippert LB, Hinton SR, Holston A, Romanowicz KJ, Plesa C. Characterizing Sequence-Function Relationships in Chimeric DcuS/EnvZ Histidine Kinases. In Prep. 2026.

GitHub Repository: https://github.com/PlesaLab/DcuSEnvZ

Experiment

This pipeline processes barcode-sequence-phenotype data previously generated in Merge_and_MEFL_Convert.Rmd and determines which multi-barcoded mutnts have significant differences between their fold-change scores.

Packages

The following R packages must be installed prior to loading into the R session. See the Reproducibility tab for a complete list of packages and their versions used in this workflow.

# Make a vector of required packages
required.packages <- c("devtools", "knitr", "patchwork", "tidyverse", "ggplot2", "dplyr", "tidyr", "magrittr", "stringr", "seqinr", "purrr", "FSA", "rstatix")

# Load required packages
lapply(required.packages, library, character.only = TRUE)

Dial-Out Mutant Selection Pipeline

This section is based on the R file: “Dial_Out_Mutant_Selection.R”. It describes how to load all of the pre-existing barcode-sequence-phenotype data necessary for downstream analysis. Here, we use the Kruskal-Walllis (a nonparametric ANOVA) and post-hoc Dunn statistical tests to determine if there are significant differences between the fold-change scores observed for all multi-barcoded mutants. These mutants determined to have significant differences between their phenotype scores were selected from for dial-out PCR and individual characterization. The end result of this script is two .CSV files containing a set of mutant amino acid sequences, their associated barcodes, and phenotype scores.

Read in data

merged_BC_MEFL_norm <- read.csv(file="./output_files/All_DcuS_Data-medianMEFL.csv", head=T, sep=",")

Wrangle and filter the data

…the data to associate phenotype with the specific amino acid mutation found in mutant sequences.

For each mutant DcuS sequence, align it to wild-type sequence and determine which amino acids differ. Record three columns: amino acid, position, mutation

# Filter to keep only mutants that are the correct length (192 amino acids) and are associated with more than one barcode
multiBC_data <- merged_BC_MEFL_norm %>%
  filter(str_length(AAseq)==192) %>%
  group_by(AAseq) %>%
  filter(n_distinct(barcode) > 1) %>%
  ungroup()

# make a vector of the wild-type sequence to compare mutant sequences to
wt_seq = "RHSLPYRMLRKRPMKLSTTVILMVSAVLFSVLLVVHLIYFSQISDMTRDGLANKALAVARTLADSPEIRQGLQKKPQESGIQAIAEAVRKRNDLLFIVVTDMQSLRYSHPEAQRIGQPFKGDDILKALNGEENVAINRGFLAQALRVFTPIYDENHKQIGVVAIGLELSRVTQQINDSRWSLQMAAGVKQLA"

# WT sequence as a vector
wt_seq_vec <- str_split(wt_seq, "")[[1]]

# custom mutation parser
find_mutations <- function(mut_seq) {
  mut_vec <- str_split(mut_seq, "")[[1]]
  # safety check
  if (length(mut_vec) != length(wt_seq_vec)) {
    return(NA_character_)
  }
  diff_pos <- which(mut_vec != wt_seq_vec)
  # no mutations
  if (length(diff_pos) == 0) {
    return(NA_character_)
  }
  # output of the function:
  paste0(
    wt_seq_vec[diff_pos],
    diff_pos +1,
    mut_vec[diff_pos],
    collapse = "_"
  )
}

multiBC_data <- multiBC_data %>% 
  # Create the mutation identifier string
  mutate(mutations = sapply(AAseq, find_mutations)) 

Statistical test on fold-change scores

Define a function to perform the Kruskal-Wallis and Dunn tests for each observed multi-barcoded mutant

analyze_mutant_wide_FoldChange <- function(df_mutant) {
  # number of barcodes for each mutant
  n_barcodes = nrow(df_mutant)
  seq = df_mutant$AAseq
  
  # Pivot only inside the function for stats
  df_long <- df_mutant %>%
    pivot_longer(
      cols = c(NoLig_fc, Fum_fc, Asp_fc),
      names_to = "Condition",
      values_to = "MEFL"
    )
  # Kruskal–Wallis
  kw <- kruskal.test(MEFL ~ Condition, data = df_long)
  # Dunn post-hoc (safe)
  dunn <- tryCatch({
    dunnTest(MEFL ~ Condition, data = df_long, method = "bh")$res
  }, error = function(e) NULL)
  # Medians computed directly from wide-format columns
  
  med_NoLig <- median(df_mutant$NoLig_fc, na.rm = TRUE)
  med_Fum   <- median(df_mutant$Fum_fc, na.rm = TRUE)
  med_Asp   <- median(df_mutant$Asp_fc, na.rm = TRUE)
  # Create a tibble of medians and deltas
  medians <- tibble(
    Condition = c("NoLig_fc_med", "Fum_fc_med", "Asp_fc_med"),
    median_MEFL = c(med_NoLig, med_Fum, med_Asp),
  )
  # Return everything
  list(
    seq = seq,
    kw_p = kw$p.value,
    dunn = dunn,
    Medians = medians,
    n_barcodes = n_barcodes
  )
}

Run the statistical tests directly from multiBC_data

results_FoldChange <- multiBC_data %>%
  group_by(AAseq) %>%
  group_map(~ analyze_mutant_wide_FoldChange(.x), .keep = TRUE)

names(results_FoldChange) <- map_chr(results_FoldChange, ~ unique(.x$seq))

Create a summary table of the statistical test results

summary_table_FoldChange <- map_df(
  names(results_FoldChange),
  function(m) {
    res <- results_FoldChange[[m]]
    # Extract barcode lists and values for this mutant
    mutant_df <- multiBC_data %>% filter(AAseq == m)
    barcodes <- mutant_df$barcode %>% unique() %>% paste(collapse = ",")
    
    NoLig_Median_MEFL_median = median(mutant_df$NoLig_Median_MEFL)
    Fum_Median_MEFL_median = median(mutant_df$Fum_Median_MEFL)
    Asp_Median_MEFL_median = median(mutant_df$Asp_Median_MEFL)
    
    NoLig_fc_median = median(mutant_df$NoLig_fc)
    Fum_fc_median = median(mutant_df$Fum_fc)
    Asp_fc_median = median(mutant_df$Asp_fc)
    
    Fum_DNR_median = median(mutant_df$Fum_DNR)
    Asp_DNR_median = median(mutant_df$Asp_DNR)
    Lig_spec_median = median(mutant_df$Lig_spec)
    
    # Handle Dunn p-values safely
    if (!is.null(res$dunn) && "P.adj" %in% colnames(res$dunn)) {
      dmin <- suppressWarnings(min(res$dunn$P.adj, na.rm = TRUE))
      if (is.infinite(dmin)) dmin <- NA
    } else {
      dmin <- NA
    }
    # Handle effect size safely
    if (!is.null(res$Medians) && "delta_vs_NoLig" %in% colnames(res$Medians)) {
      deltas <- suppressWarnings(as.numeric(res$Medians$delta_vs_NoLig))
      max_delta <- if (all(is.na(deltas))) NA else max(abs(deltas), na.rm = TRUE)
    } else {
      max_delta <- NA
    }
    tibble(
      seq = m,
      barcodes = barcodes,
      n_barcodes = str_count(barcodes, ",") + 1,
      
      KW_p = res$kw_p,
      Dunn_min_padj = dmin,
      Max_abs_delta = max_delta,
  
      NoLig_Median_MEFL_median = NoLig_Median_MEFL_median,
      Fum_Median_MEFL_median = Fum_Median_MEFL_median,
      Asp_Median_MEFL_median = Asp_Median_MEFL_median,
      
      NoLig_fc_median = NoLig_fc_median,
      Fum_fc_median = Fum_fc_median,
      Asp_fc_median = Asp_fc_median,
      
      Fum_DNR_median = Fum_DNR_median,
      Asp_DNR_median = Asp_DNR_median,
      Lig_spec_median = Lig_spec_median,

    )
  }
)

Filter to keep only mutants with a Kruskal-Wallis p-value less than 0.05

significant_mutants_FoldChange <- summary_table_FoldChange %>%
  filter(KW_p < 0.05)

Export the dial-out mutant data

write.csv(significant_mutants_FoldChange, "./output_files/significant_mutants_FoldChange.csv", row.names = FALSE)

Statistical test on inferred MEFL values

Define a function to perform the Kruskal-Wallis and Dunn tests for each observed multi-barcoded mutant

analyze_mutant_wide_MedianMEFL <- function(df_mutant) {
  # number of barcodes for each mutant
  n_barcodes = nrow(df_mutant)
  seq = df_mutant$AAseq
  
  # Pivot only inside the function for stats
  df_long <- df_mutant %>%
    pivot_longer(
      cols = c(NoLig_Median_MEFL, Fum_Median_MEFL, Asp_Median_MEFL),
      names_to = "Condition",
      values_to = "MEFL"
    )
  # Kruskal–Wallis
  kw <- kruskal.test(MEFL ~ Condition, data = df_long)
  # Dunn post-hoc (safe)
  dunn <- tryCatch({
    dunnTest(MEFL ~ Condition, data = df_long, method = "bh")$res
  }, error = function(e) NULL)
  # Medians computed directly from wide-format columns
  
  med_NoLig <- median(df_mutant$NoLig_Median_MEFL, na.rm = TRUE)
  med_Fum   <- median(df_mutant$Fum_Median_MEFL, na.rm = TRUE)
  med_Asp   <- median(df_mutant$Asp_Median_MEFL, na.rm = TRUE)
  # Create a tibble of medians and deltas
  medians <- tibble(
    Condition = c("NoLig_Median_MEFL_med", "Fum_Median_MEFL_med", "Asp_Median_MEFL_med"),
    median_MEFL = c(med_NoLig, med_Fum, med_Asp),
  )
  # Return everything
  list(
    seq = seq,
    kw_p = kw$p.value,
    dunn = dunn,
    Medians = medians,
    n_barcodes = n_barcodes
  )
}

Run the statistical tests directly from multiBC_data

results_MedianMEFL <- multiBC_data %>%
  group_by(AAseq) %>%
  group_map(~ analyze_mutant_wide_MedianMEFL(.x), .keep = TRUE)

names(results_MedianMEFL) <- map_chr(results_MedianMEFL, ~ unique(.x$seq))

Create a summary table of the statistical test results

summary_table_MedianMEFL <- map_df(
  names(results_MedianMEFL),
  function(m) {
    res <- results_MedianMEFL[[m]]
    # Extract barcode lists and values for this mutant
    mutant_df <- multiBC_data %>% filter(AAseq == m)
    barcodes <- mutant_df$barcode %>% unique() %>% paste(collapse = ",")
    
    NoLig_Median_MEFL_median = median(mutant_df$NoLig_Median_MEFL)
    Fum_Median_MEFL_median = median(mutant_df$Fum_Median_MEFL)
    Asp_Median_MEFL_median = median(mutant_df$Asp_Median_MEFL)
    
    NoLig_fc_median = median(mutant_df$NoLig_fc)
    Fum_fc_median = median(mutant_df$Fum_fc)
    Asp_fc_median = median(mutant_df$Asp_fc)
    
    Fum_DNR_median = median(mutant_df$Fum_DNR)
    Asp_DNR_median = median(mutant_df$Asp_DNR)
    Lig_spec_median = median(mutant_df$Lig_spec)
    
    # Handle Dunn p-values safely
    if (!is.null(res$dunn) && "P.adj" %in% colnames(res$dunn)) {
      dmin <- suppressWarnings(min(res$dunn$P.adj, na.rm = TRUE))
      if (is.infinite(dmin)) dmin <- NA
    } else {
      dmin <- NA
    }
    # Handle effect size safely
    if (!is.null(res$Medians) && "delta_vs_NoLig" %in% colnames(res$Medians)) {
      deltas <- suppressWarnings(as.numeric(res$Medians$delta_vs_NoLig))
      max_delta <- if (all(is.na(deltas))) NA else max(abs(deltas), na.rm = TRUE)
    } else {
      max_delta <- NA
    }
    tibble(
      seq = m,
      barcodes = barcodes,
      n_barcodes = str_count(barcodes, ",") + 1,
      
      KW_p = res$kw_p,
      Dunn_min_padj = dmin,
      Max_abs_delta = max_delta,
  
      NoLig_Median_MEFL_median = NoLig_Median_MEFL_median,
      Fum_Median_MEFL_median = Fum_Median_MEFL_median,
      Asp_Median_MEFL_median = Asp_Median_MEFL_median,
      
      NoLig_fc_median = NoLig_fc_median,
      Fum_fc_median = Fum_fc_median,
      Asp_fc_median = Asp_fc_median,
      
      Fum_DNR_median = Fum_DNR_median,
      Asp_DNR_median = Asp_DNR_median,
      Lig_spec_median = Lig_spec_median,

    )
  }
)

Filter to keep only mutants with a Kruskal-Wallis p-value less than 0.05

significant_mutants_MedianMEFL <- summary_table_MedianMEFL %>%
  filter(KW_p < 0.05)

Export the dial-out mutant data

write.csv(significant_mutants_MedianMEFL, "./output_files/significant_mutants_MedianMEFL.csv", row.names = FALSE)

Reproducibility

The session information is provided for full reproducibility.

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.4.1 (2024-06-14)
##  os       macOS 15.7.3
##  system   x86_64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/Los_Angeles
##  date     2026-05-20
##  pandoc   3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/ (via rmarkdown)
##  quarto   1.7.32 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/quarto
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version date (UTC) lib source
##  abind          1.4-8   2024-09-12 [1] CRAN (R 4.4.1)
##  ade4           1.7-23  2025-02-14 [1] CRAN (R 4.4.1)
##  backports      1.5.0   2024-05-23 [1] CRAN (R 4.4.0)
##  broom          1.0.12  2026-01-27 [1] CRAN (R 4.4.1)
##  bslib          0.10.0  2026-01-26 [1] CRAN (R 4.4.1)
##  cachem         1.1.0   2024-05-16 [1] CRAN (R 4.4.0)
##  car            3.1-5   2026-02-03 [1] CRAN (R 4.4.1)
##  carData        3.0-6   2026-01-30 [1] CRAN (R 4.4.1)
##  cli            3.6.5   2025-04-23 [1] CRAN (R 4.4.1)
##  devtools     * 2.4.6   2025-10-03 [1] CRAN (R 4.4.1)
##  dichromat      2.0-0.1 2022-05-02 [1] CRAN (R 4.4.0)
##  digest         0.6.39  2025-11-19 [1] CRAN (R 4.4.1)
##  dplyr        * 1.2.0   2026-02-03 [1] CRAN (R 4.4.1)
##  dunn.test      1.3.6   2024-04-12 [1] CRAN (R 4.4.0)
##  ellipsis       0.3.2   2021-04-29 [1] CRAN (R 4.4.0)
##  evaluate       1.0.5   2025-08-27 [1] CRAN (R 4.4.1)
##  farver         2.1.2   2024-05-13 [1] CRAN (R 4.4.0)
##  fastmap        1.2.0   2024-05-15 [1] CRAN (R 4.4.0)
##  forcats      * 1.0.1   2025-09-25 [1] CRAN (R 4.4.1)
##  Formula        1.2-5   2023-02-24 [1] CRAN (R 4.4.0)
##  fs             1.6.6   2025-04-12 [1] CRAN (R 4.4.1)
##  FSA          * 0.10.1  2026-01-10 [1] CRAN (R 4.4.1)
##  generics       0.1.4   2025-05-09 [1] CRAN (R 4.4.1)
##  ggplot2      * 4.0.2   2026-02-03 [1] CRAN (R 4.4.1)
##  glue           1.8.0   2024-09-30 [1] CRAN (R 4.4.1)
##  gtable         0.3.6   2024-10-25 [1] CRAN (R 4.4.1)
##  hms            1.1.4   2025-10-17 [1] CRAN (R 4.4.1)
##  htmltools      0.5.9   2025-12-04 [1] CRAN (R 4.4.1)
##  jquerylib      0.1.4   2021-04-26 [1] CRAN (R 4.4.0)
##  jsonlite       2.0.0   2025-03-27 [1] CRAN (R 4.4.1)
##  knitr        * 1.51    2025-12-20 [1] CRAN (R 4.4.1)
##  lifecycle      1.0.5   2026-01-08 [1] CRAN (R 4.4.1)
##  lubridate    * 1.9.5   2026-02-04 [1] CRAN (R 4.4.1)
##  magrittr     * 2.0.4   2025-09-12 [1] CRAN (R 4.4.1)
##  MASS           7.3-65  2025-02-28 [1] CRAN (R 4.4.1)
##  memoise        2.0.1   2021-11-26 [1] CRAN (R 4.4.0)
##  otel           0.2.0   2025-08-29 [1] CRAN (R 4.4.1)
##  patchwork    * 1.3.2   2025-08-25 [1] CRAN (R 4.4.1)
##  pillar         1.11.1  2025-09-17 [1] CRAN (R 4.4.1)
##  pkgbuild       1.4.8   2025-05-26 [1] CRAN (R 4.4.1)
##  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.4.0)
##  pkgload        1.5.0   2026-02-03 [1] CRAN (R 4.4.1)
##  purrr        * 1.2.1   2026-01-09 [1] CRAN (R 4.4.1)
##  R6             2.6.1   2025-02-15 [1] CRAN (R 4.4.1)
##  RColorBrewer   1.1-3   2022-04-03 [1] CRAN (R 4.4.0)
##  Rcpp           1.1.1   2026-01-10 [1] CRAN (R 4.4.1)
##  readr        * 2.1.6   2025-11-14 [1] CRAN (R 4.4.1)
##  remotes        2.5.0   2024-03-17 [1] CRAN (R 4.4.0)
##  rlang          1.1.7   2026-01-09 [1] CRAN (R 4.4.1)
##  rmarkdown      2.30    2025-09-28 [1] CRAN (R 4.4.1)
##  rstatix      * 0.7.3   2025-10-18 [1] CRAN (R 4.4.1)
##  rstudioapi     0.18.0  2026-01-16 [1] CRAN (R 4.4.1)
##  S7             0.2.1   2025-11-14 [1] CRAN (R 4.4.1)
##  sass           0.4.10  2025-04-11 [1] CRAN (R 4.4.1)
##  scales         1.4.0   2025-04-24 [1] CRAN (R 4.4.1)
##  seqinr       * 4.2-36  2023-12-08 [1] CRAN (R 4.4.0)
##  sessioninfo    1.2.3   2025-02-05 [1] CRAN (R 4.4.1)
##  stringi        1.8.7   2025-03-27 [1] CRAN (R 4.4.1)
##  stringr      * 1.6.0   2025-11-04 [1] CRAN (R 4.4.1)
##  tibble       * 3.3.1   2026-01-11 [1] CRAN (R 4.4.1)
##  tidyr        * 1.3.2   2025-12-19 [1] CRAN (R 4.4.1)
##  tidyselect     1.2.1   2024-03-11 [1] CRAN (R 4.4.0)
##  tidyverse    * 2.0.0   2023-02-22 [1] CRAN (R 4.4.0)
##  timechange     0.4.0   2026-01-29 [1] CRAN (R 4.4.1)
##  tzdb           0.5.0   2025-03-15 [1] CRAN (R 4.4.1)
##  usethis      * 3.2.1   2025-09-06 [1] CRAN (R 4.4.1)
##  vctrs          0.7.1   2026-01-23 [1] CRAN (R 4.4.1)
##  withr          3.0.2   2024-10-28 [1] CRAN (R 4.4.1)
##  xfun           0.56    2026-01-18 [1] CRAN (R 4.4.1)
##  yaml           2.3.12  2025-12-10 [1] CRAN (R 4.4.1)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library
##  * ── Packages attached to the search path.
## 
## ──────────────────────────────────────────────────────────────────────────────