Data
Read the previous merged data
datPre <- read_tsv("../output/20220614-mar22-batch-bg-sub-data.tsv", col_types = "cccfdddddll") %>%
mutate(host = ordered(host))
list.mutations <- as.character(c(215, 209, 211, 210, 217, 218, 220, 221, 223, 224, 222, 227, 230, 229, 231, 233, 240, 241, 250:255, 257, 258))
exptPre <- datPre %>%
group_by(date, plasmid, host) %>%
summarize(n = n(), n_filter = sum(!low_event_count), .groups = "drop") %>%
mutate(flag = ifelse(plasmid %in% list.mutations, "mutation", ""))
Read the new batches of data
raw <- list(
"05/12/22" = read_tsv("../data/20220512-EO-repeat-batch-1/20220512-gated-median-out.txt", show_col_types = FALSE),
"05/17/22" = read_tsv("../data/20220517-EO-repeat-batch-2/20220517-gated-median-out.txt", show_col_types = FALSE),
"05/19/22" = read_tsv("../data/20220519-EO-repeat-batch-3/20220519-gated-median-out.txt", show_col_types = FALSE),
"05/31/22" = read_tsv("../data/20220531-EO-repeat-batch-4/20220531-gated-median-out.txt", show_col_types = FALSE),
"06/02/22" = read_tsv("../data/20220602-EO-repeat-batch-5/20220602-gated-median-out.txt", show_col_types = FALSE)
)
dat <- bind_rows(raw, .id = "date") %>%
#separate(col = well, sep = 1, into = c("row", "col")) %>%
separate(col = sample, sep = "-", into = c("plasmid", "host", "rep")) %>%
mutate(plasmid = na_if(plasmid, "NA"))
QC
1. Number of events per sample
dat %>% ggplot(aes(x = n_induction/1000)) + geom_histogram(bins = 30, fill = "forestgreen") +
xlab("# events x 1000") +
facet_grid(host~date, scales = "free_y") + theme_bw(base_size = 14) #+ panel_border()

Majority of the experimental wells (not blank) have > 10000
events, with the exception of a few samples
dat %>% filter(host != "blank", n_induction < 10000)
Nearly all samples with a low event count were in the 555 background.
Emily repeated many of them. Will remove all samples with lower than
7,000 events in the final gated population from further analysis.
Flag samples that have a low event count, contain mutations or show
multiple populations.
event.th <- 7000
dat <- dat %>%
mutate(
low_event_count = n_induction < event.th,
# Emily labeled the old plasmids with mutations with an "o" in the flow sample name column
mutation = ifelse(grepl("o$", name), TRUE, FALSE)
) %>%
filter(host != "blank")
2. Background subtraction
Check the background fluorescence levels across different days.
dat %>% filter(host == "156") %>%
mutate(date = gsub("/22", "", date)) %>%
pivot_longer(BL1.H:nRFP, names_to = "parameter", values_to = "intensity") %>%
mutate(parameter = ordered(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP"))) %>%
ggplot(aes(x = date, y = intensity)) + geom_point(aes(shape = well)) +
stat_summary(fun.data = "mean_se", geom = "pointrange", color = "red", position = position_nudge(x = 0.1)) +
facet_wrap(~parameter, scale = "free_y") +
theme_bw(base_size = 14)

06/02 run had smaller FSC values (~ cell volume) compared with the
other days, while the background fluorescence doesn’t appear to be a lot
higher, resulting in higher normalized values.
Subtract the background
bg <- dat %>%
filter(host == "156") %>%
group_by(date) %>%
summarize(across(FSC.H:nRFP, ~ round(mean(.x),1))) %>%
column_to_rownames(var = "date")
dat1 <- dat %>%
filter(host != "blank") %>%
select(date:host, FSC.H:nRFP, low_event_count:mutation) %>%
mutate(
BL1.H = BL1.H - bg[date, "BL1.H"],
YL2.H = YL2.H - bg[date, "YL2.H"],
nGFP = nGFP - bg[date, "nGFP"],
nRFP = nRFP - bg[date, "nRFP"],
)
Double check that the subtraction worked correctly
dat1 %>% filter(host == "156") %>%
pivot_longer(BL1.H:nRFP, names_to = "parameter", values_to = "intensity") %>%
mutate(
parameter = ordered(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP")),
date = gsub("\\/22$", "", date)
) %>%
ggplot(aes(x = date, y = intensity)) + geom_point(aes(shape = well)) +
stat_summary(fun.data = "mean_se", geom = "pointrange", color = "red", position = position_nudge(x = 0.1)) +
facet_wrap(~parameter, scale = "free_y") +
theme_bw(base_size = 14)

# remove the yH156 samples
dat1 <- dat1 %>%
filter(host != "156") %>%
mutate(host = ordered(host, levels = c("555", "373"), labels = c("PHO2", "pho2∆")))
3. Well position
Rationale
In my plate design, I placed a positive control strain (CgPho4-mNeon
in PHO2 background) in the control columns (1, 5, 9) on every
other row (A, C, E, G). The rationale for this is to use it to identify
any well position effect on the fluorescence readings. In this round of
repeat experiments, Emily followed this design and put pH188-yH323 in
these wells. Now we can properly assess the well position effect.
For this analysis, we will combine and compare the May/Jun batch with
the Mar batch
datM <- bind_rows(datPre, dat1)
datM %>%
filter(plasmid == "188", host == "PHO2") %>%
separate(well, into = c("row", "col"), sep = 1) %>%
pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>%
ggplot(aes(x = col, y = intensity, group = date)) +
geom_point(aes(color = date), size = 1) +
#geom_line(aes(color = date), size = 0.4) +
stat_summary(aes(color = date), geom = "line", fun = mean) +
scale_color_brewer(type = "qual", palette = 2) +
xlab("Column") +
facet_grid(parameter~row, scales = "free_y") +
theme_bw(base_size = 14)

- no obvious trend between the rows or columns
- a clear correlation between YL2.H and BL1.H.
Same plot for normalized fluorescence
datM %>%
filter(plasmid == "188", host == "PHO2") %>%
separate(well, into = c("row", "col"), sep = 1) %>%
pivot_longer(c(nGFP, nRFP), names_to = "parameter", values_to = "intensity") %>%
ggplot(aes(x = col, y = intensity, group = date)) +
geom_point(aes(color = date), size = 1) +
#geom_line(aes(color = date), size = 0.4) +
stat_summary(aes(color = date), geom = "line", fun = mean) +
scale_color_brewer(type = "qual", palette = 2) +
xlab("Column") +
facet_grid(parameter~row, scales = "free_y") +
theme_bw(base_size = 14)

- What we care about is the ratio between Pho4-mNeon and
PHO5pr-mCherry
datM %>%
filter(plasmid == "188", host == "PHO2") %>%
separate(well, into = c("row", "col"), sep = 1) %>%
ggplot(aes(x = BL1.H, y = YL2.H)) +
geom_point(aes(color = date, shape = row), size = 3) +
stat_smooth(method = "lm", se = FALSE, color = "gray50", size = 0.5) +
scale_color_brewer(type = "qual", palette = 2) +
#scale_color_viridis_d() +
#scale_shape_manual(values = c(15:17, 23, 25)) +
theme_bw(base_size = 14)
`geom_smooth()` using formula 'y ~ x'

- There is variation in BL1.H and correspondingly in YL2.H, but the
ratio between the two are very consistent
4. Run effect
How does the same control strain behave in different runs
(represented as date variable here)? Is there a strong run effect?
Let’s check both ScPho4 (pH194) and CgPho4 (pH188) to see if their
behaviors are consistent across days and between this and the last
(03/30) batch.
datM %>%
filter(plasmid %in% c("188", "194")) %>%
mutate(
date = gsub("/22$", "", date),
`YL2.H / BL1.H` = YL2.H / BL1.H,
Pho4 = factor(plasmid, levels = c("194", "188"), labels = c("ScPho4", "CgPho4"))
) %>%
separate(well, into = c("row", "col"), sep = 1) %>%
ggplot(aes(x = date, y = `YL2.H / BL1.H`, group = host)) +
geom_point(aes(color = host), position = position_jitter(0.1), alpha = 0.8, size = 1) +
scale_color_viridis_d(begin = 0.2, end = 0.6) +
stat_summary(fun.data = "mean_se", geom = "pointrange", shape = 16, color = "red") +
ylim(0, 30) + facet_grid(Pho4~.) + theme_bw(base_size = 14)

while the characteristic behaviors of ScPho4 and CgPho4 are
consistent across runs, there are also clear variability in the RFP/GFP
ratios between plates for the same genotype.
Next we examine the underlying background-subtracted fluorescence
values for pH188 and pH194 in the yH555 background, which are present on
each plate.
datM %>%
filter(plasmid %in% c("188", "194"), host == "PHO2") %>%
mutate(
date = gsub("/22$", "", date),
plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4"))
) %>%
pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>%
ggplot(aes(x = date, y = intensity, group = plasmid)) +
geom_point(aes(color = plasmid), alpha = 0.7, size = 1,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) +
scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
position = position_dodge(0.7)) +
facet_grid(parameter~., scale = "free_y") +
expand_limits(y = 0) +
theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +

#theme(plot.subtitle = element_markdown())
CgPho4 and ScPho4 values track each other – on days (runs) where one
is high, the other one also tends to be high. On the 05/31 run, ScPho4
is significantly higher than CgPho4.
datM %>%
filter(plasmid %in% c("188", "194"), host == "PHO2") %>%
mutate(
date = gsub("/22$", "", date),
plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4"))
) %>%
pivot_longer(nGFP:nRFP, names_to = "parameter", values_to = "intensity") %>%
ggplot(aes(x = date, y = intensity, group = plasmid)) +
geom_point(aes(color = plasmid), alpha = 0.7, size = 1,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) +
scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
position = position_dodge(0.7)) +
facet_grid(parameter~., scale = "free_y") +
expand_limits(y = 0) +
theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +

#theme(plot.subtitle = element_markdown())
The size normalized values show lower variance but a similar
correlated pattern. Here, however, ScPho4 overtakes CgPho4 on the 03/22
and 05/17 runs.
How about the RFP/GFP ratios?
datM %>%
filter(plasmid %in% c("188", "194"), host == "PHO2") %>%
mutate(
date = gsub("/22$", "", date),
plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4")),
`YL2 / BL1` = YL2.H / BL1.H,
`nRFP / nGFP` = nRFP / nGFP
) %>%
pivot_longer(`YL2 / BL1`:`nRFP / nGFP`, names_to = "parameter", values_to = "ratio") %>%
ggplot(aes(x = date, y = ratio, group = plasmid)) +
geom_point(aes(color = plasmid), alpha = 0.6, size = 1,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) +
stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
position = position_dodge(0.7)) +
scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
facet_grid(parameter~.) + expand_limits(y = c(0,30)) +
theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +

#theme(plot.subtitle = element_markdown())
5. Statistical tests for confounding factors
Testing for run (date) as well as well (block) position effects on
fluorescence values. Using the nest-map-unnest
workflow
Note that in order to maintain a balanced design (same number of
replicates in each group), we will restrict the analysis to batches 1-4
in the May/Jun dataset. The March dataset had a different plate design,
while batch 5 of the May/Jun dataset is incomplete w.r.t. the controls
(because only half of the plate was used).
tmp <- filter(dat1, plasmid == "188", host == "PHO2", date != "06/02/22") %>%
separate(well, into = c("row", "col"), sep = 1, remove = FALSE) %>%
select(date, block = well, row, col, BL1.H:nRFP) %>%
mutate(RvG = YL2.H / BL1.H, nRvG = nRFP / nGFP)
tmp %>%
pivot_longer(cols = BL1.H:nRvG, names_to = "parameter", values_to = "value") %>%
nest(data = c(-parameter)) %>%
mutate(
fit = map(data, ~anova(lm(value ~ date + row + col, data = .x))),
tidied = map(fit, broom::tidy)
) %>%
unnest(tidied) %>%
mutate(
significant = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ " "
)) %>%
filter(term != "Residuals") %>%
select(parameter, term, df, statistic, p.value, significant)
There is a highly significant run (date) effect for all variables.
The R/G and nR/G ratios show significant column effects. I also tried
using block (12 blocks on the plate) and most of the block effects are
significant.
Use dummy encoding in linear regression to view the effects of run
and well position.
tmp %>%
mutate(row = factor(row, levels = c("G", "E", "C", "A"))) %>%
pivot_longer(cols = BL1.H:nRvG, names_to = "parameter", values_to = "value") %>%
nest(data = c(-parameter)) %>%
mutate(
fit = map(data, ~lm(value ~ date + row + col, dat = .x)),
tidied = map(fit, broom::tidy)
) %>%
unnest(tidied) %>%
mutate(
term = gsub("/22$", "", term),
estimate = round(estimate, digits = 2),
significant = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ " "
),
eff = paste(estimate, significant, sep = " ")
) %>%
select(parameter, term, eff) %>%
pivot_wider(names_from = parameter, values_from = eff)# %>%
#mutate(across(e_BL1:e_nRFP, ~ round(.x, digits = 0)))
- There is a significant run (date) effect for all variables. RvG
ratio appears to be better than nRvG in terms of being less affected by
either run or well positions.
- Row A has a higher fluorescence values overall. But the RvG ratios
are not significantly higher in row A.
- Column 9 has a significantly higher RvG, because, for some reason,
the RFP is higher but not the GFP.
With the above observations, it is yet unclear to me (1) whether it
is necessary to account for the run and position effects (if the effect
sizes we are interested in are much larger than these, we may not care)
and (2) if necessary, what’s the best way to account for it. One idea I
have is to use the common strains on each plate, e.g., the host strains
(no Pho4) or CgPho4 and ScPho4 with and without Pho2, as normalizing
factors. One can think of several ways of normalization, e.g., setting
ScPho4 w/ Pho2’s RvG as 100%. Will explore these ideas below.
Main
1. Mutation effects
Here we examine the effects of the mutations on the chimera activity.
Emily remade 26 chimeric constructs to correct the non-synonymous
mutations in them and reran those constructs in the May/Jun batch. While
doing that, she encountered some issues with certain samples not growing
well or the flow data showing multiple populations. She repeated some of
those strains. As a result, we have more than two measurements for some
of the strains. The goal in this analysis is to compare their
measurements both with and without mutations, and also across the
repeats.
Read in the experimental flags for the May/Jun batch, for the
multiple population labels
multipop <- read_tsv("../data/20220614-EO-repeat-experiment-info.tsv", col_types = cols()) %>%
filter(flag == "multi pop") %>% select(-flag) %>%
separate(sample, into = c("plasmid", "host"), sep = "-") %>%
mutate(host = ordered(host, levels = c("555", "373"), labels = c("PHO2", "pho2∆")), multi_pop = TRUE)
Create a flag variable
datN <- datM %>%
mutate(multi_pop = FALSE) %>%
rows_update(multipop, by = c("date", "plasmid", "host")) %>%
mutate(
RvG = ifelse(BL1.H > 0, YL2.H/BL1.H, NA),
#RvG = num(RvG, digits = 2),
flag = factor(mutation + multi_pop*2,
levels = as.character(0:3),
labels = c("pass", "mutation", "multi_pop", "mutation;multi_pop"))
) %>% relocate(RvG, .after = nRFP)
Emily has included two old samples containing mutations (pH209, 210)
along with the remade, mutation-free plasmids, in the new batch. Let’s
see how they compare with each other and with the March batch (with
mutations).
tmp <- filter(datN, plasmid %in% c("209", "210")) %>%
mutate(
month = str_sub(date, 1, 2),
sample = ifelse(month == "03", "03-old", ifelse(mutation, "05-old", "05-new")),
sample = factor(sample, levels = c("03-old", "05-old", "05-new"))
)
First compare the same plasmids (with mutations) measured on
different days
tmp %>%
select(-FSC.H) %>%
pivot_longer(BL1.H:RvG, names_to = "parameter", values_to = "value") %>%
mutate(parameter = factor(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP", "RvG"))) %>%
ggplot(aes(x = plasmid, y = value, group = sample)) +
geom_point(aes(color = sample), size = 1, position = position_dodge(0.7)) +
#stat_summary(aes(color = sample), fun.data = "mean_se", geom = "crossbar", width = 0.1,
# position = position_dodge(0.7)) +
scale_color_manual(values = c("03-old" = "gray50", "05-old" = "gray30", "05-new" = "red2")) +
scale_y_continuous(n.breaks = 3) + expand_limits(y = 0) +
facet_grid(parameter~host, scales = "free_y") +
theme_bw(base_size = 14) + background_grid(major = "y")

tmp %>%
select(-FSC.H) %>%
pivot_longer(BL1.H:RvG, names_to = "parameter", values_to = "value") %>%
mutate(parameter = factor(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP", "RvG"))) %>%
nest(data = c(-parameter)) %>%
mutate(
fit = map(data, ~anova(lm(value ~ sample + host, data = .x))),
tidied = map(fit, broom::tidy)
) %>%
unnest(tidied) %>%
mutate(
significant = case_when(
p.value < 0.001 ~ "***",
p.value < 0.01 ~ "**",
p.value < 0.05 ~ "*",
p.value < 0.1 ~ ".",
TRUE ~ " "
)) %>%
filter(term != "Residuals") %>%
select(parameter, term, df, statistic, p.value, significant)
For the RvG ratio, there is no significant difference between the
03-old, 05-old and 05-new. In other words, we don’t have evidence that
either the different runs or the correction of mutations impacted the
behavior of the chimera in the case of pH209 and pH210.
We will now move on to the rest of the plasmids that have been
remade.
pp <- theme(axis.title.x = element_blank(), legend.position = "none")
myPlotMutEffect <- function(l){
even_numbers <- seq(2, length(l), 2) # used to draw stripes on even numbered x-values
df_tile <- tibble(plasmid = rep(sort(l)[even_numbers], times = 2),
host = factor(rep(c("PHO2", "pho2∆"), each = length(even_numbers))))
datN %>%
#mutate(date = gsub("/22$", "", date)) %>%
filter(!low_event_count, plasmid %in% l) %>%
select(date:host, RvG, flag) %>%
ggplot(aes(x = plasmid, y = RvG)) +
# draw stripes on even columns,
# from: https://stackoverflow.com/questions/56961744/draw-alternate-rectangles-in-boxplots-with-facets-r-ggplot2'
geom_tile(aes(x = plasmid, y = 1), height = Inf, width = 1, data = df_tile, alpha = 0.3) +
#ymin = -Inf, ymax = Inf, fill = "grey80", color = NA, alpha = 0.5) +
geom_point(aes(color = flag, group = date), size = 1.5, shape = 19,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) +
#stat_summary(aes(color = sample), fun.data = "mean_se", geom = "crossbar", width = 0.1,
# position = position_dodge(0.7)) +
scale_color_manual(values = c("pass" = "forestgreen", "mutation" = "purple", "multi_pop" = "steelblue3")) +
scale_y_continuous(n.breaks = 4) + expand_limits(y = 0) +
facet_grid(host~., scales = "free_y") +
theme_cowplot() + panel_border(color = "gray30") +
theme(strip.text = element_text(face = 3))
}
lst <- split(list.mutations, f = ceiling(seq_along(list.mutations)/9))
p <- lapply(lst, myPlotMutEffect)
legend_p <- get_legend(p[[1]] + theme(legend.position = "bottom", legend.justification = "center"))
plot_grid(p[[1]] + pp, p[[2]] + pp, p[[3]] + theme(legend.position = "none"), legend_p, ncol = 1, rel_heights = c(1,1,1,.15))

Samples showing multiple populations mostly agree with the latter
measurements where the issue was ressolved, with the exception of one of
the samples of 231 in PHO2 background.
Here are the ones that show obvious differences between the old and
corrected constructs:
ll <- c("215", "218", "227", "233", "241", "252", "257")
myPlotMutEffect(ll) + scale_y_log10()
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
Warning: Transformation introduced infinite values in continuous y-axis

2. Summarize results per chimera
We will remove all the measurements made of chimeras with mutations.
For the measurements made of remade, corrected chimeras, we will remove
one sample for pH231 from the 05/19 batch (well C2, the
other 2 replicates of the same strain in that batch were removed due to
the presence of multiple populations and low event count in the gated
population), as well as one sample for pH233 from the 06/02
batch (well D4, which showed overexpression of the
chimera).
Number of replicates measured for each chimera x host
combination:
# load genotype information
meta <- read_tsv("../data/20220621-chimera-Pho4-makeup.tsv", col_types = "ccccc") %>%
rename(genotype = full) %>% select(-plasmid_wrong)
datN %>%
filter(!low_event_count, !mutation) %>%
count(plasmid, host) %>%
pivot_wider(names_from = host, values_from = n) %>%
full_join(meta, by = "plasmid") %>%
select(-letter, -genotype)
pH211 was measured 9 times in the PHO2 background. The
results were consistent.
For each chimera, we would like to calculate three
values:
- RvG in pho2∆: this is its base activity without Pho2
- RvG in PHO2: this is its full activity with Pho2
- RvG_PHO2 / RvG_pho2∆: this is the Pho2 enhancement of activity
The first two values are further normalized against the corresponding
ScPho4 measurements on the same plate.
# filter data
tmp <- filter(datN, !low_event_count, !mutation, !is.na(plasmid)) %>% select(date:host, BL1.H:RvG)
# calculate the ScPho4 values per plate
mScPho4 <- tmp %>% filter(plasmid == "194") %>% group_by(date, host) %>% summarize(mean = mean(RvG), .groups = "drop") %>%
pivot_wider(names_from = host, values_from = mean) %>% column_to_rownames(var = "date")
# normalize the values for non-ScPho4 with those of ScPho4 per plate
datO <- tmp %>% mutate(sRvG = RvG / mScPho4[cbind(date, as.character(host))])
# summarize
datsum <- datO %>%
group_by(plasmid, host) %>%
summarize(across(RvG:sRvG, mean), .groups = "drop") %>%
pivot_wider(names_from = host, values_from = RvG:sRvG) %>%
rename(`A_PHO2` = `RvG_PHO2`, `A_pho2` = `RvG_pho2∆`, full = sRvG_PHO2, base = `sRvG_pho2∆`) %>%
mutate(boost = `A_PHO2` / `A_pho2`, across(`A_PHO2`:boost, ~ round(.x, digits = 3))) %>%
full_join(meta, by = "plasmid")
Export the summary data
write_tsv(datsum, file = "../output/20220619-chimera-summary-result.tsv")
Here are the chimera that have little full activity (with Pho2):
| 229 |
1.154 |
1.288 |
0.045 |
0.910 |
0.896 |
SSSCS |
Sc(1-176) Cg(283-458) Sc(243-312) |
| 250 |
1.695 |
1.873 |
0.071 |
1.632 |
0.905 |
CSSCS |
Cg(1-44) Sc(43-176) Cg(283-458) Sc(243-312) |
| 224 |
2.750 |
1.473 |
0.108 |
1.040 |
1.868 |
SCSCS |
Sc(1-42) Cg(45-112) Sc(100-176) Cg(283-458)
Sc(243-312) |
| 235 |
2.169 |
2.188 |
0.109 |
2.211 |
0.991 |
SSCCS |
Sc(1-99) Cg(113-458) Sc(243-312) |
| 220 |
2.979 |
2.839 |
0.116 |
2.005 |
1.049 |
SCSCC |
Sc(1-42) Cg(45-112) Sc(100-176) Cg(283-533) |
| 240 |
3.093 |
2.582 |
0.125 |
2.093 |
1.198 |
CCSCS |
Cg(1-112) Sc(100-176) Cg(283-458) Sc(243-312) |
| 216 |
3.774 |
3.704 |
0.139 |
2.519 |
1.019 |
SCCCS |
Sc(1-42) Cg(45-458) Sc(243-312) |
| 227 |
3.722 |
2.763 |
0.155 |
2.409 |
1.347 |
SSSCC |
Sc(1-176) Cg(283-533) |
| 253 |
3.287 |
3.396 |
0.175 |
3.309 |
0.968 |
CSCCS |
Cg(1-44) Sc(43-99) Cg(113-458) Sc(243-312) |
Here are the ones with the highest base activity (without Pho2):
| 188 |
20.771 |
17.674 |
0.899 |
14.709 |
1.175 |
CCCCC |
Cg(1-533) |
| 257 |
40.497 |
20.073 |
1.583 |
14.174 |
2.017 |
CCCscC |
Cg (1-282) Sc(177-204) Cg(328-533) |
| 213 |
13.974 |
11.972 |
0.700 |
12.097 |
1.167 |
CSCCC |
Cg(1-44) Sc(43-99) Cg(113-533) |
| 212 |
17.570 |
13.155 |
0.646 |
8.945 |
1.336 |
SCCCC |
Sc(1-42) Cg(45-533) |
| 209 |
10.920 |
8.746 |
0.456 |
7.624 |
1.249 |
CCSCC |
Cg(1-112) Sc(100-176) Cg(283-533) |
| 255 |
18.337 |
7.505 |
0.765 |
6.541 |
2.443 |
CSSscC |
Cg(1-44) Sc(43-204) Cg(328-533) |
| 219 |
47.187 |
9.115 |
1.734 |
6.198 |
5.177 |
CCCSS |
Cg(1-282) Sc(177-312) |
| 215 |
8.904 |
7.686 |
0.348 |
5.427 |
1.159 |
SSCCC |
Sc(1-99) Cg(113-533) |
| 210 |
6.359 |
5.536 |
0.265 |
4.825 |
1.149 |
CCCCS |
Cg(1-458) Sc(243-312) |
| 266 |
6.546 |
5.438 |
0.273 |
4.740 |
1.204 |
SSscCC |
Sc(1-153) Cg(250-533) |
3. Plotting functions
The goal here is to develop a set of plotting functions to visualize
the results.
For testing purposes, we will select a set of chimera along with the
endogenous ScPho4 and CgPho4.
# extract ximera names
refs <- c("CCCCC","SSSSS")
ximeras <- setdiff(meta$symbol, refs)
# make a test set
test <- c(refs, filter(meta, plasmid %in% c("211", "229", "250", "224", "257", "219", "266")) %>% pull(symbol))
# subset data
datT <- meta %>%
filter(symbol %in% test) %>%
inner_join(datO, by = "plasmid") %>%
mutate(symbol = factor(symbol, levels = test))
Design a set of color palettes:
# for two groups, e.g., with vs w/o Pho2
cols.two <- c("PHO2" = "#457dbc", "pho2∆" = "#ea9e25")
Plot individual components and the normalized activity (RvG
ratio)
datT %>%
pivot_longer(c(nGFP, nRFP, RvG), names_to = "parameter", values_to = "intensity") %>%
mutate(parameter = ordered(parameter, levels = c("nGFP", "nRFP", "RvG"),
labels = c("Pho4-GFP", "_PHO5pr_ => RFP", "Activity = RFP/GFP"))) %>%
#pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>%
#mutate(parameter = ordered(parameter, levels = c("YL2.H", "BL1.H"))) %>%
ggplot(aes(x = symbol, y = intensity, group = host)) +
geom_bar(aes(fill = host), width = 0.5,# alpha = 0.8,
stat = "summary", fun = "mean", position = position_dodge(0.5)) +
geom_point(aes(color = host), size = 1, alpha = 0.9, shape = 3,
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5)) +
scale_color_manual("co-TF", values = c("PHO2" = "gray30", "pho2∆" = "gray40")) +
scale_fill_manual("co-TF", values = cols.two) +
#stat_summary(fun = "mean", geom = "crossbar", color = "red", width = 0.25,
# position = position_dodge(0.75), ) +
facet_wrap(~parameter, scale = "free_y", ncol = 1) +
xlab("Pho4 chimera") + expand_limits(y = 0) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
strip.text = element_markdown())

Plot the RvG values normalized against the corresponding ScPho4
construct on the same plate, to obtain a relative measure.
# useful to group the plasmids
ximera.grp <- datsum %>%
mutate(group = case_when(
full < 0.2 ~ "defective",
base > 5 ~ "less Pho2-dep.",
TRUE ~ "others"
)) %>% select(plasmid, symbol, group)
p0 <- datT %>%
left_join(ximera.grp, by = c("symbol", "plasmid")) %>%
ggplot(aes(x = symbol, y = sRvG, group = host)) +
geom_bar(aes(fill = host), width = 0.5,# alpha = 0.8,
stat = "summary", fun = "mean", position = position_dodge(0.5)) +
geom_point(aes(color = host), size = 1, alpha = 0.9, shape = 3,
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5)) +
scale_color_manual(values = c("PHO2" = "gray30", "pho2∆" = "gray40"), guide = "none") +
scale_fill_manual(values = cols.two, guide = "none") +
#stat_summary(fun = "mean", geom = "crossbar", color = "red", width = 0.25,
# position = position_dodge(0.75), ) +
facet_grid(host~group, scale = "free") +
xlab("Pho4 chimera") + ylab("A<sub>chimera</sub> / A<sub>ScPho4</sub>") +
theme_bw(base_size = 14) + background_grid(minor = "none") +
theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
strip.text = element_markdown(), axis.title.y = element_markdown())
p1 <- datsum %>%
filter(symbol %in% test) %>%
left_join(ximera.grp, by = c("plasmid", "symbol")) %>%
ggplot(aes(x = symbol, y = boost)) +
geom_col(width = 0.3, color = "black", fill = "gray80") +
geom_hline(yintercept = 1, linetype = 2, color = "gray30") +
facet_wrap(~group, scales = "free_x") +
scale_y_log10() +
xlab("Pho4 chimera") + ylab("A<sub>PHO2</sub> / A<sub>pho2∆</sub>") +
theme_bw(base_size = 14) + background_grid(minor = "none") +
theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
strip.text = element_blank(), axis.title.y = element_markdown())
set_null_device("png") # from https://github.com/wilkelab/cowplot/issues/174
plot_grid(p0 + theme(axis.title.x = element_blank()),
p1, ncol = 1, rel_heights = c(3,2))

save the data objects for the interactive data plotter
save(datO, datsum, meta, file = "../shinyapp/20220620-data-for-interactive-plotting.RData")
Design the plot
tmpsum %>%
mutate(Activity = ifelse(`R/G_PHO2` < 2*low.act.th["R/G_pho2∆"], "low", "pass")) %>%
ggplot(aes(x = symbol, y = `pho2∆/PHO2`)) +
geom_col(aes(group = date, fill = `R/G_PHO2`), width = 0.75, color = "gray50",
position = position_dodge(0.9)) +
#geom_point(aes(color = host), position = position_jitterdodge(dodge.width = 0.5)) +
scale_fill_gradient2("Activity") +
#scale_color_manual(values = c(alpha("black",0), "red3")) +
#stat_summary(fun = "mean", color = "red", geom = "crossbar", width = 0.2,
# position = position_dodge(0.75), ) +
facet_grid(.~Activity, scales = "free_x", space = "free_x", labeller = "label_both") +
theme_bw(base_size = 14) + xlab("Pho4 chimera") +
theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"))
X-Y plot
p3 <- tmpsum %>%
mutate(`nR/G_PHO2` = signif(`nR/G_PHO2`, digits = 2),
`nR/G_pho2∆` = signif(`nR/G_pho2∆`, digits = 2)) %>%
ggplot(aes(x = `nR/G_PHO2`, y = `nR/G_pho2∆`, label = symbol)) +
geom_point(size = 2.5) + geom_abline(slope = 1) +
theme_gray(base_size = 14)
ggplotly(p3, tooltip = c("label", "x", "y"))
---
title: "E013 Pho4 chimera activity analysis, new host, constructs with mutations replaced"
author: Bin He
date: "2022-03-30 (updated `r Sys.Date()`)"
output: 
  html_notebook:
    toc: true
    toc_float: true
    code_folding: hide
---

```{r message=FALSE}
suppressPackageStartupMessages(require(plotly))
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(cowplot))
suppressPackageStartupMessages(require(ggtext))
```

# Background
Emily found out that a number of constructs included in the last batch of experiments had nonsynonymous mutations. To ensure that the results were not due to mutations, she remade those constructs using higher fidelity enzymes.

# Goal
- Quality control for the new batch of repeat experiments. Determine if there is any well position effect and how well the values agree across multiple days of experiments.
- Once QC is done, merge the new, repeat dataset with the old one, replacing the results for the old constructs with mutations.
- Implement the idea of separating the basal activity and Pho2-boost traits.

# Data
Read the previous merged data
```{r}
datPre <- read_tsv("../output/20220614-mar22-batch-bg-sub-data.tsv", col_types = "cccfdddddll") %>% 
  mutate(host = ordered(host))

list.mutations <- as.character(c(215, 209, 211, 210, 217, 218, 220, 221, 223, 224, 222, 227, 230, 229, 231, 233, 240, 241, 250:255, 257, 258))

exptPre <- datPre %>% 
  group_by(date, plasmid, host) %>% 
  summarize(n = n(), n_filter = sum(!low_event_count), .groups = "drop") %>% 
  mutate(flag = ifelse(plasmid %in% list.mutations, "mutation", ""))
```

Read the new batches of data
```{r}
raw <- list(
  "05/12/22" = read_tsv("../data/20220512-EO-repeat-batch-1/20220512-gated-median-out.txt", show_col_types = FALSE),
  "05/17/22" = read_tsv("../data/20220517-EO-repeat-batch-2/20220517-gated-median-out.txt", show_col_types = FALSE),
  "05/19/22" = read_tsv("../data/20220519-EO-repeat-batch-3/20220519-gated-median-out.txt", show_col_types = FALSE),
  "05/31/22" = read_tsv("../data/20220531-EO-repeat-batch-4/20220531-gated-median-out.txt", show_col_types = FALSE),
  "06/02/22" = read_tsv("../data/20220602-EO-repeat-batch-5/20220602-gated-median-out.txt", show_col_types = FALSE)
)
dat <- bind_rows(raw, .id = "date") %>% 
  #separate(col = well, sep = 1, into = c("row", "col")) %>% 
  separate(col = sample, sep = "-", into = c("plasmid", "host", "rep")) %>% 
  mutate(plasmid = na_if(plasmid, "NA"))
```


# QC
## 1. Number of events per sample
```{r}
dat %>% ggplot(aes(x = n_induction/1000)) + geom_histogram(bins = 30, fill = "forestgreen") + 
  xlab("# events x 1000") +
  facet_grid(host~date, scales = "free_y") + theme_bw(base_size = 14) #+ panel_border()
```

> Majority of the experimental wells (not blank) have > 10000 events, with the exception of a few samples

```{r}
dat %>% filter(host != "blank", n_induction < 10000)
```
> Nearly all samples with a low event count were in the 555 background. Emily repeated many of them.
> Will remove all samples with lower than 7,000 events in the final gated population from further analysis.

Flag samples that have a low event count, contain mutations or show multiple populations.
```{r}
event.th <- 7000

dat <- dat %>% 
  mutate(
    low_event_count = n_induction < event.th,
    # Emily labeled the old plasmids with mutations with an "o" in the flow sample name column
    mutation = ifelse(grepl("o$", name), TRUE, FALSE)
  ) %>% 
  filter(host != "blank")
```

## 2. Background subtraction

Check the background fluorescence levels across different days.
```{r}
dat %>% filter(host == "156") %>% 
  mutate(date = gsub("/22", "", date)) %>% 
  pivot_longer(BL1.H:nRFP, names_to = "parameter", values_to = "intensity") %>% 
  mutate(parameter = ordered(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP"))) %>% 
  ggplot(aes(x = date, y = intensity)) + geom_point(aes(shape = well)) +
  stat_summary(fun.data = "mean_se", geom = "pointrange", color = "red", position = position_nudge(x = 0.1)) +
  facet_wrap(~parameter, scale = "free_y") +
  theme_bw(base_size = 14)
```

> 06/02 run had smaller FSC values (~ cell volume) compared with the other days, while the background fluorescence doesn't appear to be a lot higher, resulting in higher normalized values.

Subtract the background
```{r}
bg <- dat %>% 
  filter(host == "156") %>% 
  group_by(date) %>% 
  summarize(across(FSC.H:nRFP, ~ round(mean(.x),1))) %>% 
  column_to_rownames(var = "date")

dat1 <- dat %>% 
  filter(host != "blank") %>% 
  select(date:host, FSC.H:nRFP, low_event_count:mutation) %>%
  mutate(
    BL1.H = BL1.H - bg[date, "BL1.H"],
    YL2.H = YL2.H - bg[date, "YL2.H"],
    nGFP = nGFP - bg[date, "nGFP"],
    nRFP = nRFP - bg[date, "nRFP"],
  )
```

Double check that the subtraction worked correctly
```{r}
dat1 %>% filter(host == "156") %>% 
  pivot_longer(BL1.H:nRFP, names_to = "parameter", values_to = "intensity") %>% 
  mutate(
    parameter = ordered(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP")),
    date = gsub("\\/22$", "", date)
  ) %>% 
  ggplot(aes(x = date, y = intensity)) + geom_point(aes(shape = well)) +
  stat_summary(fun.data = "mean_se", geom = "pointrange", color = "red", position = position_nudge(x = 0.1)) +
  facet_wrap(~parameter, scale = "free_y") +
  theme_bw(base_size = 14)

# remove the yH156 samples
dat1 <- dat1 %>% 
  filter(host != "156") %>%
  mutate(host = ordered(host, levels = c("555", "373"), labels = c("PHO2", "pho2∆")))
```


## 3. Well position

**_Rationale_**

In my plate design, I placed a positive control strain (CgPho4-mNeon in _PHO2_ background) in the control columns (1, 5, 9) on every other row (A, C, E, G). The rationale for this is to use it to identify any well position effect on the fluorescence readings. In this round of repeat experiments, Emily followed this design and put pH188-yH323 in these wells. Now we can properly assess the well position effect.

For this analysis, we will combine and compare the May/Jun batch with the Mar batch
```{r}
datM <- bind_rows(datPre, dat1)
```

```{r}
datM %>% 
  filter(plasmid == "188", host == "PHO2") %>% 
  separate(well, into = c("row", "col"), sep = 1) %>% 
  pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>% 
  ggplot(aes(x = col, y = intensity, group = date)) + 
  geom_point(aes(color = date), size = 1) + 
  #geom_line(aes(color = date), size = 0.4) +
  stat_summary(aes(color = date), geom = "line", fun = mean) +
  scale_color_brewer(type = "qual", palette = 2) +
  xlab("Column") +
  facet_grid(parameter~row, scales = "free_y") +
  theme_bw(base_size = 14)
```

> - no obvious trend between the rows or columns
> - a clear correlation between YL2.H and BL1.H.

Same plot for normalized fluorescence
```{r}
datM %>% 
  filter(plasmid == "188", host == "PHO2") %>% 
  separate(well, into = c("row", "col"), sep = 1) %>% 
  pivot_longer(c(nGFP, nRFP), names_to = "parameter", values_to = "intensity") %>% 
  ggplot(aes(x = col, y = intensity, group = date)) + 
  geom_point(aes(color = date), size = 1) + 
  #geom_line(aes(color = date), size = 0.4) +
  stat_summary(aes(color = date), geom = "line", fun = mean) +
  scale_color_brewer(type = "qual", palette = 2) +
  xlab("Column") +
  facet_grid(parameter~row, scales = "free_y") +
  theme_bw(base_size = 14)
```

> - What we care about is the ratio between Pho4-mNeon and _PHO5pr_-mCherry

```{r}
datM %>% 
  filter(plasmid == "188", host == "PHO2") %>% 
  separate(well, into = c("row", "col"), sep = 1) %>% 
  ggplot(aes(x = BL1.H, y = YL2.H)) + 
  geom_point(aes(color = date, shape = row), size = 3) + 
  stat_smooth(method = "lm", se = FALSE, color = "gray50", size = 0.5) +
  scale_color_brewer(type = "qual", palette = 2) +
  #scale_color_viridis_d() +
  #scale_shape_manual(values = c(15:17, 23, 25)) +
  theme_bw(base_size = 14)
```

> - There is variation in BL1.H and correspondingly in YL2.H, but the ratio between the two are very consistent

## 4. Run effect

How does the same control strain behave in different runs (represented as date variable here)? Is there a strong run effect?

Let's check both ScPho4 (pH194) and CgPho4 (pH188) to see if their behaviors are consistent across days and between this and the last (03/30) batch.
```{r}
datM %>% 
  filter(plasmid %in% c("188", "194")) %>% 
  mutate(
    date = gsub("/22$", "", date),
    `YL2.H / BL1.H` = YL2.H / BL1.H,
    Pho4 = factor(plasmid, levels = c("194", "188"), labels = c("ScPho4", "CgPho4"))
  ) %>% 
  separate(well, into = c("row", "col"), sep = 1) %>% 
  ggplot(aes(x = date, y = `YL2.H / BL1.H`, group = host)) + 
  geom_point(aes(color = host), position = position_jitter(0.1), alpha = 0.8, size = 1) + 
  scale_color_viridis_d(begin = 0.2, end = 0.6) +
  stat_summary(fun.data = "mean_se", geom = "pointrange", shape = 16, color = "red") +
  ylim(0, 30) + facet_grid(Pho4~.) + theme_bw(base_size = 14)
```
> while the characteristic behaviors of ScPho4 and CgPho4 are consistent across runs, there are also clear variability in the RFP/GFP ratios between plates for the same genotype.

Next we examine the underlying background-subtracted fluorescence values for pH188 and pH194 in the yH555 background, which are present on each plate.

```{r}
datM %>% 
  filter(plasmid %in% c("188", "194"), host == "PHO2") %>% 
  mutate(
    date = gsub("/22$", "", date),
    plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4"))
  ) %>% 
  pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>% 
  ggplot(aes(x = date, y = intensity, group = plasmid)) + 
  geom_point(aes(color = plasmid), alpha = 0.7, size = 1,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) + 
  scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
  stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
               position = position_dodge(0.7)) +
  facet_grid(parameter~., scale = "free_y") +
  expand_limits(y = 0) +
  theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +
  #theme(plot.subtitle = element_markdown())
```

> CgPho4 and ScPho4 values track each other -- on days (runs) where one is high, the other one also tends to be high. On the 05/31 run, ScPho4 is significantly higher than CgPho4.

```{r}
datM %>% 
  filter(plasmid %in% c("188", "194"), host == "PHO2") %>% 
  mutate(
    date = gsub("/22$", "", date),
    plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4"))
  ) %>% 
  pivot_longer(nGFP:nRFP, names_to = "parameter", values_to = "intensity") %>% 
  ggplot(aes(x = date, y = intensity, group = plasmid)) + 
  geom_point(aes(color = plasmid), alpha = 0.7, size = 1,
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) + 
  scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
  stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
               position = position_dodge(0.7)) +
  facet_grid(parameter~., scale = "free_y") +
  expand_limits(y = 0) +
  theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +
  #theme(plot.subtitle = element_markdown())
```
> The size normalized values show lower variance but a similar correlated pattern. Here, however, ScPho4 overtakes CgPho4 on the 03/22 and 05/17 runs.

How about the RFP/GFP ratios?
```{r}
datM %>% 
  filter(plasmid %in% c("188", "194"), host == "PHO2") %>% 
  mutate(
    date = gsub("/22$", "", date),
    plasmid = factor(plasmid, levels = c("188", "194"), labels = c("CgPho4", "ScPho4")),
    `YL2 / BL1` = YL2.H / BL1.H,
    `nRFP / nGFP` = nRFP / nGFP
  ) %>% 
  pivot_longer(`YL2 / BL1`:`nRFP / nGFP`, names_to = "parameter", values_to = "ratio") %>% 
  ggplot(aes(x = date, y = ratio, group = plasmid)) + 
  geom_point(aes(color = plasmid), alpha = 0.6, size = 1, 
             position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) + 
  stat_summary(aes(color = plasmid), fun.data = "mean_se", geom = "crossbar", width = 0.4,
               position = position_dodge(0.7)) +
  scale_color_manual(values = c("ScPho4" = "blue3", "CgPho4" = "forestgreen")) +
  facet_grid(parameter~.) + expand_limits(y = c(0,30)) +
  theme_bw(base_size = 14) + labs(subtitle = "in PHO2 background")# +
  #theme(plot.subtitle = element_markdown())
```

 
## 5. Statistical tests for confounding factors
Testing for run (date) as well as well (block) position effects on fluorescence values. Using the [nest-map-unnest workflow](https://www.tidymodels.org/learn/statistics/tidy-analysis/)

Note that in order to maintain a balanced design (same number of replicates in each group), we will restrict the analysis to batches 1-4 in the May/Jun dataset. The March dataset had a different plate design, while batch 5 of the May/Jun dataset is incomplete w.r.t. the controls (because only half of the plate was used).

```{r}
tmp <- filter(dat1, plasmid == "188", host == "PHO2", date != "06/02/22") %>%   
  separate(well, into = c("row", "col"), sep = 1, remove = FALSE) %>% 
  select(date, block = well, row, col, BL1.H:nRFP) %>% 
  mutate(RvG = YL2.H / BL1.H, nRvG = nRFP / nGFP)
```

```{r}
tmp %>% 
  pivot_longer(cols = BL1.H:nRvG, names_to = "parameter", values_to = "value") %>% 
  nest(data = c(-parameter)) %>% 
  mutate(
    fit = map(data, ~anova(lm(value ~ date + row + col, data = .x))),
    tidied = map(fit, broom::tidy)
  ) %>% 
  unnest(tidied) %>% 
  mutate(
    significant = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ ".",
      TRUE ~ " "
    )) %>% 
  filter(term != "Residuals") %>% 
  select(parameter, term, df, statistic, p.value, significant)
```
> There is a highly significant run (date) effect for all variables. The R/G and nR/G ratios show significant column effects.
> I also tried using block (12 blocks on the plate) and most of the block effects are significant.

Use dummy encoding in linear regression to view the effects of run and well position.
```{r}
tmp %>% 
  mutate(row = factor(row, levels = c("G", "E", "C", "A"))) %>% 
  pivot_longer(cols = BL1.H:nRvG, names_to = "parameter", values_to = "value") %>% 
  nest(data = c(-parameter)) %>% 
  mutate(
    fit = map(data, ~lm(value ~ date + row + col, dat = .x)),
    tidied = map(fit, broom::tidy)
  ) %>% 
  unnest(tidied) %>% 
  mutate(
    term = gsub("/22$", "", term),
    estimate = round(estimate, digits = 2),
    significant = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ ".",
      TRUE ~ " "
    ),
    eff = paste(estimate, significant, sep = "  ")
  ) %>% 
  select(parameter, term, eff) %>% 
  pivot_wider(names_from = parameter, values_from = eff)# %>% 
  #mutate(across(e_BL1:e_nRFP, ~ round(.x, digits = 0)))
```
> - There is a significant run (date) effect for all variables. RvG ratio appears to be better than nRvG in terms of being less affected by either run or well positions.
> - Row A has a higher fluorescence values overall. But the RvG ratios are not significantly higher in row A.
> - Column 9 has a significantly higher RvG, because, for some reason, the RFP is higher but not the GFP.

With the above observations, it is yet unclear to me (1) whether it is necessary to account for the run and position effects (if the effect sizes we are interested in are much larger than these, we may not care) and (2) if necessary, what's the best way to account for it. One idea I have is to use the common strains on each plate, e.g., the host strains (no Pho4) or CgPho4 and ScPho4 with and without Pho2, as normalizing factors. One can think of several ways of normalization, e.g., setting ScPho4 w/ Pho2's RvG as 100%. Will explore these ideas below.

# Main
## 1. Mutation effects
Here we examine the effects of the mutations on the chimera activity. Emily remade `r length(list.mutations)` chimeric constructs to correct the non-synonymous mutations in them and reran those constructs in the May/Jun batch. While doing that, she encountered some issues with certain samples not growing well or the flow data showing multiple populations. She repeated some of those strains. As a result, we have more than two measurements for some of the strains. The goal in this analysis is to compare their measurements both with and without mutations, and also across the repeats.

Read in the experimental flags for the May/Jun batch, for the multiple population labels
```{r}
multipop <- read_tsv("../data/20220614-EO-repeat-experiment-info.tsv", col_types = cols()) %>% 
  filter(flag == "multi pop") %>% select(-flag) %>% 
  separate(sample, into = c("plasmid", "host"), sep = "-") %>% 
  mutate(host = ordered(host, levels = c("555", "373"), labels = c("PHO2", "pho2∆")), multi_pop = TRUE)
```

Create a flag variable
```{r}
datN <- datM %>% 
  mutate(multi_pop = FALSE) %>% 
  rows_update(multipop, by = c("date", "plasmid", "host")) %>% 
  mutate(
    RvG = ifelse(BL1.H > 0, YL2.H/BL1.H, NA),
    #RvG = num(RvG, digits = 2),
    flag = factor(mutation + multi_pop*2,
                  levels = as.character(0:3),
                  labels = c("pass", "mutation", "multi_pop", "mutation;multi_pop"))
  ) %>% relocate(RvG, .after = nRFP)
```

Emily has included two old samples containing mutations (pH209, 210) along with the remade, mutation-free plasmids, in the new batch. Let's see how they compare with each other and with the March batch (with mutations).
```{r}
tmp <- filter(datN, plasmid %in% c("209", "210")) %>% 
  mutate(
    month = str_sub(date, 1, 2),
    sample = ifelse(month == "03", "03-old", ifelse(mutation, "05-old", "05-new")),
    sample = factor(sample, levels = c("03-old", "05-old", "05-new"))
  )
```

First compare the same plasmids (with mutations) measured on different days
```{r fig.width=6, fig.height=6}
tmp %>% 
  select(-FSC.H) %>% 
  pivot_longer(BL1.H:RvG, names_to = "parameter", values_to = "value") %>%
  mutate(parameter = factor(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP", "RvG"))) %>% 
  ggplot(aes(x = plasmid, y = value, group = sample)) +
  geom_point(aes(color = sample), size = 1, position = position_dodge(0.7)) +
  #stat_summary(aes(color = sample), fun.data = "mean_se", geom = "crossbar", width = 0.1,
  #             position = position_dodge(0.7)) +
  scale_color_manual(values = c("03-old" = "gray50", "05-old" = "gray30", "05-new" = "red2")) +
  scale_y_continuous(n.breaks = 3) + expand_limits(y = 0) +
  facet_grid(parameter~host, scales = "free_y") +
  theme_bw(base_size = 14) + background_grid(major = "y")
```

```{r}
tmp %>% 
  select(-FSC.H) %>% 
  pivot_longer(BL1.H:RvG, names_to = "parameter", values_to = "value") %>%
  mutate(parameter = factor(parameter, levels = c("BL1.H", "YL2.H", "nGFP", "nRFP", "RvG"))) %>% 
  nest(data = c(-parameter)) %>% 
  mutate(
    fit = map(data, ~anova(lm(value ~ sample + host, data = .x))),
    tidied = map(fit, broom::tidy)
  ) %>% 
  unnest(tidied) %>% 
  mutate(
    significant = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      p.value < 0.1   ~ ".",
      TRUE ~ " "
    )) %>% 
  filter(term != "Residuals") %>% 
  select(parameter, term, df, statistic, p.value, significant)
```
> For the RvG ratio, there is no significant difference between the 03-old, 05-old and 05-new. In other words, we don't have evidence that either the different runs or the correction of mutations impacted the behavior of the chimera in the case of pH209 and pH210.

We will now move on to the rest of the plasmids that have been remade.
```{r}
pp <- theme(axis.title.x = element_blank(), legend.position = "none")
myPlotMutEffect <- function(l){
  even_numbers <- seq(2, length(l), 2) # used to draw stripes on even numbered x-values
  df_tile <- tibble(plasmid = rep(sort(l)[even_numbers], times = 2),
                    host = factor(rep(c("PHO2", "pho2∆"), each = length(even_numbers))))
  datN %>% 
    #mutate(date = gsub("/22$", "", date)) %>% 
    filter(!low_event_count, plasmid %in% l) %>% 
    select(date:host, RvG, flag) %>%
    ggplot(aes(x = plasmid, y = RvG)) +
    # draw stripes on even columns,
    # from: https://stackoverflow.com/questions/56961744/draw-alternate-rectangles-in-boxplots-with-facets-r-ggplot2'
    geom_tile(aes(x = plasmid, y = 1), height = Inf, width = 1, data = df_tile, alpha = 0.3) +
              #ymin = -Inf, ymax = Inf, fill = "grey80", color = NA, alpha = 0.5) +
    geom_point(aes(color = flag, group = date), size = 1.5, shape = 19,
               position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.7)) +
    #stat_summary(aes(color = sample), fun.data = "mean_se", geom = "crossbar", width = 0.1,
    #             position = position_dodge(0.7)) +
    scale_color_manual(values = c("pass" = "forestgreen", "mutation" = "purple", "multi_pop" = "steelblue3")) +
    scale_y_continuous(n.breaks = 4) + expand_limits(y = 0) +
    facet_grid(host~., scales = "free_y") +
    theme_cowplot() + panel_border(color = "gray30") + 
    theme(strip.text = element_text(face = 3))
}
```

```{r warning=FALSE, fig.width = 5, fig.height = 7}
lst <- split(list.mutations, f = ceiling(seq_along(list.mutations)/9))
p <- lapply(lst, myPlotMutEffect)
legend_p <- get_legend(p[[1]] + theme(legend.position = "bottom", legend.justification = "center"))
plot_grid(p[[1]] + pp, p[[2]] + pp, p[[3]] + theme(legend.position = "none"), legend_p, ncol = 1, rel_heights = c(1,1,1,.15))
```
> Samples showing multiple populations mostly agree with the latter measurements where the issue was ressolved, with the exception of one of the samples of 231 in _PHO2_ background.
> 

Here are the ones that show obvious differences between the old and corrected constructs:
```{r}
ll <- c("215", "218", "227", "233", "241", "252", "257")
myPlotMutEffect(ll) + scale_y_log10()
```

## 2. Summarize results per chimera
We will remove all the measurements made of chimeras with mutations. For the measurements made of remade, corrected chimeras, we will remove **one sample for pH231 from the 05/19 batch** (well C2, the other 2 replicates of the same strain in that batch were removed due to the presence of multiple populations and low event count in the gated population), as well as one sample for **pH233 from the 06/02 batch** (well D4, which showed overexpression of the chimera).

Number of replicates measured for each chimera x host combination:
```{r}
# load genotype information
meta <- read_tsv("../data/20220621-chimera-Pho4-makeup.tsv", col_types = "ccccc") %>% 
  rename(genotype = full) %>% select(-plasmid_wrong)

datN %>% 
  filter(!low_event_count, !mutation) %>% 
  count(plasmid, host) %>% 
  pivot_wider(names_from = host, values_from = n) %>% 
  full_join(meta, by = "plasmid") %>% 
  select(-letter, -genotype)
```
> pH211 was measured 9 times in the _PHO2_ background. The results were consistent.

For each chimera, we would like to calculate **three values**:

1. RvG in _pho2∆_: this is its base activity without Pho2
1. RvG in _PHO2_: this is its full activity with Pho2
1. RvG_PHO2 / RvG_pho2∆: this is the Pho2 enhancement of activity

The first two values are further normalized against the corresponding ScPho4 measurements on the same plate.
```{r}
# filter data
tmp <- filter(datN, !low_event_count, !mutation, !is.na(plasmid)) %>% select(date:host, BL1.H:RvG)

# calculate the ScPho4 values per plate
mScPho4 <- tmp %>% filter(plasmid == "194") %>% group_by(date, host) %>% summarize(mean = mean(RvG), .groups = "drop") %>% 
  pivot_wider(names_from = host, values_from = mean) %>% column_to_rownames(var = "date")

# normalize the values for non-ScPho4 with those of ScPho4 per plate
datO <- tmp %>% mutate(sRvG = RvG / mScPho4[cbind(date, as.character(host))])

# summarize
datsum <- datO %>% 
  group_by(plasmid, host) %>% 
  summarize(across(RvG:sRvG, mean), .groups = "drop") %>% 
  pivot_wider(names_from = host, values_from = RvG:sRvG) %>% 
  rename(`A_PHO2` = `RvG_PHO2`, `A_pho2` = `RvG_pho2∆`, full = sRvG_PHO2, base = `sRvG_pho2∆`) %>% 
  mutate(boost = `A_PHO2` / `A_pho2`, across(`A_PHO2`:boost, ~ round(.x, digits = 3))) %>% 
  full_join(meta, by = "plasmid")
```

Export the summary data
```{r}
write_tsv(datsum, file = "../output/20220619-chimera-summary-result.tsv")
```

Here are the chimera that have little full activity (with Pho2):
`r datsum %>% filter(full < 0.2) %>% arrange(full) %>% select(-letter) %>% knitr::kable()`

Here are the ones with the highest base activity (without Pho2):
`r datsum %>% arrange(desc(base)) %>% select(-letter) %>% head(10) %>%  knitr::kable()`

## 3. Plotting functions
The goal here is to develop a set of plotting functions to visualize the results.

For testing purposes, we will select a set of chimera along with the endogenous ScPho4 and CgPho4.
```{r}
# extract ximera names
refs <- c("CCCCC","SSSSS")
ximeras <- setdiff(meta$symbol, refs)
# make a test set
test <- c(refs, filter(meta, plasmid %in% c("211", "229", "250", "224", "257", "219", "266")) %>% pull(symbol))
# subset data
datT <- meta %>% 
  filter(symbol %in% test) %>% 
  inner_join(datO, by = "plasmid") %>% 
  mutate(symbol = factor(symbol, levels = test))
```

Design a set of color palettes:
```{r}
# for two groups, e.g., with vs w/o Pho2
cols.two <- c("PHO2" = "#457dbc", "pho2∆" = "#ea9e25")
```

Plot individual components and the normalized activity (RvG ratio)
```{r fig.width=7, fig.height=6}
datT %>% 
  pivot_longer(c(nGFP, nRFP, RvG), names_to = "parameter", values_to = "intensity") %>% 
  mutate(parameter = ordered(parameter, levels = c("nGFP", "nRFP", "RvG"),
                             labels = c("Pho4-GFP", "_PHO5pr_ => RFP", "Activity = RFP/GFP"))) %>% 
  #pivot_longer(BL1.H:YL2.H, names_to = "parameter", values_to = "intensity") %>% 
  #mutate(parameter = ordered(parameter, levels = c("YL2.H", "BL1.H"))) %>% 
  ggplot(aes(x = symbol, y = intensity, group = host)) + 
  geom_bar(aes(fill = host), width = 0.5,# alpha = 0.8,
           stat = "summary", fun = "mean", position = position_dodge(0.5)) +
  geom_point(aes(color = host), size = 1, alpha = 0.9, shape = 3,
             position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5)) + 
  scale_color_manual("co-TF", values = c("PHO2" = "gray30", "pho2∆" = "gray40")) +
  scale_fill_manual("co-TF", values = cols.two) +
  #stat_summary(fun = "mean", geom = "crossbar", color = "red", width = 0.25,
  #             position = position_dodge(0.75), ) +
  facet_wrap(~parameter, scale = "free_y", ncol = 1) +
  xlab("Pho4 chimera") + expand_limits(y = 0) +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
        strip.text = element_markdown())
```

Plot the RvG values normalized against the corresponding ScPho4 construct on the same plate, to obtain a relative measure.
```{r}
# useful to group the plasmids
ximera.grp <- datsum %>% 
  mutate(group = case_when(
    full < 0.2 ~ "defective",
    base > 5   ~ "less Pho2-dep.",
    TRUE       ~ "others"
  )) %>% select(plasmid, symbol, group)
```

```{r fig.width=7, fig.height=6}
p0 <- datT %>% 
  left_join(ximera.grp, by = c("symbol", "plasmid")) %>% 
  ggplot(aes(x = symbol, y = sRvG, group = host)) + 
  geom_bar(aes(fill = host), width = 0.5,# alpha = 0.8,
           stat = "summary", fun = "mean", position = position_dodge(0.5)) +
  geom_point(aes(color = host), size = 1, alpha = 0.9, shape = 3,
             position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5)) + 
  scale_color_manual(values = c("PHO2" = "gray30", "pho2∆" = "gray40"), guide = "none") +
  scale_fill_manual(values = cols.two, guide = "none") +
  #stat_summary(fun = "mean", geom = "crossbar", color = "red", width = 0.25,
  #             position = position_dodge(0.75), ) +
  facet_grid(host~group, scale = "free") +
  xlab("Pho4 chimera") + ylab("A<sub>chimera</sub> / A<sub>ScPho4</sub>") +
  theme_bw(base_size = 14) + background_grid(minor = "none") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
        strip.text = element_markdown(), axis.title.y = element_markdown())

p1 <- datsum %>% 
  filter(symbol %in% test) %>% 
  left_join(ximera.grp, by = c("plasmid", "symbol")) %>% 
  ggplot(aes(x = symbol, y = boost)) +  
  geom_col(width = 0.3, color = "black", fill = "gray80") + 
  geom_hline(yintercept = 1, linetype = 2, color = "gray30") +
  facet_wrap(~group, scales = "free_x") + 
  scale_y_log10() +
  xlab("Pho4 chimera") + ylab("A<sub>PHO2</sub> / A<sub>pho2∆</sub>") +
  theme_bw(base_size = 14) + background_grid(minor = "none") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"),
        strip.text = element_blank(), axis.title.y = element_markdown())

set_null_device("png") # from https://github.com/wilkelab/cowplot/issues/174
plot_grid(p0 + theme(axis.title.x = element_blank()), 
          p1, ncol = 1, rel_heights = c(3,2))
```

save the data objects for the interactive data plotter
```{r}
save(datO, datsum, meta, file = "../shinyapp/20220620-data-for-interactive-plotting.RData")
```

Design the plot
```{r}
tmpsum %>% 
  mutate(Activity = ifelse(`R/G_PHO2` < 2*low.act.th["R/G_pho2∆"], "low", "pass")) %>% 
  ggplot(aes(x = symbol, y = `pho2∆/PHO2`)) + 
  geom_col(aes(group = date, fill = `R/G_PHO2`), width = 0.75, color = "gray50",
           position = position_dodge(0.9)) +
  #geom_point(aes(color = host), position = position_jitterdodge(dodge.width = 0.5)) + 
  scale_fill_gradient2("Activity") +
  #scale_color_manual(values = c(alpha("black",0), "red3")) +
  #stat_summary(fun = "mean", color = "red", geom = "crossbar", width = 0.2,
  #             position = position_dodge(0.75), ) +
  facet_grid(.~Activity, scales = "free_x", space = "free_x", labeller = "label_both") +
  theme_bw(base_size = 14) + xlab("Pho4 chimera") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, family = "mono"))
```

X-Y plot
```{r}
p3 <- tmpsum %>% 
  mutate(`nR/G_PHO2` = signif(`nR/G_PHO2`, digits = 2),
         `nR/G_pho2∆` = signif(`nR/G_pho2∆`, digits = 2)) %>% 
  ggplot(aes(x = `nR/G_PHO2`, y = `nR/G_pho2∆`, label = symbol)) + 
  geom_point(size = 2.5) + geom_abline(slope = 1) +
  theme_gray(base_size = 14)
ggplotly(p3, tooltip = c("label", "x", "y"))
```

