Welcome to an example workflow to generate figures and analysis in the following manuscript:
Robinson, S.L., Smith, M.D., Richman, J.E., Aukema, K.G., & Wackett, L.P. (2020) Machine learning-based prediction of activity and substrate specificity for OleA enzymes in the thiolase superfamily. Under review.
The complete scripts for analysis conducted in this study are available on the thiolase-machine-learning Github repo
maprdat <- read_csv("data/enzyme_substrate_activity_data.csv")
head(maprdat)
## # A tibble: 6 x 4
## filename cmpnd org log_slope
## <chr> <chr> <chr> <dbl>
## 1 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Arenimonas oryzit… 1.83
## 2 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Brachybacterium p… 1.64
## 3 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Chromatocurvus ha… 1.77
## 4 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Humibacillus sp. … 1.74
## 5 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Kytococcus sedent… 1.72
## 6 output//2019-10-18/2019-10-18_dodecanoa… dodecan… Micromonospora pe… 1.68
# Find the average across biological triplicates
maprdat_avg <- maprdat %>%
dplyr::group_by(cmpnd, org) %>%
dplyr::summarise_each(list(mean_activity = mean), log_slope)
# Convert to wide format
maprdat_merg <- as.data.frame(maprdat_avg, stringsAsFactors = F)
maprdat_wide <- reshape2::dcast(maprdat_merg, org ~ cmpnd, value.var = "mean_activity")
maprdat_wide[is.na(maprdat_wide)] <- 0
rawdat_mat <- maprdat_wide %>%
dplyr::select(-org) %>%
as.matrix()
# Set color palette
pal <- inferno(80)
pal2 <- pal[c(10:80)]
# Fix names
maprdat_mat <- rawdat_mat
rownames(maprdat_mat) <- maprdat_wide$org
rownames(maprdat_mat) <- gsub("_", " ", rownames(maprdat_mat))
rownames(maprdat_mat) <- gsub("XC", "Xanthomonas campestris OleA*", rownames(maprdat_mat))
rownames(maprdat_mat) <- gsub("Pseudoxanthomonas", "Pseudoxanthomonas sp.", rownames(maprdat_mat))
rownames(maprdat_mat) <- paste0(word(rownames(maprdat_mat), 1, sep = " "), " ", word(rownames(maprdat_mat), 2, sep = " "))
# Remove duplicates and exceptions
dedup <- maprdat_mat[!duplicated(rownames(maprdat_mat)),]
dedup_sort <- dedup[order(rowSums(dedup), decreasing = T),]
# Add in benzoate
testr <- cbind(dedup_sort, rep(0, nrow(dedup_sort)))
colnames(testr)[ncol(testr)] <- "benzoate"
testm <- testr
testm[testm > 0] <- 1
nsub <- data.frame(cbind(rownames(testm), rowSums(testm)))
colnames(nsub) <- c("org", "nsub")
tr <- "data/73_OleA_aligned_trimmed_fasttree_phylo.nwk"
fastree <- treeio::read.tree(tr)
ggtree_phylo <- treeio::as.phylo(fastree)
# Read in the taxonomy
rawtax <- read_excel("data/OleA_taxonomic_classification.xlsx")
xantho <- rawtax[grep("Xanthomonas", rawtax$organism),]
xantho$organism <- c("Xanthomonas_campestris_OleA")
xantho$genus <- "Xanthomonas_campestris"
tax <- rawtax %>%
bind_rows(xantho)
tax$organism <- gsub(" ", "_", tax$organism)
ptax <- ggtree_phylo$tip.label
for(i in 1:length(tax$organism)) {
ind <- grep(tax$organism[i], ptax)
tax$acc[i] <- word(ptax[ind], sep = "\\.1", 1)
ptax[ind] <- paste0(ptax[ind], "_", tax$class[i], "_", tax$phylum[i])
}
tax$acc[is.na(tax$acc)] <- "NP_635607"
# Reorder the data frame to match
dt2 <- data.frame(ptax, word(ptax, sep = "\\.1", 1))
colnames(dt2) <- c("label", "acc")
# Merge with the tax df
merg <- dt2 %>%
inner_join(tax, by = "acc") %>%
dplyr::mutate(labs = paste0(word(organism, sep = "_", 1), " ",
word(organism, sep = "_", 2)))
# Fix discrepancies in merging names
pseudo1 <- rownames(maprdat_mat)[grep("Pseudoxanthomonas", rownames(maprdat_mat))]
pseudo2 <- merg$labs[grep("Pseudoxanthomonas", merg$labs)][1]
merg$labs[grep("Pseudoxanthomonas", merg$labs)] <- pseudo1
leif1 <- rownames(maprdat_mat)[grep("Leifsonia", rownames(maprdat_mat))]
leif2 <- merg$labs[grep("Leifsonia", merg$labs)][1]
merg$labs[grep("Leifsonia", merg$labs)] <- leif1
# Create a phylogenetic tree
ggtree_phylo$tip.label <- merg$labs
namord <- names(sort(colMeans(testr), decreasing = T))
test_ord <- testr[,match(namord, colnames(testr))]
ggt <- ggtree(ggtree_phylo) +
geom_tiplab(aes(fontface = "italic"),
align = TRUE, linetype = 2, offset = 0.04) +
xlim(NA, 20)
# Calculate average activity
avg_act <- test_ord %>%
rowMeans(.) %>%
data.frame(., stringsAsFactors = F) %>%
rownames_to_column(., var = "org")
colnames(avg_act) <- c("org", "avg_activity")
# Merge with taxonomic information
mergtax <- avg_act %>%
left_join(., merg, by = c("org" = "labs"))
# Read in the custom palette
clustkey <- read_csv("data/OleA_palette_key.csv")
clustkey$levs[clustkey$levs == "Green non-sulfur bacteria"] <- "Chloroflexi"
# Fix discrepancies in naming
mergtax$class[grepl("Opit", mergtax$class)] <- "Opitutae"
mergord <- mergtax[rev(match(ggt$data$label[ggt$data$isTip], mergtax$org)),] %>%
dplyr::select(-label) %>%
dplyr::mutate(label = org) %>%
dplyr::left_join(., clustkey, by = c("class" = "levs")) %>%
dplyr::select(label, org, avg_activity, acc, organism, genus, family, order, class, phylum, pal2)
df2 <- data.frame(cbind(mergord$acc, mergord$organism, mergord$label, as.numeric(mergord$avg_activity)), stringsAsFactors = F)
df3 <- df2[order(df2$X2, decreasing = T),] %>%
dplyr::left_join(., nsub, by = c("X3" = "org")) %>%
dplyr::mutate(organism = gsub("_", " ", X2)) %>%
dplyr::mutate(activity = round(as.numeric(X4), 3)) %>%
dplyr::select(X1, organism, activity, nsub) %>%
arrange(desc(activity))
colnames(df3) <- c('NCBI Accession', 'Organism', 'Average activity', 'Total number of substrates accepted')
ggdf <- ggt %<+% mergord
clustord <- clustkey[match(levels(as.factor(ggdf$data$class)), clustkey$levs),]
# Draw a phylogenetic tree
gsz <- ggdf +
geom_tippoint(aes(size = avg_activity), color = ggdf$data$pal2[ggdf$data$isTip],
fill = ggdf$data$pal2[ggdf$data$isTip], x = 10.35)
# Pair phylogenetic tree with heatmap
gheatmap(gsz, test_ord, offset = 5.75, width = 2,
colnames_position = "top", colnames_offset_y = 2,
colnames_angle = 60, font.size = 4, color = NULL) +
scale_fill_viridis(option = "inferno") +
theme(legend.title = element_blank(),
legend.position = "none")
# Taxonomic legend
plot.new()
par(mar=c(0.01,0.01,0.01,0.01))
legend("center", bty = "n", bg = "white",
legend = clustord$levs, pch = 19,
cex = 1, pt.cex = 1,
col = clustord$pal2)
# Read in the features for machine learning
dat <- read_csv("data/machine_learning/20200111_1095_training_examples_12angstrom_prot_features.csv")
# Set seed
set.seed(1234)
# Split into test and training sets
dat_split <- initial_split(dat, strata = "is_active")
dat_train <- training(dat_split)
dat_test <- testing(dat_split)
nrow(dat_train)/nrow(dat) # 75 % training, 25 % testing
## [1] 0.7506849
# Define our response
x_train <- dat_train[,!colnames(dat_train) %in% c("id", "is_active")]
x_test <- dat_test[,!colnames(dat_test) %in% c("id", "is_active")]
y_train <- dat_train$is_active
y_test <- dat_test$is_active
# Make a data frame for prediction
df_train <- data.frame(x_train, stringsAsFactors = F, row.names = dat_train$id)
# Complete dataset for training and testing
form_train <- data.frame(cbind(x_train, y_train), stringsAsFactors = F, row.names = dat_train$id)
form_test <- data.frame(cbind(x_test, y_test), stringsAsFactors = F, row.names = dat_test$id)
# Random forest 10-fold cross-validation (not run due to computational time)
# rf <- train(
# x = df_train,
# y = y_train,
# method = "ranger",
# metric = "ROC",
# trControl = trainControl(method = "repeatedcv", number = 10,
# repeats = 3,
# verboseIter = T, classProbs = T,
# savePredictions = "final",
# returnResamp = "all"),
# num.trees = 1000,
# verbose = TRUE,
# importance = "permutation")
# saveRDS(rf, "data/machine_learning/models/20200111_rf_10foldcv.rds")
# Read in the random forest model from cross-validation
rf_bin_xval <- readRDS("data/machine_learning/models/20200111_rf_10foldcv.rds")
rf_bin_xval$bestTune # Hyperparameters from cross-validation
## mtry splitrule min.node.size
## 6 334 extratrees 1
# Receiver operating characteristic curve
rfRoc <- roc(response = rf_bin_xval$pred$obs,
predictor = rf_bin_xval$pred$Y,
levels = rev(levels(rf_bin_xval$pred$obs)))
par(bty = "L")
plot(rfRoc, type = "s",
col = "#529DCF", xaxs = "i", yaxs="i",
print.auc = TRUE, print.auc.x = 0.8, print.auc.y = 0.6)
vdf <- varImp(rf_bin_xval, scale = T)
dtfvec <- vdf$importance[order(vdf$importance, decreasing = T),]
dtfnams <- rownames(vdf$importance)[order(vdf$importance, decreasing = T)]
dtfsort <- data.frame(cbind(dtfnams[1:30], dtfvec[1:30]), stringsAsFactors = F)
colnames(dtfsort)[1:2] <- c("variable", "importance")
dtfsort$variable <- as.character(dtfsort$variable)
dtfsort$importance <- as.numeric(dtfsort$importance)
vimp_combined <- dtfsort %>%
dplyr::mutate(var_fix = case_when(grepl("PC3", variable) ~ "Aromaticity (PC3)",
grepl("PC1", variable) ~ "Molecular weight (PC1)",
grepl("PC2", variable) ~ "Molecular connectivity (PC2)",
grepl("PC4", variable) ~ "Solubility (PC4)",
grepl("PC5", variable) ~ "Oxygen content (PC5)",
grepl("PC6", variable) ~ "Nitrogen content (PC6)",
grepl("PC7", variable) ~ "Chlorine content (PC7)",
grepl("^KF1", variable) ~ "Helix preference (Kidera factor 1)",
grepl("^F2", variable) ~ "Turn propensity (FASGAI factor 2)",
grepl("hmoment", variable) ~ "Hydrophobic moment",
grepl("secstr", variable) ~ paste0("Secondary structure (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("polrty", variable) ~ paste0("Polarity (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("molsz", variable) ~ paste0("Amino acid volume (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("elechrg", variable) ~ paste0("Electrostatic charge (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("^boman", variable) ~ "Protein-protein interaction index",
grepl("^hmoment", variable) ~ "Hydrophobic moment",
grepl("^ST1", variable) ~ "Structural-topology scale factor 1",
grepl("mw_prot", variable) ~ "Protein molecular weight",
grepl("lngth_prot", variable) ~ "Protein length",
grepl("VHSE5", variable) ~ "Protein electronic properties",
grepl("hydrophob_core", variable) ~ "Hydrophobicity (12Ã… protein core)",
grepl("ST5", variable) ~ "Structural-topology scale factor 5",
grepl("PP3", variable) ~ "Protein H-bonding (Cruciani property 3)",
grepl("Z1", variable) ~ "Protein lipophilicity index",
grepl("KF6", variable) ~ "Protein partial specific volume",
grepl("ST3", variable) ~ "Structural-topology scale factor 3",
grepl("T4", variable) ~ "Topological descriptor 4",
grepl("F3", variable) ~ "Bulky properties (FASGAI factor 3)",
grepl("ST6", variable) ~ "Structural-topology scale factor 6")) %>%
dplyr::mutate(color_fct = case_when(grepl("PC", var_fix) ~ "3",
grepl("\\(aa", var_fix) ~ "2",
TRUE ~ "1"))
reds <- rev(c("firebrick4", "firebrick1", "lightpink"))
blues <- rev(c("midnightblue", "royalblue1", "lightskyblue1"))
blues <- brewer.pal(9, "Blues")[c(3, 6, 9)]
vpavg <- ggplot(data = vimp_combined,
aes(x=reorder(var_fix, importance), y=importance, fill=color_fct))+
geom_bar(stat="identity", position ="dodge", color = "black") +
ylab("Variable Importance")+
xlab("")+
guides(fill = F) +
scale_fill_manual(values = blues) +
theme_pubr()+
coord_flip()
vpavg
# Set seed
set.seed(1234)
# Read in the activity data
dat <- read_csv('data/machine_learning/20200111_550_regression_training_examples_12angstrom_prot_features.csv')
# Split into test and training data
dat_split <- initial_split(dat, strata = "activity")
dat_train <- training(dat_split)
dat_test <- testing(dat_split)
nrow(dat_train)/nrow(dat) # 75 %
## [1] 0.7527273
# Define our response
x_train <- dat_train[,!colnames(dat_train) %in% c("id", "activity")]
x_test <- dat_test[,!colnames(dat_test) %in% c("id", "activity")]
y_train <- dat_train$activity
y_test <- dat_test$activity
# Make a data frame for prediction
df_train <- data.frame(x_train, stringsAsFactors = F, row.names = dat_train$id)
# Complete dataset for training and testing
form_train <- data.frame(cbind(x_train, y_train), stringsAsFactors = F, row.names = dat_train$id)
form_test <- data.frame(cbind(x_test, y_test), stringsAsFactors = F, row.names = dat_test$id)
# Random forest regression 10-fold cross-validation (not run due to computational time)
# rf_mod <- train(
# x = df_train,
# y = y_train,
# method = "ranger",
# trControl = trainControl(method = "repeatedcv", number = 10,
# repeats = 3,
# savePredictions = "final"),
# num.trees = 1000,
# verbose = TRUE,
# importance = "permutation")
# saveRDS(rf_mod, "data/machine_learning/models/20200111_rf_mod_10foldcv.rds")
rf_reg_xval <- readRDS("data/machine_learning/models/20200111_rf_mod_10foldcv.rds")
# Calculate training set root-mean square error
Metrics::rmse(rf_reg_xval$pred$pred, rf_reg_xval$pred$obs)
## [1] 0.2552978
rf_reg <- ranger(y_train ~., data = form_train, num.trees = 1000,
splitrule = rf_reg_xval$bestTune$splitrule,
mtry = rf_reg_xval$bestTune$mtry,
min.node.size = rf_reg_xval$bestTune$min.node.size,
importance = "permutation")
rf_pred <- predict(rf_reg, data = form_test)$predictions
Metrics::rmse(y_test, rf_pred) # 0.219
## [1] 0.2190977
rf_df <- data.frame(cbind(rf_pred, y_test))
my.formula <- y ~ x
rf_df_resid <- rf_df %>%
mutate(resid = y_test - rf_pred)
colnames(rf_df_resid) <- c("pred", "obs", "resid")
ggplot(rf_df_resid, aes(x = pred, y = resid)) +
geom_point(alpha = .3) +
geom_line(col = "red", y = 0.0,
lty = 2, lwd = 1, alpha = .5) +
theme_pubr() +
xlab("Predicted enzyme activity") +
ylab("Residuals")
ggplot(rf_df_resid, aes(x = obs, y = pred)) +
geom_point(alpha = .3) +
geom_smooth(se = FALSE, col = "red", method = "lm",
lty = 2, lwd = 1, alpha = .5) +
theme_pubr() +
xlab("Observed enzyme activity") +
ylab("Predicted enzyme activity") +
stat_poly_eq(formula = my.formula,
aes(label = paste(..rr.label.., sep = "~~~")),
parse = TRUE)
rf_reg_xval <- readRDS("data/machine_learning/models/20200111_rf_mod_10foldcv.rds")
vdf <- varImp(rf_reg_xval, scale = T)
dtfvec <-vdf$importance[order(vdf$importance, decreasing = T),]
dtfnams <- rownames(vdf$importance)[order(vdf$importance, decreasing = T)]
dtfsort <- data.frame(cbind(dtfnams[1:30], dtfvec[1:30]), stringsAsFactors = F)
colnames(dtfsort)[1:2] <- c("variable", "importance")
dtfsort$variable <- as.character(dtfsort$variable)
dtfsort$importance <- as.numeric(dtfsort$importance)
vimp_combined <- dtfsort %>%
dplyr::mutate(var_fix = case_when(grepl("PC3", variable) ~ "Aromaticity (PC3)",
grepl("PC1", variable) ~ "Molecular weight (PC1)",
grepl("PC2", variable) ~ "Molecular connectivity (PC2)",
grepl("PC4", variable) ~ "Solubility (PC4)",
grepl("PC5", variable) ~ "Oxygen content (PC5)",
grepl("PC6", variable) ~ "Nitrogen content (PC6)",
grepl("PC7", variable) ~ "Chlorine content (PC7)",
grepl("^KF1", variable) ~ "Helix preference (Kidera factor 1)",
grepl("^F2", variable) ~ "Turn propensity (FASGAI factor 2)",
grepl("hmoment", variable) ~ "Hydrophobic moment",
grepl("secstr", variable) ~ paste0("Secondary structure (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("polrty", variable) ~ paste0("Polarity (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("molsz", variable) ~ paste0("Amino acid volume (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("elechrg", variable) ~ paste0("Electrostatic charge (aa ", stringr::word(variable, -1, sep = "_"), ")"),
grepl("^boman", variable) ~ "Protein-protein interaction index",
grepl("^hmoment", variable) ~ "Hydrophobic moment",
grepl("^ST1", variable) ~ "Structural-topology scale factor 1",
grepl("mw_prot", variable) ~ "Protein molecular weight",
grepl("lngth_prot", variable) ~ "Protein length",
grepl("VHSE5", variable) ~ "Protein electronic properties",
grepl("hydrophob_core", variable) ~ "Hydrophobicity (12Ã… protein core)",
grepl("ST5", variable) ~ "Structural-topology scale factor 5",
grepl("PP3", variable) ~ "Protein H-bonding (Cruciani property 3)",
grepl("Z1", variable) ~ "Protein lipophilicity index",
grepl("KF6", variable) ~ "Protein partial specific volume",
grepl("ST3", variable) ~ "Structural-topology scale factor 3",
grepl("F5", variable) ~ "Local flexibility (FASGAI factor 5)",
grepl("pi_prot", variable) ~ "Protein isoelectric point")) %>%
dplyr::mutate(color_fct = case_when(grepl("PC", var_fix) ~ "3",
grepl("\\(aa", var_fix) ~ "2",
TRUE ~ "1"))
reds <- rev(c("firebrick4", "firebrick1", "lightpink"))
blues <- rev(c("midnightblue", "royalblue1", "lightskyblue1"))
blues <- brewer.pal(9, "Blues")[c(3, 6, 9)]
vpavg <- ggplot(data = vimp_combined,
aes(x=reorder(var_fix, importance), y=importance, fill=color_fct))+
geom_bar(stat="identity", position="dodge", color = "black") +
ylab("Variable Importance")+
xlab("")+
guides(fill=F) +
scale_fill_manual(values = reds) +
theme_pubr() +
coord_flip()
vpavg
# Align with template sequence and extract active site residues
new_query_seq <- "data/new_test_OleA.fasta"
nqs <- readAAStringSet("data/new_test_OleA.fasta")
nqs$Vulcaniibacterium_new_seq
## 340-letter "AAString" instance
## seq: MLFQHVAIAGLAHIDAPRRLTSEEIHSRLKPTLDRL...IVLSKLREMGRLKKGNRIALLGIGSGLNCSMAEVVW
source("lib/extract_12angstrom_aas.R")
extract_84_list <- lapply(1:length(length(new_query_seq)),
function(x) { extract_12angstrom_aas(new_query_seq) })
extract_84_df <- data.frame(matrix(unlist(extract_84_list), nrow = length(extract_84_list), byrow=T),
stringsAsFactors = FALSE)
colnams84 <- read_csv("shiny_app/data/seqft_colnames.csv", col_names = F) %>%
pull()
colnames(extract_84_df) <- colnams84
# Remove sequence features with non-zero variance
which_rem <- read_csv("shiny_app/data/vars_to_remove.csv", col_names = T) %>%
pull()
# Create a data frame of substrate features
molec_fts <- read_csv("shiny_app/data/PC7_molecular_descriptors_clean.csv")
hex <- molec_fts[,2:8]
subs <- rep(pull(molec_fts[,1]), length(new_query_seq))
hex_df <- data.frame(cbind(subs, apply(hex, 2, function(x) rep(x, length(new_query_seq)))), stringsAsFactors = F)
colnames(hex_df) <- c("substrate", paste0("PC", 1:7))
hex_df_sorted <- hex_df[order(hex_df$PC1),]
ex84_df_scaled <- data.frame(apply(extract_84_df, 2, function(x) rep(x, 15)), stringsAsFactors = F)
# Create full data frame of features
test_df <- bind_cols(hex_df_sorted, ex84_df_scaled) %>%
dplyr::select(-all_of(which_rem), -substrate)
test_df[,1:7] <- apply(test_df[,1:7], 2, as.numeric)
# Binary classification
rf_funct_class <- readRDS("shiny_app/data/models/20200324_rf_bin_class_all_data_no_prot_unscaled_optimized.rds")
rf_funct_class_pred <- predict(rf_funct_class, data = test_df, predict.all = F)
rf_fc_pred <- rf_funct_class_pred$predictions
res_fc_prob <- apply(rf_fc_pred, 1, max)
res_fc <- tibble(colnames(rf_fc_pred)[apply(rf_fc_pred, 1, which.max)])
# Return a data frame of predictions
pred_df <- data.frame(rep(names(nqs), 15), hex_df_sorted$substrate, res_fc, round(res_fc_prob, 2), stringsAsFactors = F)
# Regression
rf_reg <- readRDS("shiny_app/data/models/20200401_rf_regression_all_data_no_prot_unscaled_optimized.rds")
rf_reg <- predict(rf_reg, data = test_df, predict.all = F)
rf_reg_pred <- rf_reg$predictions
reg_df <- data.frame(cbind(pred_df, round(rf_reg_pred, 2)), stringsAsFactors = F)
colnames(reg_df) <- c("query_name", "subst", "is_active", "pred_prob", "raw_activity")
# Show picture of substrate
reg_df$substrate <- paste0('<img src = "https://github.com/serina-robinson/thiolase-machine-learning/raw/master/shiny_app/www/', ... = reg_df$subst, '.jpeg" style="width: 90%; height: auto;" >')
reg_df$raw_activity[reg_df$is_active == "N"] <- "No predicted activity"
ord_df <- reg_df[order(reg_df$query_name),] %>%
dplyr::mutate(IUPAC_name = case_when(subst == "decanoate" ~ "4-nitrophenyl decanoate",
subst == "dodecanoate" ~ "4-nitrophenyl dodecanoate",
subst == "7Ph.heptanoate" ~ "4-nitrophenyl 7-phenylheptanoate",
subst == "hexanoate" ~ "4-nitrophenyl hexanoate",
subst == "heptanoate" ~ "4-nitrophenyl heptanoate",
subst == "biotin" ~ "4-nitrophenyl biotin",
subst == "heptynoate" ~ "4-nitrophenyl heptynoate",
subst == "oxadiazole" ~ "4-nitrophenyl 3-(5-phenyl-1,3,4-oxadiazol-2-yl)propanoate",
subst == "dimethyl" ~ "4-nitrophenyl 2,2-dimethylhexanoate",
subst == "butoxy" ~ "4-nitrophenyl 2-(2-butoxyethoxy)acetate",
subst == "cyclopentyl" ~ "4-nitrophenyl 3-cyclopentylpropanoate",
subst == "TMA" ~ "4-nitrophenyl trimethylacetate",
subst == "azido" ~ "4-nitrophenyl 6-azidohexanoate",
subst == "benzoate" ~ "4-nitrophenyl benzoate",
subst == "ClPh.propionate" ~ "4-nitrophenyl-3-(4-chlorophenoxy)propanoate",
TRUE ~ "")) %>%
dplyr::select(query_name, IUPAC_name, substrate, is_active, pred_prob, raw_activity)
ord_df %>%
mutate(`Query name` = query_name) %>%
mutate(`Substrate` = substrate) %>%
mutate(`IUPAC name` = IUPAC_name) %>%
mutate(`Active?` = cell_spec(is_active, color = "white",
background = case_when(is_active == "Y" & pred_prob >= 0.65 ~ "green",
is_active == "N" & pred_prob >= 0.65 ~ "darkred",
pred_prob < 0.65 ~ "grey"))) %>%
mutate(`Prediction probability` = cell_spec(pred_prob, color = "white",
background = case_when(is_active == "Y" & pred_prob >= 0.65 ~ "green",
is_active == "N" & pred_prob >= 0.65 ~ "darkred",
pred_prob < 0.65 ~ "grey"))) %>%
mutate(`Estimated log10 activity ☨` = cell_spec(raw_activity, color = "white",
background = case_when(is_active == "Y" & pred_prob >= 0.65 ~ "green",
is_active == "N" & pred_prob >= 0.65 ~ "darkred",
pred_prob < 0.65 ~ "grey"))) %>%
select(-query_name, -pred_prob, -is_active, -raw_activity, -substrate, -IUPAC_name) %>%
kable(., format = 'html', escape = F) %>%
row_spec(1:nrow(reg_df), background = "white") %>%
kable_styling(font_size = 15,
bootstrap_options = c("striped", "hover", "condensed"))
| Query name | Substrate | IUPAC name | Active? | Prediction probability | Estimated log10 activity ☨ |
|---|---|---|---|---|---|
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl dodecanoate | Y | 0.88 | 1.74 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 3-(5-phenyl-1,3,4-oxadiazol-2-yl)propanoate | Y | 0.88 | 1.48 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl biotin | N | 0.94 | No predicted activity |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl-3-(4-chlorophenoxy)propanoate | N | 0.89 | No predicted activity |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 7-phenylheptanoate | Y | 0.92 | 1.56 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 2-(2-butoxyethoxy)acetate | Y | 1 | 2.37 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl decanoate | Y | 0.81 | 1.66 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 3-cyclopentylpropanoate | Y | 0.99 | 1.89 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 6-azidohexanoate | Y | 1 | 2.1 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl benzoate | N | 0.99 | No predicted activity |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl 2,2-dimethylhexanoate | N | 0.93 | No predicted activity |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl heptanoate | Y | 0.95 | 2.26 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl heptynoate | Y | 1 | 1.97 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl hexanoate | Y | 0.98 | 2.19 |
| Vulcaniibacterium_new_seq |
|
4-nitrophenyl trimethylacetate | Y | 0.97 | 1.2 |