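This report presents a downstream data analysis and visualization of the matrix obtained from the AgeXtend tool. The matrix contains probability values from various predictive models for a set of chemical compounds. The primary goals of this analysis are to summarize and visualize the distributions of the predicted probabilities, examine the correlations among them, cluster compounds by their Anti-Aging and Hallmarks of Aging profiles, flag compounds with high predicted toxicity or CYP interaction probabilities, and relate these predictions to the MRTD values.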
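The dataset includes the following key columns: `SMILES` (the compound identifier), `Anti_Aging_Prob`, nine Hallmarks of Aging probabilities (`AIC_Prob1` through `TA_Prob1`), fourteen toxicity and metabolism probabilities (`AMES_Prob1` through `P-gp_Substrate_Prob1`), and two MRTD columns (`MRTD (mg/day)` and `MRTD (uMol)`).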
The first step is to load the data from the provided CSV file.
# Load the data
# Make sure the file 'agextend_probabilities.csv' is in the same directory as this Rmd file,
# or provide the full path to the file.
file_path <- "agextend_probabilities.csv" # Make sure this file is accessible
if (file.exists(file_path)) {
  # check.names = FALSE keeps headers such as "MRTD (mg/day)" verbatim.
  data_orig <- read.csv(file_path, stringsAsFactors = FALSE, check.names = FALSE)
  cat("Data loaded successfully.\n")
  cat("Number of rows:", nrow(data_orig), "\n")
  cat("Number of columns:", ncol(data_orig), "\n")
} else {
  warning("The file 'agextend_probabilities.csv' was not found. Using placeholder data for demonstration. Please ensure the file is in the correct path for actual analysis.")
  # Random placeholder with the same 27-column layout as the real matrix, so the
  # remaining chunks can run for demonstration purposes.
  data_orig <- data.frame(
    SMILES = paste0("C(C\\O)=N#N@", 1:10), # Example SMILES with special chars
    Anti_Aging_Prob = runif(10),
    AIC_Prob1 = runif(10), CS_Prob1 = runif(10), DNS_Prob1 = runif(10),
    EA_Prob1 = runif(10), GI_Prob1 = runif(10), LP_Prob1 = runif(10),
    MD_Prob1 = runif(10), SCE_Prob1 = runif(10), TA_Prob1 = runif(10),
    AMES_Prob1 = runif(10), BBB_Prob1 = runif(10), CYP1A2_Prob1 = runif(10),
    CYP2C19_Prob1 = runif(10), CYP2C9_Prob1 = runif(10), CYP2D6_Prob1 = runif(10),
    CYP3A4_Prob1 = runif(10), DILI_Prob1 = runif(10), Hepato_Prob1 = runif(10),
    hERG_Prob1 = runif(10), HLM_Prob1 = runif(10), MMP_Prob1 = runif(10),
    `P-gp_Inhibitor_Prob1` = runif(10), `P-gp_Substrate_Prob1` = runif(10),
    `MRTD (mg/day)` = rnorm(10, -4, 1),
    `MRTD (uMol)` = rnorm(10, 50, 20),
    check.names = FALSE
  )
}
# `data_orig` is now defined on BOTH paths: later sections read it, and it was
# previously missing when the placeholder data was used.
data <- data_orig # Work with a copy
## Data loaded successfully.
## Number of rows: 120
## Number of columns: 27
# Display the first few rows using kable for PDF
cat("\nFirst few rows of the dataset:\n")
##
## First few rows of the dataset:
# Take the first rows of the dataset for a preview table.
df_to_print_head <- head(data)
# The table is rendered with escape = FALSE, so the SMILES strings are first
# passed through sanitize_smiles_for_latex() (defined elsewhere; presumably it
# escapes LaTeX-special characters -- confirm).
smiles_present <- is.element("SMILES", names(df_to_print_head))
if (smiles_present) {
  df_to_print_head[["SMILES"]] <- sanitize_smiles_for_latex(df_to_print_head[["SMILES"]])
}
kable(
  df_to_print_head,
  caption = "First few rows of the dataset.",
  escape = FALSE
)
| SMILES | Anti_Aging_Prob | AIC_Prob1 | CS_Prob1 | DNS_Prob1 | EA_Prob1 | GI_Prob1 | LP_Prob1 | MD_Prob1 | SCE_Prob1 | TA_Prob1 | AMES_Prob1 | BBB_Prob1 | CYP1A2_Prob1 | CYP2C19_Prob1 | CYP2C9_Prob1 | CYP2D6_Prob1 | CYP3A4_Prob1 | DILI_Prob1 | Hepato_Prob1 | hERG_Prob1 | HLM_Prob1 | MMP_Prob1 | P-gp_Inhibitor_Prob1 | P-gp_Substrate_Prob1 | MRTD (mg/day) | MRTD (uMol) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.4153507 | 0.0870812 | 0.1679620 | 0.0022072 | 0.0233503 | 0.0000000 | 0.6721489 | 0.9549921 | 0.0470 | 0.954 | 0.3541652 | 0.0012305 | 0.5704448 | 0.0503423 | 0.0108990 | 0.0540955 | 0.0460715 | 0.8058858 | 0.1062473 | 0.0709472 | 0.5895873 | 0.0169489 | 0.0240385 | 0.0870644 | -4.422064 | 37.83864 | |
| 0.8601621 | 0.0015100 | 0.1796195 | 0.0020561 | 0.0044521 | 0.0000000 | 0.9146299 | 0.0018636 | 0.6056 | 0.758 | 0.0038247 | 0.0003003 | 0.0000862 | 0.0612354 | 0.0128114 | 0.0475504 | 0.0311646 | 0.2049868 | 0.1590862 | 0.0409310 | 0.0391542 | 0.0062550 | 0.0000000 | 0.1111346 | -3.842360 | 143.76073 | |
| 0.8675090 | 0.0442689 | 0.1971555 | 0.0035731 | 0.0038362 | 0.0000000 | 0.5062657 | 0.4111765 | 0.6004 | 0.844 | 0.8564523 | 0.0000013 | 0.0000386 | 0.0582825 | 0.0019034 | 0.0617479 | 0.0564710 | 0.5432760 | 0.1559419 | 0.0519310 | 0.1466679 | 0.0046539 | 0.0600962 | 0.0853015 | -3.842360 | 143.76073 | |
| 0.8675582 | 0.0104090 | 0.1560525 | 0.0065604 | 0.0023478 | 0.0092593 | 0.8536523 | 0.0187889 | 0.4318 | 0.878 | 0.1791295 | 0.0000015 | 0.0008880 | 0.0639984 | 0.0087752 | 0.0486175 | 0.0328780 | 0.3559380 | 0.1664355 | 0.0435770 | 0.0929566 | 0.0034897 | 0.0000000 | 0.0580268 | -4.422064 | 37.83864 | |
| 0.8675303 | 0.0375370 | 0.2458189 | 0.0037018 | 0.1736523 | 0.0000000 | 0.8936273 | 0.0215177 | 0.4830 | 0.900 | 0.0075567 | 0.2062904 | 0.0031630 | 0.0400446 | 0.0085117 | 0.0520556 | 0.0314254 | 0.5750942 | 0.0669128 | 0.0283625 | 0.0486844 | 0.0050056 | 0.0456731 | 0.0998663 | -4.833956 | 14.65698 | |
| 0.5000000 | 0.0016074 | 0.0827787 | 0.0116589 | 0.0528406 | 0.0000000 | 0.9622016 | 0.0010812 | 0.3468 | 0.906 | 0.1979274 | 0.0115721 | 0.0037084 | 0.0619940 | 0.0072656 | 0.0618336 | 0.0573247 | 0.6784239 | 0.1241605 | 0.0344797 | 0.1381379 | 0.0077853 | 0.0000000 | 0.2291731 | -3.842360 | 143.76073 |
We will perform initial data cleaning, such as checking for missing values and ensuring correct data types. Column names often contain special characters or spaces that can be problematic in R. We will sanitize them.
# Sanitize column names into syntactically valid R names. make.names() replaces
# characters such as "-", " ", "(" and "/" with dots (e.g. "MRTD (uMol)" becomes
# "MRTD..uMol."); unique = TRUE de-duplicates any resulting clashes.
original_colnames <- names(data)
names(data) <- make.names(original_colnames, unique = TRUE)
# Keep an original -> sanitized lookup so the renaming can be reported.
sanitized_colnames_df <- data.frame(
  Original = original_colnames,
  Sanitized = names(data)
)
cat("Sanitized Column Names:\n")
## Sanitized Column Names:
kable(sanitized_colnames_df, caption = "Mapping of Original to Sanitized Column Names.")
| Original | Sanitized |
|---|---|
| SMILES | SMILES |
| Anti_Aging_Prob | Anti_Aging_Prob |
| AIC_Prob1 | AIC_Prob1 |
| CS_Prob1 | CS_Prob1 |
| DNS_Prob1 | DNS_Prob1 |
| EA_Prob1 | EA_Prob1 |
| GI_Prob1 | GI_Prob1 |
| LP_Prob1 | LP_Prob1 |
| MD_Prob1 | MD_Prob1 |
| SCE_Prob1 | SCE_Prob1 |
| TA_Prob1 | TA_Prob1 |
| AMES_Prob1 | AMES_Prob1 |
| BBB_Prob1 | BBB_Prob1 |
| CYP1A2_Prob1 | CYP1A2_Prob1 |
| CYP2C19_Prob1 | CYP2C19_Prob1 |
| CYP2C9_Prob1 | CYP2C9_Prob1 |
| CYP2D6_Prob1 | CYP2D6_Prob1 |
| CYP3A4_Prob1 | CYP3A4_Prob1 |
| DILI_Prob1 | DILI_Prob1 |
| Hepato_Prob1 | Hepato_Prob1 |
| hERG_Prob1 | hERG_Prob1 |
| HLM_Prob1 | HLM_Prob1 |
| MMP_Prob1 | MMP_Prob1 |
| P-gp_Inhibitor_Prob1 | P.gp_Inhibitor_Prob1 |
| P-gp_Substrate_Prob1 | P.gp_Substrate_Prob1 |
| MRTD (mg/day) | MRTD..mg.day. |
| MRTD (uMol) | MRTD..uMol. |
# Identify column groups based on sanitized names.
# These indices must match your actual dataset structure after sanitization.
expected_col_indices <- c(2, 3:11, 12:25, 26, 27)
# Validate the column count BEFORE indexing: indexing past the last column
# silently yields NA names instead of failing.
if (max(expected_col_indices) > ncol(data)) {
  stop("The dataset does not have the expected number of columns. Please check the input file and column definitions.")
}
anti_aging_col <- colnames(data)[2]
hallmarks_cols <- colnames(data)[3:11]
toxicity_metabolism_cols <- colnames(data)[12:25]
mrtd_mg_col <- colnames(data)[26]
mrtd_umol_col <- colnames(data)[27]
probability_cols <- c(anti_aging_col, hallmarks_cols, toxicity_metabolism_cols)
# Sanity check that every identified name is a real column.
stopifnot(all(c(probability_cols, mrtd_mg_col, mrtd_umol_col) %in% colnames(data)))

# Convert probability and MRTD columns to numeric. Values that cannot be parsed
# become NA (with a warning) and are then reported by the missing-value check.
for (col in c(probability_cols, mrtd_mg_col, mrtd_umol_col)) {
  if (!is.numeric(data[[col]])) {
    data[[col]] <- as.numeric(as.character(data[[col]]))
  }
}

# Check for missing values (vapply guarantees an integer vector).
missing_values <- vapply(data, function(x) sum(is.na(x)), integer(1))
missing_summary <- data.frame(
  Column = names(missing_values),
  MissingCount = missing_values,
  row.names = NULL
)
missing_summary <- missing_summary[missing_summary$MissingCount > 0, ]
if (nrow(missing_summary) > 0) {
  cat("\nColumns with missing values:\n")
  warning("Missing values detected. Consider imputation or removal.")
  # kable() must be the LAST expression of this branch: only the value of the
  # whole `if` is auto-printed, so a table followed by another call is dropped.
  kable(missing_summary, caption = "Missing Value Summary.", row.names = FALSE)
} else {
  cat("\nNo missing values found in the dataset.\n")
}
##
## No missing values found in the dataset.
cat("\nData types of columns:\n")
##
## Data types of columns:
# One row per column with its class. row.names = NULL stops the column names
# from being repeated as an extra (unlabelled) row-name column, and collapsing
# with "/" keeps the result a plain character vector even for multi-class
# columns.
kable(
  data.frame(
    Column = colnames(data),
    DataType = vapply(data, function(x) paste(class(x), collapse = "/"), character(1)),
    row.names = NULL
  ),
  caption = "Column Data Types."
)
| Column | DataType | |
|---|---|---|
| SMILES | SMILES | character |
| Anti_Aging_Prob | Anti_Aging_Prob | numeric |
| AIC_Prob1 | AIC_Prob1 | numeric |
| CS_Prob1 | CS_Prob1 | numeric |
| DNS_Prob1 | DNS_Prob1 | numeric |
| EA_Prob1 | EA_Prob1 | numeric |
| GI_Prob1 | GI_Prob1 | numeric |
| LP_Prob1 | LP_Prob1 | numeric |
| MD_Prob1 | MD_Prob1 | numeric |
| SCE_Prob1 | SCE_Prob1 | numeric |
| TA_Prob1 | TA_Prob1 | numeric |
| AMES_Prob1 | AMES_Prob1 | numeric |
| BBB_Prob1 | BBB_Prob1 | numeric |
| CYP1A2_Prob1 | CYP1A2_Prob1 | numeric |
| CYP2C19_Prob1 | CYP2C19_Prob1 | numeric |
| CYP2C9_Prob1 | CYP2C9_Prob1 | numeric |
| CYP2D6_Prob1 | CYP2D6_Prob1 | numeric |
| CYP3A4_Prob1 | CYP3A4_Prob1 | numeric |
| DILI_Prob1 | DILI_Prob1 | numeric |
| Hepato_Prob1 | Hepato_Prob1 | numeric |
| hERG_Prob1 | hERG_Prob1 | numeric |
| HLM_Prob1 | HLM_Prob1 | numeric |
| MMP_Prob1 | MMP_Prob1 | numeric |
| P.gp_Inhibitor_Prob1 | P.gp_Inhibitor_Prob1 | numeric |
| P.gp_Substrate_Prob1 | P.gp_Substrate_Prob1 | numeric |
| MRTD..mg.day. | MRTD..mg.day. | numeric |
| MRTD..uMol. | MRTD..uMol. | numeric |
The data has been loaded and initial preprocessing steps, including column name sanitization and data type conversion, have been performed. We’ve also checked for missing values.
Let’s look at the summary statistics for the key probability columns and MRTD values.
# Summary for Anti-Aging Probability
cat("\nSummary for Anti-Aging Probability (", anti_aging_col, "):\n")
##
## Summary for Anti-Aging Probability ( Anti_Aging_Prob ):
summary_anti_aging <- summary(data[[anti_aging_col]])
print(summary_anti_aging)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2072 0.8299 0.8691 0.8238 0.9131 0.9607
# Summary for Hallmarks of Aging Probabilities
cat("\nSummary for Hallmarks of Aging Probabilities:\n")
##
## Summary for Hallmarks of Aging Probabilities:
# Build a one-row-per-variable table of summary statistics.
# Converting summary(<data.frame>) with as.data.frame() produces a long
# Var1/Var2/Freq layout with the statistics embedded in strings, so the six
# statistics are computed numerically per column instead.
summary_hallmarks_df <- as.data.frame(t(vapply(
  data[, hallmarks_cols, drop = FALSE],
  function(x) {
    q <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE)
    c(q[1:3], mean(x, na.rm = TRUE), q[4:5])
  },
  numeric(6)
)))
colnames(summary_hallmarks_df) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
kable(summary_hallmarks_df, digits = 4, caption = "Summary Statistics for Hallmarks of Aging Probabilities.")
| Var1 | Var2 | Freq |
|---|---|---|
| AIC_Prob1 | Min. :0.0002661 | |
| CS_Prob1 | Min. :0.08278 | |
| DNS_Prob1 | Min. :0.0002683 | |
| EA_Prob1 | Min. :0.0009009 | |
| GI_Prob1 | Min. :0.000000 | |
| LP_Prob1 | Min. :0.1344 | |
| MD_Prob1 | Min. :0.0002777 | |
| SCE_Prob1 | Min. :0.0470 | |
| TA_Prob1 | Min. :0.2640 | |
| AIC_Prob1 | 1st Qu.:0.0172949 | |
| CS_Prob1 | 1st Qu.:0.17746 | |
| DNS_Prob1 | 1st Qu.:0.0028486 | |
| EA_Prob1 | 1st Qu.:0.0035450 | |
| GI_Prob1 | 1st Qu.:0.000000 | |
| LP_Prob1 | 1st Qu.:0.6324 | |
| MD_Prob1 | 1st Qu.:0.0032534 | |
| SCE_Prob1 | 1st Qu.:0.3944 | |
| TA_Prob1 | 1st Qu.:0.6915 | |
| AIC_Prob1 | Median :0.0912741 | |
| CS_Prob1 | Median :0.23274 | |
| DNS_Prob1 | Median :0.0067976 | |
| EA_Prob1 | Median :0.0044528 | |
| GI_Prob1 | Median :0.000000 | |
| LP_Prob1 | Median :0.7949 | |
| MD_Prob1 | Median :0.0191052 | |
| SCE_Prob1 | Median :0.4936 | |
| TA_Prob1 | Median :0.7820 | |
| AIC_Prob1 | Mean :0.2797469 | |
| CS_Prob1 | Mean :0.29327 | |
| DNS_Prob1 | Mean :0.1225560 | |
| EA_Prob1 | Mean :0.0721175 | |
| GI_Prob1 | Mean :0.002186 | |
| LP_Prob1 | Mean :0.7371 | |
| MD_Prob1 | Mean :0.1220043 | |
| SCE_Prob1 | Mean :0.4907 | |
| TA_Prob1 | Mean :0.7649 | |
| AIC_Prob1 | 3rd Qu.:0.5041392 | |
| CS_Prob1 | 3rd Qu.:0.38338 | |
| DNS_Prob1 | 3rd Qu.:0.0222361 | |
| EA_Prob1 | 3rd Qu.:0.0103979 | |
| GI_Prob1 | 3rd Qu.:0.000000 | |
| LP_Prob1 | 3rd Qu.:0.8857 | |
| MD_Prob1 | 3rd Qu.:0.1225927 | |
| SCE_Prob1 | 3rd Qu.:0.6025 | |
| TA_Prob1 | 3rd Qu.:0.8885 | |
| AIC_Prob1 | Max. :0.9962562 | |
| CS_Prob1 | Max. :0.76162 | |
| DNS_Prob1 | Max. :0.9971994 | |
| EA_Prob1 | Max. :1.0000000 | |
| GI_Prob1 | Max. :0.064815 | |
| LP_Prob1 | Max. :0.9855 | |
| MD_Prob1 | Max. :0.9722046 | |
| SCE_Prob1 | Max. :0.9358 | |
| TA_Prob1 | Max. :1.0000 |
# Summary for Toxicity and Metabolism Probabilities
cat("\nSummary for Toxicity and Metabolism Probabilities:\n")
##
## Summary for Toxicity and Metabolism Probabilities:
# Build a one-row-per-variable table of summary statistics.
# Converting summary(<data.frame>) with as.data.frame() produces a long
# Var1/Var2/Freq layout with the statistics embedded in strings, so the six
# statistics are computed numerically per column instead.
summary_tox_met_df <- as.data.frame(t(vapply(
  data[, toxicity_metabolism_cols, drop = FALSE],
  function(x) {
    q <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, names = FALSE)
    c(q[1:3], mean(x, na.rm = TRUE), q[4:5])
  },
  numeric(6)
)))
colnames(summary_tox_met_df) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
kable(summary_tox_met_df, digits = 4, caption = "Summary Statistics for Toxicity and Metabolism Probabilities.")
| Var1 | Var2 | Freq |
|---|---|---|
| AMES_Prob1 | Min. :6.050e-06 | |
| BBB_Prob1 | Min. :4.000e-08 | |
| CYP1A2_Prob1 | Min. :9.000e-08 | |
| CYP2C19_Prob1 | Min. :0.01416 | |
| CYP2C9_Prob1 | Min. :0.001778 | |
| CYP2D6_Prob1 | Min. :0.04077 | |
| CYP3A4_Prob1 | Min. :0.02139 | |
| DILI_Prob1 | Min. :0.0879 | |
| Hepato_Prob1 | Min. :0.03889 | |
| hERG_Prob1 | Min. :0.02836 | |
| HLM_Prob1 | Min. :0.003325 | |
| MMP_Prob1 | Min. :0.002182 | |
| P.gp_Inhibitor_Prob1 | Min. :0.00000 | |
| P.gp_Substrate_Prob1 | Min. :0.05803 | |
| AMES_Prob1 | 1st Qu.:2.942e-03 | |
| BBB_Prob1 | 1st Qu.:1.790e-06 | |
| CYP1A2_Prob1 | 1st Qu.:4.595e-05 | |
| CYP2C19_Prob1 | 1st Qu.:0.05406 | |
| CYP2C9_Prob1 | 1st Qu.:0.007856 | |
| CYP2D6_Prob1 | 1st Qu.:0.05197 | |
| CYP3A4_Prob1 | 1st Qu.:0.03908 | |
| DILI_Prob1 | 1st Qu.:0.2882 | |
| Hepato_Prob1 | 1st Qu.:0.12593 | |
| hERG_Prob1 | 1st Qu.:0.04394 | |
| HLM_Prob1 | 1st Qu.:0.048662 | |
| MMP_Prob1 | 1st Qu.:0.005662 | |
| P.gp_Inhibitor_Prob1 | 1st Qu.:0.00000 | |
| P.gp_Substrate_Prob1 | 1st Qu.:0.09943 | |
| AMES_Prob1 | Median :3.526e-02 | |
| BBB_Prob1 | Median :2.625e-05 | |
| CYP1A2_Prob1 | Median :4.387e-04 | |
| CYP2C19_Prob1 | Median :0.06117 | |
| CYP2C9_Prob1 | Median :0.012958 | |
| CYP2D6_Prob1 | Median :0.05788 | |
| CYP3A4_Prob1 | Median :0.04641 | |
| DILI_Prob1 | Median :0.4574 | |
| Hepato_Prob1 | Median :0.18109 | |
| hERG_Prob1 | Median :0.05959 | |
| HLM_Prob1 | Median :0.111367 | |
| MMP_Prob1 | Median :0.016341 | |
| P.gp_Inhibitor_Prob1 | Median :0.01442 | |
| P.gp_Substrate_Prob1 | Median :0.14518 | |
| AMES_Prob1 | Mean :2.344e-01 | |
| BBB_Prob1 | Mean :1.656e-01 | |
| CYP1A2_Prob1 | Mean :8.967e-02 | |
| CYP2C19_Prob1 | Mean :0.11475 | |
| CYP2C9_Prob1 | Mean :0.083196 | |
| CYP2D6_Prob1 | Mean :0.06372 | |
| CYP3A4_Prob1 | Mean :0.07823 | |
| DILI_Prob1 | Mean :0.4595 | |
| Hepato_Prob1 | Mean :0.19746 | |
| hERG_Prob1 | Mean :0.07897 | |
| HLM_Prob1 | Mean :0.198558 | |
| MMP_Prob1 | Mean :0.199316 | |
| P.gp_Inhibitor_Prob1 | Mean :0.16521 | |
| P.gp_Substrate_Prob1 | Mean :0.26951 | |
| AMES_Prob1 | 3rd Qu.:3.566e-01 | |
| BBB_Prob1 | 3rd Qu.:8.426e-03 | |
| CYP1A2_Prob1 | 3rd Qu.:7.214e-03 | |
| CYP2C19_Prob1 | 3rd Qu.:0.07844 | |
| CYP2C9_Prob1 | 3rd Qu.:0.096712 | |
| CYP2D6_Prob1 | 3rd Qu.:0.06174 | |
| CYP3A4_Prob1 | 3rd Qu.:0.05838 | |
| DILI_Prob1 | 3rd Qu.:0.6586 | |
| Hepato_Prob1 | 3rd Qu.:0.25466 | |
| hERG_Prob1 | 3rd Qu.:0.08283 | |
| HLM_Prob1 | 3rd Qu.:0.272523 | |
| MMP_Prob1 | 3rd Qu.:0.310225 | |
| P.gp_Inhibitor_Prob1 | 3rd Qu.:0.16587 | |
| P.gp_Substrate_Prob1 | 3rd Qu.:0.25494 | |
| AMES_Prob1 | Max. :9.987e-01 | |
| BBB_Prob1 | Max. :9.997e-01 | |
| CYP1A2_Prob1 | Max. :9.446e-01 | |
| CYP2C19_Prob1 | Max. :0.91341 | |
| CYP2C9_Prob1 | Max. :0.779949 | |
| CYP2D6_Prob1 | Max. :0.65029 | |
| CYP3A4_Prob1 | Max. :0.91251 | |
| DILI_Prob1 | Max. :0.8059 | |
| Hepato_Prob1 | Max. :0.91012 | |
| hERG_Prob1 | Max. :0.42637 | |
| HLM_Prob1 | Max. :0.873926 | |
| MMP_Prob1 | Max. :0.973326 | |
| P.gp_Inhibitor_Prob1 | Max. :1.00000 | |
| P.gp_Substrate_Prob1 | Max. :0.93609 |
# Summary for MRTD values
cat("\nSummary for MRTD (", mrtd_mg_col, "):\n")
##
## Summary for MRTD ( MRTD..mg.day. ):
summary_mrtd_mg <- summary(data[[mrtd_mg_col]])
print(summary_mrtd_mg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.334 -5.148 -4.422 -4.589 -3.842 -3.842
cat("\nSummary for MRTD (", mrtd_umol_col, "):\n")
##
## Summary for MRTD ( MRTD..uMol. ):
summary_mrtd_umol <- summary(data[[mrtd_umol_col]])
print(summary_mrtd_umol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4631 7.1163 37.8386 57.7677 143.7607 143.7607
The summary statistics provide an overview of the central tendency, spread, and range of the predicted probabilities and MRTD values.
Visualizing the distribution of probabilities can reveal patterns and outliers.
# Histogram of the Anti-Aging probability in bins of width 0.05.
# The column name is held in a string, so it is mapped through the .data
# pronoun; aes_string() is deprecated in current ggplot2.
ggplot(data, aes(x = .data[[anti_aging_col]])) +
  geom_histogram(binwidth = 0.05, fill = "steelblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Anti-Aging Probability",
       x = "Anti-Aging Probability",
       y = "Frequency") +
  custom_theme()
Distribution of Anti-Aging Probability.
# Single boxplot of the Anti-Aging probability.
# The column name is held in a string, so it is mapped through the .data
# pronoun; aes_string() is deprecated in current ggplot2.
ggplot(data, aes(y = .data[[anti_aging_col]])) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7, width = 0.3) +
  labs(title = "Boxplot of Anti-Aging Probability",
       y = "Anti-Aging Probability", x = "") +
  custom_theme() +
  # There is only one box, so the x axis carries no information.
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
Distribution of Anti-Aging Probability.
The histogram and boxplot show the distribution of the Anti-Aging probabilities.
# Reshape the hallmark probabilities to long format: one row per
# compound/hallmark pair, with the column name stored in `Hallmark`.
hallmark_subset <- select(data, all_of(hallmarks_cols))
data_hallmarks_long <- pivot_longer(
  hallmark_subset,
  cols = everything(),
  names_to = "Hallmark",
  values_to = "Probability"
)

# One histogram panel per hallmark; the y axis is free per panel so sparsely
# populated hallmarks remain readable.
hallmark_histograms <- ggplot(data_hallmarks_long, aes(x = Probability)) +
  geom_histogram(
    binwidth = 0.05,
    fill = "coral",
    color = "black",
    alpha = 0.7
  ) +
  facet_wrap(~ Hallmark, scales = "free_y", ncol = 3) +
  labs(
    title = "Distributions of Hallmarks of Aging Probabilities",
    x = "Probability",
    y = "Frequency"
  ) +
  custom_theme() +
  theme(strip.text = element_text(size = 7))
hallmark_histograms
Distributions of Hallmarks of Aging Probabilities.
# Side-by-side boxplots, one per hallmark. The fill colour only distinguishes
# the boxes, so the legend is suppressed.
hallmark_boxplots <- ggplot(
  data_hallmarks_long,
  aes(x = Hallmark, y = Probability, fill = Hallmark)
) +
  geom_boxplot(show.legend = FALSE, alpha = 0.7) +
  labs(
    title = "Boxplots of Hallmarks of Aging Probabilities",
    x = "Hallmark of Aging",
    y = "Probability"
  ) +
  custom_theme() +
  # Rotate the axis labels so the column names do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 7))
hallmark_boxplots
Distributions of Hallmarks of Aging Probabilities.
These plots display the distributions for each of the 9 “Hallmarks of Aging” probabilities.
# Reshape the toxicity/metabolism probabilities to long format: one row per
# compound/property pair, with the column name stored in `ToxMet_Property`.
tox_met_subset <- select(data, all_of(toxicity_metabolism_cols))
data_tox_met_long <- pivot_longer(
  tox_met_subset,
  cols = everything(),
  names_to = "ToxMet_Property",
  values_to = "Probability"
)

# One histogram panel per property, with an independent y axis per panel.
tox_met_histograms <- ggplot(data_tox_met_long, aes(x = Probability)) +
  geom_histogram(
    binwidth = 0.05,
    fill = "lightgreen",
    color = "black",
    alpha = 0.7
  ) +
  facet_wrap(~ ToxMet_Property, scales = "free_y", ncol = 3) +
  labs(
    title = "Distributions of Toxicity and Metabolism Probabilities",
    x = "Probability",
    y = "Frequency"
  ) +
  custom_theme() +
  theme(strip.text = element_text(size = 6))
tox_met_histograms
Distributions of Toxicity and Metabolism Probabilities.
# Side-by-side boxplots, one per toxicity/metabolism property. The fill colour
# only distinguishes the boxes, so the legend is suppressed.
tox_met_boxplots <- ggplot(
  data_tox_met_long,
  aes(x = ToxMet_Property, y = Probability, fill = ToxMet_Property)
) +
  geom_boxplot(show.legend = FALSE, alpha = 0.7) +
  labs(
    title = "Boxplots of Toxicity and Metabolism Probabilities",
    x = "Toxicity/Metabolism Property",
    y = "Probability"
  ) +
  custom_theme() +
  # Steep rotation and a small font keep the 14 property names legible.
  theme(axis.text.x = element_text(angle = 75, hjust = 1, size = 6))
tox_met_boxplots
Distributions of Toxicity and Metabolism Probabilities.
These visualizations show the distributions for various toxicity and metabolism-related probabilities.
# Pairwise correlations between all probability columns (Anti-Aging, hallmarks,
# toxicity/metabolism). "pairwise.complete.obs" uses, for each pair of columns,
# only the rows where both values are present.
probability_data <- data[, probability_cols]
cor_matrix <- cor(probability_data, use = "pairwise.complete.obs")
# Colour-coded plot of the upper triangle only (diagonal hidden), with the
# coefficients printed in each cell. order = "hclust" asks corrplot to reorder
# the variables by hierarchical clustering. mar leaves room for the title.
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black", tl.srt = 45, tl.cex = 0.5,
addCoef.col = "black", number.cex = 0.4,
col = brewer.pal(n = 8, name = "RdYlBu"),
diag = FALSE,
title = "Correlation Plot of All Probabilities", mar=c(0,0,1,0))
Correlation Matrix of All Predicted Probabilities.
The correlation matrix visualizes pairwise Pearson correlations.
# Correlations restricted to the Anti-Aging probability and the hallmark
# columns, again using pairwise-complete rows.
anti_aging_hallmarks_data <- data[, c(anti_aging_col, hallmarks_cols)]
cor_matrix_ah <- cor(anti_aging_hallmarks_data, use = "pairwise.complete.obs")
# Numeric display of the upper triangle; order = "original" keeps the column
# order of the data (Anti-Aging first, then the hallmarks).
corrplot(cor_matrix_ah, method = "number", type = "upper", order = "original",
tl.col = "black", tl.srt = 45, tl.cex = 0.7,
number.cex = 0.7,
title = "Correlations: Anti-Aging vs. Hallmarks", mar=c(0,0,1,0))
Correlations: Anti-Aging vs. Hallmarks.
This plot focuses on correlations between Anti-Aging probability and individual Hallmarks of Aging.
# Limit the heatmap to the first (at most) 30 compounds.
num_compounds_heatmap <- min(nrow(data), 30)
# Ensure probability_data is defined if not run globally
if (!exists("probability_data")) {
  probability_data <- data[, probability_cols]
}
# seq_len() rather than 1:n, so an empty dataset selects zero rows instead of
# rows c(1, 0).
heatmap_data_subset <- probability_data[seq_len(num_compounds_heatmap), , drop = FALSE]

# Label the rows with the SMILES strings when that column is available,
# otherwise keep the existing row names. pheatmap draws a plot (no LaTeX table
# code is generated), so the raw SMILES need no escaping here.
short_ids <- if ("SMILES" %in% colnames(data)) {
  data$SMILES[seq_len(num_compounds_heatmap)]
} else {
  rownames(heatmap_data_subset)
}
# Data-frame row names must be unique; the same SMILES can occur more than
# once, which would otherwise raise "duplicate 'row.names' are not allowed".
short_ids <- make.unique(as.character(short_ids))
if (length(short_ids) == nrow(heatmap_data_subset)) {
  rownames(heatmap_data_subset) <- short_ids
}

# Row clustering needs at least two compounds.
if (num_compounds_heatmap >= 2) {
  pheatmap(heatmap_data_subset,
           scale = "none",
           clustering_distance_rows = "euclidean",
           clustering_distance_cols = "euclidean",
           clustering_method = "ward.D2",
           # Never request more row clusters than the tree can be cut into.
           cutree_rows = min(3, num_compounds_heatmap - 1),
           cutree_cols = 2,
           fontsize_row = 5,
           fontsize_col = 6,
           angle_col = "45",
           main = paste("Heatmap for first", num_compounds_heatmap, "Compounds"),
           color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100))
} else {
  cat("Not enough compounds to draw a clustered heatmap.\n")
}
Heatmap of Probabilities.
The heatmap clusters compounds and probability models. Note: Displays a subset of compounds.
# K-means clustering on the Anti-Aging and hallmark probabilities, visualised
# on the first principal components.
clustering_data_kmeans <- data[, c(anti_aging_col, hallmarks_cols)]
clustering_data_kmeans <- na.omit(clustering_data_kmeans)
if (nrow(clustering_data_kmeans) < 2) {
  cat("Not enough data points for clustering after NA removal.\n")
} else {
  # Standardise so that every variable contributes equally to the distances.
  clustering_data_scaled <- scale(clustering_data_kmeans)
  cat("Optimal k determination plots omitted for brevity in PDF. Run interactively if needed.\n")
  # Use k = 3, capped by the number of DISTINCT rows: kmeans() stops with
  # "more cluster centers than distinct data points" otherwise, and the data
  # can contain duplicated compounds. At least 2 rows exist here, so k >= 1.
  k_optimal <- min(3, nrow(unique(clustering_data_scaled)))
  set.seed(123) # reproducible random starts
  kmeans_result <- kmeans(clustering_data_scaled, centers = k_optimal, nstart = 25)
  # na.omit() preserves row names, so they map the clustered rows back to `data`.
  data_for_pca <- data[rownames(clustering_data_kmeans), ]
  data_for_pca$Cluster_Hallmarks <- as.factor(kmeans_result$cluster)
  # The input is already standardised; center/scale. are kept for safety.
  pca_result <- prcomp(clustering_data_scaled, center = TRUE, scale. = TRUE)
  # Explicit print(): ggplot objects are not auto-printed inside a block.
  print(fviz_pca_ind(pca_result,
                     geom.ind = "point",
                     col.ind = data_for_pca$Cluster_Hallmarks,
                     palette = "jco",
                     addEllipses = TRUE, ellipse.type = "confidence",
                     legend.title = "Clusters",
                     title = "PCA of Compounds (Anti-Aging & Hallmarks)") + custom_theme())
  cat("\nSummary of compounds per cluster (based on Hallmarks):\n")
  # Last expression of the branch, so the table is rendered as the chunk value.
  kable(table(data_for_pca$Cluster_Hallmarks), col.names = c("Cluster", "Count"), caption = "Compound Counts per Cluster.")
}
## Optimal k determination plots omitted for brevity in PDF. Run interactively if needed.
K-Means Clustering of Compounds based on Hallmarks.
##
## Summary of compounds per cluster (based on Hallmarks):
| Cluster | Count |
|---|---|
| 1 | 23 |
| 2 | 27 |
| 3 | 70 |
K-Means clustering partitions compounds. PCA visualizes these clusters.
# Flag compounds whose predicted probability exceeds the threshold for any of
# the selected toxicity endpoints.
toxicity_threshold <- 0.7
specific_toxicity_cols_pattern <- "AMES|DILI|Hepato|hERG"
specific_toxicity_cols <- grep(specific_toxicity_cols_pattern, colnames(data), value = TRUE, ignore.case = TRUE)
# Keep only matches that belong to the toxicity/metabolism block of columns.
specific_toxicity_cols <- intersect(specific_toxicity_cols, toxicity_metabolism_cols)
if (length(specific_toxicity_cols) > 0) {
  high_toxicity_matrix <- data[, specific_toxicity_cols, drop = FALSE] > toxicity_threshold
  # Number of endpoints above the threshold per compound (NA counts as not flagged).
  data$Num_High_Toxicity_Flags <- rowSums(high_toxicity_matrix, na.rm = TRUE)
  cat("\nDistribution of Number of High Toxicity Flags (Threshold > ", toxicity_threshold, "):\n")
  print(table(data$Num_High_Toxicity_Flags))
  # Restore SMILES from the original data only when it is missing AND
  # `data_orig` exists; touching `data_orig` unconditionally fails when the
  # placeholder data is in use.
  if (!("SMILES" %in% colnames(data)) && exists("data_orig")) {
    smiles_col_name <- names(data_orig)[1] # Assuming first col is SMILES in original
    data$SMILES <- data_orig[[smiles_col_name]]
  }
  # Select only columns that are actually present.
  high_toxicity_cols_to_select <- intersect(
    c("SMILES", anti_aging_col, specific_toxicity_cols, "Num_High_Toxicity_Flags"),
    colnames(data)
  )
  compounds_with_high_toxicity <- data[data$Num_High_Toxicity_Flags > 0, high_toxicity_cols_to_select, drop = FALSE]
  cat("\nCompounds with at least one high toxicity probability (Threshold > ", toxicity_threshold, "):\n")
  if (nrow(compounds_with_high_toxicity) > 0) {
    if ("SMILES" %in% colnames(compounds_with_high_toxicity)) {
      compounds_with_high_toxicity$SMILES <- sanitize_smiles_for_latex(compounds_with_high_toxicity$SMILES)
    }
    # row.names = FALSE: after filtering, the row names are non-sequential and
    # kable would print them as an extra unlabelled column, shifting the headers.
    kable(compounds_with_high_toxicity,
          caption = paste("Compounds with High Toxicity Flags (Prob >", toxicity_threshold, ")."),
          escape = FALSE, row.names = FALSE)
  } else {
    cat("No compounds found with high toxicity flags above the threshold.\n")
  }
} else {
  cat("No specific toxicity columns found matching AMES, DILI, Hepato, hERG patterns or they are not in dataframe. Skipping this section.\n")
}
##
## Distribution of Number of High Toxicity Flags (Threshold > 0.7 ):
##
## 0 1 2
## 82 32 6
##
## Compounds with at least one high toxicity probability (Threshold > 0.7 ):
| SMILES | Anti_Aging_Prob | AMES_Prob1 | DILI_Prob1 | Hepato_Prob1 | hERG_Prob1 | Num_High_Toxicity_Flags | |
|---|---|---|---|---|---|---|---|
| 1 | 0.4153507 | 0.3541652 | 0.8058858 | 0.1062473 | 0.0709472 | 1 | |
| 3 | 0.8675090 | 0.8564523 | 0.5432760 | 0.1559419 | 0.0519310 | 1 | |
| 7 | 0.8134314 | 0.9755830 | 0.6875242 | 0.0980618 | 0.0316115 | 1 | |
| 13 | 0.8845954 | 0.0561883 | 0.7032266 | 0.0877149 | 0.0521452 | 1 | |
| 14 | 0.9026542 | 0.9954975 | 0.4241290 | 0.0953081 | 0.0827177 | 1 | |
| 16 | 0.8674786 | 0.1039045 | 0.7423505 | 0.9101161 | 0.2024326 | 2 | |
| 18 | 0.9066839 | 0.8257204 | 0.2085714 | 0.2077957 | 0.0696443 | 1 | |
| 29 | 0.8690741 | 0.8275343 | 0.6846678 | 0.1842713 | 0.0573852 | 1 | |
| 31 | 0.8772374 | 0.1013908 | 0.7551078 | 0.0991374 | 0.0406885 | 1 | |
| 33 | 0.3721187 | 0.9938579 | 0.2888746 | 0.1843909 | 0.0331029 | 1 | |
| 36 | 0.7298995 | 0.0501221 | 0.7141211 | 0.1311200 | 0.0607621 | 1 | |
| 37 | 0.8675245 | 0.0273033 | 0.8018489 | 0.0511540 | 0.0484780 | 1 | |
| 39 | 0.8422391 | 0.9978563 | 0.7096740 | 0.1136693 | 0.0393497 | 2 | |
| 40 | 0.9339135 | 0.9514049 | 0.7388092 | 0.0388904 | 0.0614048 | 2 | |
| 43 | 0.9068187 | 0.0030837 | 0.7131538 | 0.1783630 | 0.0596383 | 1 | |
| 47 | 0.9213936 | 0.9496986 | 0.2405188 | 0.1720684 | 0.1071536 | 1 | |
| 53 | 0.9522983 | 0.7597901 | 0.6579833 | 0.0877450 | 0.0521926 | 1 | |
| 54 | 0.8674711 | 0.8817641 | 0.1550104 | 0.2075963 | 0.0556442 | 1 | |
| 56 | 0.7576527 | 0.8118979 | 0.1186264 | 0.2327908 | 0.0904809 | 1 | |
| 57 | 0.8675203 | 0.8689586 | 0.1803063 | 0.2515460 | 0.0579422 | 1 | |
| 61 | 0.9028072 | 0.7229171 | 0.1823889 | 0.2739984 | 0.0467560 | 1 | |
| 68 | 0.9147427 | 0.9972081 | 0.1696082 | 0.2545630 | 0.1695946 | 1 | |
| 69 | 0.9147427 | 0.9972081 | 0.1696082 | 0.2545630 | 0.1695946 | 1 | |
| 77 | 0.6541684 | 0.7139102 | 0.6066611 | 0.2645012 | 0.0391153 | 1 | |
| 79 | 0.8674408 | 0.8867352 | 0.3191527 | 0.2622778 | 0.0508184 | 1 | |
| 81 | 0.8309126 | 0.0123644 | 0.7163282 | 0.1081725 | 0.0308414 | 1 | |
| 84 | 0.7383333 | 0.0075317 | 0.7585896 | 0.2179448 | 0.1036391 | 1 | |
| 86 | 0.9113007 | 0.8626630 | 0.7229768 | 0.1576709 | 0.0401219 | 2 | |
| 88 | 0.9358224 | 0.9195161 | 0.6822173 | 0.0468319 | 0.0584787 | 1 | |
| 89 | 0.8745806 | 0.2100843 | 0.7459675 | 0.7528580 | 0.1423502 | 2 | |
| 90 | 0.8675542 | 0.0055737 | 0.7233558 | 0.0877022 | 0.0452627 | 1 | |
| 93 | 0.8819882 | 0.0104895 | 0.8025137 | 0.2549616 | 0.3522341 | 1 | |
| 94 | 0.8885711 | 0.7305439 | 0.7487818 | 0.0703219 | 0.0451110 | 2 | |
| 99 | 0.9362787 | 0.9986765 | 0.6636929 | 0.3792285 | 0.0351686 | 1 | |
| 102 | 0.8087053 | 0.0028934 | 0.7678359 | 0.1265243 | 0.1031895 | 1 | |
| 103 | 0.8675542 | 0.0055737 | 0.7233558 | 0.0877022 | 0.0452627 | 1 | |
| 117 | 0.9141520 | 0.9216489 | 0.2223560 | 0.2563339 | 0.0521820 | 1 | |
| 119 | 0.8708766 | 0.0697755 | 0.7433143 | 0.1443709 | 0.0373102 | 1 |
This section identifies compounds with high predicted toxicity.
# Profile the CYP interaction probabilities and list compounds above threshold.
cyp_cols_pattern <- "CYP"
cyp_cols <- grep(cyp_cols_pattern, colnames(data), value = TRUE, ignore.case = TRUE)
# Keep only matches that belong to the toxicity/metabolism block of columns.
cyp_cols <- intersect(cyp_cols, toxicity_metabolism_cols)
if (length(cyp_cols) > 0) {
  # Long format: one row per compound/CYP-model pair.
  data_cyp_long <- data %>%
    select(all_of(cyp_cols)) %>%
    pivot_longer(cols = everything(), names_to = "CYP_Enzyme", values_to = "Probability")
  # Explicit print(): ggplot objects are not auto-printed inside a block.
  print(ggplot(data_cyp_long, aes(x = CYP_Enzyme, y = Probability, fill = CYP_Enzyme)) +
    geom_boxplot(show.legend = FALSE, alpha = 0.7) +
    labs(title = "CYP Interaction Probabilities",
         x = "CYP Model",
         y = "Probability") +
    custom_theme() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 7)))
  cyp_threshold <- 0.7
  high_cyp_matrix <- data[, cyp_cols, drop = FALSE] > cyp_threshold
  # Number of CYP models above the threshold per compound (NA counts as not flagged).
  data$Num_High_CYP_Flags <- rowSums(high_cyp_matrix, na.rm = TRUE)
  # Restore SMILES from the original data only if it is missing and available.
  if (!("SMILES" %in% colnames(data)) && exists("data_orig") &&
      ("SMILES" %in% colnames(data_orig))) {
    data$SMILES <- data_orig$SMILES
  }
  # Select only columns that are actually present.
  compounds_high_cyp_cols_to_select <- intersect(
    c("SMILES", anti_aging_col, cyp_cols, "Num_High_CYP_Flags"),
    colnames(data)
  )
  compounds_high_cyp <- data[data$Num_High_CYP_Flags > 0, compounds_high_cyp_cols_to_select, drop = FALSE]
  cat("\nCompounds with high probability (>", cyp_threshold, ") for interacting with one or more CYP enzymes:\n")
  if (nrow(compounds_high_cyp) > 0) {
    if ("SMILES" %in% colnames(compounds_high_cyp)) {
      compounds_high_cyp$SMILES <- sanitize_smiles_for_latex(compounds_high_cyp$SMILES)
    }
    # row.names = FALSE: after filtering, the row names are non-sequential and
    # kable would print them as an extra unlabelled column, shifting the headers.
    kable(compounds_high_cyp, caption = "Compounds with High CYP Interaction Flags.",
          escape = FALSE, row.names = FALSE)
  } else {
    cat("No compounds found with high CYP interaction flags above the threshold.\n")
  }
} else {
  cat("No CYP-related columns found or they are not in dataframe. Skipping CYP profile.\n")
}
Profile of CYP Interaction Probabilities.
##
## Compounds with high probability (> 0.7 ) for interacting with one or more CYP enzymes:
| SMILES | Anti_Aging_Prob | CYP1A2_Prob1 | CYP2C19_Prob1 | CYP2C9_Prob1 | CYP2D6_Prob1 | CYP3A4_Prob1 | Num_High_CYP_Flags | |
|---|---|---|---|---|---|---|---|---|
| 8 | 0.9203264 | 0.9325538 | 0.9134086 | 0.3057511 | 0.0593774 | 0.8780224 | 3 | |
| 34 | 0.9120270 | 0.2669952 | 0.8276216 | 0.7600514 | 0.0616459 | 0.0584414 | 2 | |
| 36 | 0.7298995 | 0.6925360 | 0.8886024 | 0.4093160 | 0.0619508 | 0.0465492 | 1 | |
| 40 | 0.9339135 | 0.7148754 | 0.0593912 | 0.2246064 | 0.0620323 | 0.0383697 | 1 | |
| 43 | 0.9068187 | 0.3091101 | 0.8622365 | 0.2279927 | 0.0616864 | 0.0584747 | 1 | |
| 45 | 0.6510599 | 0.2099211 | 0.5525266 | 0.7440415 | 0.0641048 | 0.3993978 | 1 | |
| 53 | 0.9522983 | 0.9445683 | 0.6748180 | 0.3142129 | 0.0874351 | 0.0583887 | 1 | |
| 70 | 0.9182808 | 0.9216798 | 0.1105934 | 0.2611086 | 0.0575166 | 0.0584665 | 1 | |
| 85 | 0.9465679 | 0.3998069 | 0.7839862 | 0.5709382 | 0.0745055 | 0.2090866 | 1 | |
| 88 | 0.9358224 | 0.8233892 | 0.0782701 | 0.7799495 | 0.0568602 | 0.0843939 | 2 | |
| 90 | 0.8675542 | 0.9144105 | 0.1819703 | 0.2400610 | 0.0463176 | 0.9125103 | 2 | |
| 103 | 0.8675542 | 0.9144105 | 0.1819703 | 0.2400610 | 0.0463176 | 0.9125103 | 2 |
Boxplots show CYP interaction probabilities.
# Scatter plot of the anti-aging probability against MRTD (uMol) with a linear
# trend line. Drawn only when both columns exist in `data` and are numeric;
# otherwise a skip message is written.
if (mrtd_umol_col %in% colnames(data) && is.numeric(data[[mrtd_umol_col]]) &&
    anti_aging_col %in% colnames(data) && is.numeric(data[[anti_aging_col]])) {
  # The column names are held in strings, so they are mapped through the
  # `.data` pronoun instead of the deprecated aes_string().
  anti_aging_mrtd_plot <- ggplot(
    data,
    aes(x = .data[[anti_aging_col]], y = .data[[mrtd_umol_col]])
  ) +
    geom_point(alpha = 0.6, color = "purple") +
    # `formula = y ~ x` is the default for method = "lm"; stating it keeps the
    # fit unchanged and avoids the informational message.
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "darkred") +
    labs(
      title = "Anti-Aging Probability vs. MRTD (uMol)",
      x = "Anti-Aging Probability",
      # Literal label: building it from a column name such as "MRTD..uMol."
      # with sub()/paste() renders as "MRTD ( MRTD .uMol )".
      y = "MRTD (uMol)"
    ) +
    custom_theme()
  print(anti_aging_mrtd_plot) # Explicitly print
} else {
  cat("MRTD (uMol) or Anti-Aging column not found or not numeric. Skipping Anti-Aging vs MRTD plot.\n")
}
Relationship between Anti-Aging Probability and MRTD (uMol).
# Scatter plot of the AMES probability against MRTD (uMol) with a linear trend
# line. The AMES column is looked up by pattern among
# `toxicity_metabolism_cols`; the plot is skipped when no usable numeric
# column is found.
ames_col_actual_grep <- grep("^AMES.Prob", toxicity_metabolism_cols, value = TRUE, ignore.case = TRUE)[1]
# grep(...)[1] is NA when nothing matches; also require the match to be a
# real column of `data`.
ames_col_actual <- if (!is.na(ames_col_actual_grep) && ames_col_actual_grep %in% colnames(data)) {
  ames_col_actual_grep
} else {
  NA
}
if (!is.na(ames_col_actual) && is.numeric(data[[ames_col_actual]]) &&
    mrtd_umol_col %in% colnames(data) && is.numeric(data[[mrtd_umol_col]])) {
  # Display name for the AMES column: drop a trailing dot, then turn the first
  # remaining dot into a space. Computed once and reused for title and x label.
  ames_label <- sub("\\.", " ", sub("\\.$", "", ames_col_actual))
  # Column names are strings, so map them through the `.data` pronoun instead
  # of the deprecated aes_string().
  ames_mrtd_plot <- ggplot(
    data,
    aes(x = .data[[ames_col_actual]], y = .data[[mrtd_umol_col]])
  ) +
    geom_point(alpha = 0.6, color = "orangered") +
    # `formula = y ~ x` is the default for method = "lm"; stating it keeps the
    # fit unchanged and avoids the informational message.
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
    labs(
      title = paste(ames_label, "vs. MRTD (uMol)"),
      x = paste(ames_label, "Probability"),
      # Literal label: building it from a column name such as "MRTD..uMol."
      # with sub()/paste() renders as "MRTD ( MRTD .uMol )".
      y = "MRTD (uMol)"
    ) +
    custom_theme()
  print(ames_mrtd_plot) # Explicitly print
} else {
  cat("\nAMES probability or MRTD (uMol) column not found/numeric, skipping specific toxicity vs MRTD plot.\n")
}
Relationship between AMES Probability and MRTD (uMol).
# Pearson correlation (cor() default) of every numeric probability column with
# the MRTD (uMol) column, shown as a table sorted by absolute correlation,
# strongest first. Skipped when the MRTD column is missing or not numeric.
if (mrtd_umol_col %in% colnames(data) && is.numeric(data[[mrtd_umol_col]])) {
  # vapply() always returns a logical vector, even when `probability_cols` is
  # empty; sapply() would return list() there and make the subset below fail.
  is_valid_prob_col <- vapply(
    probability_cols,
    function(p_col) p_col %in% colnames(data) && is.numeric(data[[p_col]]),
    logical(1)
  )
  valid_prob_cols <- probability_cols[is_valid_prob_col]
  if (length(valid_prob_cols) > 0) {
    # drop = FALSE keeps a data frame for a single column, so cor() returns a
    # one-column matrix whose row names are the probability column names.
    cor_with_mrtd <- cor(
      data[, valid_prob_cols, drop = FALSE],
      data[[mrtd_umol_col]],
      use = "pairwise.complete.obs"
    )
    cor_with_mrtd_df <- data.frame(
      Probability_Metric = rownames(cor_with_mrtd),
      Correlation_with_MRTD_uMol = cor_with_mrtd[, 1]
    )
    cor_with_mrtd_df <- cor_with_mrtd_df[order(-abs(cor_with_mrtd_df$Correlation_with_MRTD_uMol)), ]
    cat("\nCorrelation of Probabilities with MRTD (", mrtd_umol_col, "):\n")
    # Last expression of the branch, so the table is the visible chunk result.
    kable(cor_with_mrtd_df, caption = "Correlation of Probabilities with MRTD (uMol).", row.names = FALSE)
  } else {
    cat("No valid numeric probability columns found for MRTD correlation.\n")
  }
} else {
  cat("MRTD (uMol) column not found or not numeric. Skipping correlation with MRTD.\n")
}
##
## Correlation of Probabilities with MRTD ( MRTD..uMol. ):
| Probability_Metric | Correlation_with_MRTD_uMol |
|---|---|
| MMP_Prob1 | -0.4407394 |
| P.gp_Substrate_Prob1 | -0.4321212 |
| TA_Prob1 | 0.3954930 |
| AIC_Prob1 | -0.3855687 |
| P.gp_Inhibitor_Prob1 | -0.3811990 |
| CYP2C9_Prob1 | -0.3526189 |
| BBB_Prob1 | -0.3419256 |
| SCE_Prob1 | 0.3256494 |
| HLM_Prob1 | -0.3130249 |
| DNS_Prob1 | -0.3129866 |
| hERG_Prob1 | -0.2753722 |
| CS_Prob1 | -0.2613587 |
| CYP1A2_Prob1 | -0.2541875 |
| CYP2C19_Prob1 | -0.2193765 |
| CYP3A4_Prob1 | -0.2084049 |
| DILI_Prob1 | -0.2049589 |
| MD_Prob1 | -0.1904055 |
| EA_Prob1 | -0.1886101 |
| GI_Prob1 | -0.1078672 |
| Anti_Aging_Prob | 0.0900432 |
| CYP2D6_Prob1 | -0.0869530 |
| AMES_Prob1 | 0.0694103 |
| LP_Prob1 | 0.0537533 |
| Hepato_Prob1 | 0.0408371 |
Scatter plots and correlations explore MRTD relationships.
# Preliminary screen for promising compounds: keep rows with a high anti-aging
# probability, low AMES / hERG / DILI probabilities and an MRTD (uMol) value at
# or above a floor, then list them by descending anti-aging probability.

# Selection thresholds.
min_anti_aging_prob <- 0.7
max_ames_prob <- 0.3
max_herg_prob <- 0.3
max_dili_prob <- 0.5
# NOTE(review): the MRTD (uMol) values in the rendered results table are all
# positive, so a floor of -4.5 excludes nothing there -- confirm that this
# threshold is on the intended scale.
min_mrtd_umol <- -4.5

# Resolve the columns used by the filter; grep(...)[1] is NA when no column
# matches the pattern.
anti_aging_col_actual_filter <- anti_aging_col
ames_col_actual_filter <- grep("^AMES.Prob", colnames(data), value = TRUE, ignore.case = TRUE)[1]
herg_col_actual_filter <- grep("^hERG.Prob", colnames(data), value = TRUE, ignore.case = TRUE)[1]
dili_col_actual_filter <- grep("^DILI.Prob", colnames(data), value = TRUE, ignore.case = TRUE)[1]
mrtd_umol_col_actual_filter <- mrtd_umol_col

required_cols_for_filtering <- c(
  anti_aging_col_actual_filter,
  ames_col_actual_filter,
  herg_col_actual_filter,
  dili_col_actual_filter,
  mrtd_umol_col_actual_filter
)
# Keep only the names that are not NA and exist in `data`. vapply() is used so
# the result is a logical vector even when nothing is left to check.
required_cols_for_filtering_valid <- vapply(
  required_cols_for_filtering,
  function(x) !is.na(x) && x %in% colnames(data),
  logical(1)
)
required_cols_for_filtering <- required_cols_for_filtering[required_cols_for_filtering_valid]
# Further check that the surviving columns are numeric.
cols_exist_and_numeric <- vapply(
  required_cols_for_filtering,
  function(cn) is.numeric(data[[cn]]),
  logical(1)
)

if (all(cols_exist_and_numeric) && length(required_cols_for_filtering) == 5) { # Ensure all 5 are valid
  # Restore SMILES from the originally loaded data when it is missing;
  # `data_orig` only exists when the CSV file was actually read.
  if (!("SMILES" %in% colnames(data)) && exists("data_orig") &&
      ("SMILES" %in% colnames(data_orig))) {
    data$SMILES <- data_orig$SMILES # Or the appropriate original SMILES column
  }
  promising_candidates <- data %>%
    filter(
      .data[[anti_aging_col_actual_filter]] >= min_anti_aging_prob &
        .data[[ames_col_actual_filter]] <= max_ames_prob &
        .data[[herg_col_actual_filter]] <= max_herg_prob &
        .data[[dili_col_actual_filter]] <= max_dili_prob &
        .data[[mrtd_umol_col_actual_filter]] >= min_mrtd_umol
    ) %>%
    select(any_of(c(
      "SMILES",
      anti_aging_col_actual_filter,
      # head() returns at most three names and an empty vector when
      # `hallmarks_cols` is empty (1:0 indexing would inject an NA name).
      head(hallmarks_cols, 3),
      ames_col_actual_filter, herg_col_actual_filter, dili_col_actual_filter,
      mrtd_umol_col_actual_filter
    ))) %>%
    arrange(desc(.data[[anti_aging_col_actual_filter]]))
  cat("\nPotential Promising Compounds based on Defined Criteria:\n")
  if (nrow(promising_candidates) > 0) {
    if ("SMILES" %in% colnames(promising_candidates)) {
      promising_candidates$SMILES <- sanitize_smiles_for_latex(promising_candidates$SMILES)
    }
    # Last expression of the branch, so the table is the visible chunk result.
    kable(promising_candidates, caption = "Promising Compounds Meeting Selection Criteria.", escape = FALSE)
  } else {
    cat("No compounds met all the specified criteria for promising candidates.\nConsider adjusting thresholds or criteria.\n")
  }
} else {
  cat("\nSkipping identification of promising compounds due to missing, non-numeric, or invalid required columns:\n")
  cat("Required columns for filtering were not all valid and numeric.\n")
  # Print one diagnostic line per required column: its resolved name and
  # whether it is present in `data` and numeric.
  report_filter_col <- function(label, col_name) {
    cat(
      label, col_name, " (Valid & Numeric):",
      !is.na(col_name) && col_name %in% colnames(data) && is.numeric(data[[col_name]]),
      "\n"
    )
  }
  report_filter_col("Anti-Aging Col:", anti_aging_col_actual_filter)
  report_filter_col("AMES Col:", ames_col_actual_filter)
  report_filter_col("hERG Col:", herg_col_actual_filter)
  report_filter_col("DILI Col:", dili_col_actual_filter)
  report_filter_col("MRTD uMol Col:", mrtd_umol_col_actual_filter)
}
##
## Potential Promising Compounds based on Defined Criteria:
| SMILES | Anti_Aging_Prob | AIC_Prob1 | CS_Prob1 | DNS_Prob1 | AMES_Prob1 | hERG_Prob1 | DILI_Prob1 | MRTD..uMol. |
|---|---|---|---|---|---|---|---|---|
| 0.9606567 | 0.0006490 | 0.2034362 | 0.0039286 | 0.1204800 | 0.0594412 | 0.1322649 | 37.8386447 | |
| 0.9495786 | 0.9859577 | 0.7616227 | 0.0114824 | 0.1426770 | 0.1134946 | 0.4244976 | 7.1163400 | |
| 0.9446054 | 0.0164659 | 0.5620683 | 0.0398025 | 0.0335747 | 0.0584063 | 0.1793079 | 0.4631483 | |
| 0.9406279 | 0.9928249 | 0.7415772 | 0.0252990 | 0.0118021 | 0.1402008 | 0.4668214 | 7.1163400 | |
| 0.9405055 | 0.0240730 | 0.5064515 | 0.0123505 | 0.0106469 | 0.0689665 | 0.1417550 | 0.4631483 | |
| 0.9404736 | 0.0311768 | 0.2085971 | 0.0064316 | 0.0369214 | 0.0357686 | 0.2666871 | 143.7607268 | |
| 0.9382746 | 0.0782590 | 0.1542431 | 0.0049789 | 0.0273427 | 0.0399441 | 0.2351469 | 143.7607268 | |
| 0.9301339 | 0.1079332 | 0.1822700 | 0.0076125 | 0.0044863 | 0.0520316 | 0.4187248 | 37.8386447 | |
| 0.9295920 | 0.1179067 | 0.2875334 | 0.9805530 | 0.0674937 | 0.0384176 | 0.2963561 | 14.6569753 | |
| 0.9295920 | 0.1179067 | 0.2875334 | 0.9805530 | 0.0674938 | 0.0384176 | 0.2963561 | 14.6569753 | |
| 0.9235864 | 0.0303878 | 0.4370652 | 0.7006165 | 0.1460527 | 0.0570755 | 0.3112628 | 14.6569753 | |
| 0.9212959 | 0.2429528 | 0.5325694 | 0.0355690 | 0.0058756 | 0.0778167 | 0.2893628 | 143.7607268 | |
| 0.9203276 | 0.0339941 | 0.5515035 | 0.4313030 | 0.0562884 | 0.0554071 | 0.4157930 | 14.6569753 | |
| 0.9173410 | 0.0013687 | 0.2106018 | 0.0049234 | 0.0007173 | 0.0366860 | 0.1743596 | 14.6569753 | |
| 0.9160256 | 0.9962563 | 0.3491672 | 0.0016692 | 0.0862971 | 0.0799461 | 0.4669442 | 7.1163400 | |
| 0.9127392 | 0.0557633 | 0.1630866 | 0.0035775 | 0.1254400 | 0.0504829 | 0.2995601 | 143.7607268 | |
| 0.9110941 | 0.0172950 | 0.1774829 | 0.0141625 | 0.0233437 | 0.0717458 | 0.2086416 | 143.7607268 | |
| 0.9110941 | 0.0172950 | 0.1774829 | 0.0141625 | 0.0233437 | 0.0717458 | 0.2086416 | 143.7607268 | |
| 0.9098582 | 0.3543780 | 0.2370539 | 0.0964858 | 0.0062034 | 0.0629043 | 0.2629216 | 0.4631483 | |
| 0.9069069 | 0.8692317 | 0.2914393 | 0.0146634 | 0.0070438 | 0.0707657 | 0.4678919 | 37.8386447 | |
| 0.9065206 | 0.0341412 | 0.1985217 | 0.0062654 | 0.0796881 | 0.0420328 | 0.2085687 | 143.7607268 | |
| 0.9041121 | 0.0044656 | 0.4256449 | 0.9521700 | 0.0829155 | 0.0301958 | 0.1725316 | 14.6569753 | |
| 0.8909587 | 0.1037825 | 0.1533232 | 0.0157941 | 0.0286923 | 0.0534852 | 0.2939204 | 143.7607268 | |
| 0.8860976 | 0.1298841 | 0.4931064 | 0.0151950 | 0.0193143 | 0.0576898 | 0.4279263 | 14.6569753 | |
| 0.8820072 | 0.9254341 | 0.4331401 | 0.0018980 | 0.0368616 | 0.1089475 | 0.3992252 | 7.1163400 | |
| 0.8740544 | 0.0250310 | 0.4150418 | 0.0117098 | 0.0002724 | 0.0731115 | 0.4904194 | 143.7607268 | |
| 0.8740108 | 0.0170628 | 0.1947050 | 0.0015556 | 0.0000439 | 0.0438192 | 0.3600591 | 143.7607268 | |
| 0.8690624 | 0.0218581 | 0.3847809 | 0.0123923 | 0.0004378 | 0.0759701 | 0.4824726 | 143.7607268 | |
| 0.8675602 | 0.0159322 | 0.1808688 | 0.0016309 | 0.0336665 | 0.0377437 | 0.1922034 | 143.7607268 | |
| 0.8675582 | 0.0104090 | 0.1560525 | 0.0065604 | 0.1791295 | 0.0435770 | 0.3559380 | 37.8386447 | |
| 0.8675356 | 0.0469760 | 0.2333721 | 0.0033365 | 0.0022990 | 0.0595369 | 0.3350227 | 143.7607268 | |
| 0.8675213 | 0.0149039 | 0.1791014 | 0.0013412 | 0.0001235 | 0.0350544 | 0.3186908 | 143.7607268 | |
| 0.8675030 | 0.1906531 | 0.7086589 | 0.0888641 | 0.0106776 | 0.0753400 | 0.4017709 | 143.7607268 | |
| 0.8674796 | 0.2518352 | 0.2211419 | 0.0079309 | 0.0029587 | 0.0853267 | 0.1555119 | 143.7607268 | |
| 0.8674669 | 0.0673885 | 0.1724098 | 0.0018398 | 0.2561207 | 0.0593936 | 0.2268561 | 143.7607268 | |
| 0.8674634 | 0.0094531 | 0.1458147 | 0.0047085 | 0.0126105 | 0.0336180 | 0.2520279 | 14.6569753 | |
| 0.8674296 | 0.0793617 | 0.2277593 | 0.0047512 | 0.0008885 | 0.0422137 | 0.0878976 | 3.3764935 | |
| 0.8634661 | 0.0707941 | 0.1232061 | 0.0195375 | 0.0024867 | 0.0559752 | 0.3830829 | 143.7607268 | |
| 0.8601621 | 0.0015100 | 0.1796195 | 0.0020561 | 0.0038247 | 0.0409310 | 0.2049868 | 143.7607268 | |
| 0.8269706 | 0.0164234 | 0.1609001 | 0.0076340 | 0.0001885 | 0.0386295 | 0.1531995 | 14.6569753 | |
| 0.7944862 | 0.0966277 | 0.5920661 | 0.0138010 | 0.0009227 | 0.2311515 | 0.2995382 | 7.1163400 | |
| 0.7857426 | 0.0997597 | 0.6192587 | 0.0147344 | 0.0001977 | 0.2195423 | 0.2521518 | 7.1163400 | |
| 0.7592205 | 0.1638351 | 0.1449177 | 0.0044882 | 0.0040976 | 0.0646026 | 0.4647681 | 37.8386447 | |
| 0.7040097 | 0.0027814 | 0.1907928 | 0.0054280 | 0.0139302 | 0.0643139 | 0.3060135 | 143.7607268 |
This section provides a preliminary filter for promising compounds.
This report provided an initial exploration of the AgeXtend output data.
Key Observations (Example - to be filled based on actual data):
Further Steps could include:
# Print the R version, platform, locale and attached/loaded package versions
# for this run.
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_IN.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_IN.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Asia/Kolkata
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tinytex_0.57 factoextra_1.0.7 cluster_2.1.8.1 reshape2_1.4.4
## [5] DT_0.33 knitr_1.50 corrplot_0.95 RColorBrewer_1.1-3
## [9] pheatmap_1.0.12 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [13] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1
## [17] tibble_3.2.1 ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.52 bslib_0.9.0 htmlwidgets_1.6.4
## [5] ggrepel_0.9.6 rstatix_0.7.2 lattice_0.22-5 tzdb_0.5.0
## [9] vctrs_0.6.5 tools_4.5.0 generics_0.1.4 pkgconfig_2.0.3
## [13] Matrix_1.7-3 lifecycle_1.0.4 compiler_4.5.0 farver_2.1.2
## [17] ggsci_3.2.0 carData_3.0-5 htmltools_0.5.8.1 sass_0.4.10
## [21] yaml_2.3.10 Formula_1.2-5 pillar_1.10.2 car_3.1-3
## [25] ggpubr_0.6.0 jquerylib_0.1.4 cachem_1.1.0 abind_1.4-8
## [29] nlme_3.1-168 tidyselect_1.2.1 digest_0.6.37 stringi_1.8.7
## [33] labeling_0.4.3 splines_4.5.0 fastmap_1.2.0 grid_4.5.0
## [37] cli_3.6.5 magrittr_2.0.3 broom_1.0.8 withr_3.0.2
## [41] scales_1.4.0 backports_1.5.0 timechange_0.3.0 rmarkdown_2.29
## [45] ggsignif_0.6.4 hms_1.1.3 evaluate_1.0.3 mgcv_1.9-1
## [49] rlang_1.1.6 Rcpp_1.0.14 glue_1.8.0 rstudioapi_0.17.1
## [53] jsonlite_2.0.0 R6_2.6.1 plyr_1.8.9