library(rcdk)
## Loading required package: rcdklibs
## Loading required package: rJava
# 目的:计算“芳香环数”
# 思路:
# 1) 用 CDK Cycles API 找到一组环(默认 SSSR,也可选 relevant)
# 2) 对每个环检查:环内所有原子是否都已被标记为 aromatic
# 3) 满足者计为 1 个芳香环
#
# 注意:
# - 该函数假定 mol 已经完成:
# set.atom.types(mol)
# do.isotopes(mol)
# do.aromaticity(mol)
# - 如果你更在意 fused/bridged 系统中的“唯一性”,可把 method 改成 "relevant"
aromatic_ring_count <- function(mol, method = c("sssr", "relevant")) {
method <- match.arg(method)
# 目的:通过 CDK Cycles API 获取环集合
cycles <- switch(
method,
sssr = rJava::.jcall(
"org/openscience/cdk/graph/Cycles",
"Lorg/openscience/cdk/graph/Cycles;",
"sssr",
mol
),
relevant = rJava::.jcall(
"org/openscience/cdk/graph/Cycles",
"Lorg/openscience/cdk/graph/Cycles;",
"relevant",
mol
)
)
# 目的:把 cycles 转成 IRingSet,便于逐个 ring 读取
ring_set <- rJava::.jcall(
cycles,
"Lorg/openscience/cdk/interfaces/IRingSet;",
"toRingSet"
)
# 目的:获取环总数;如果没有环,直接返回 0
n_rings <- rJava::.jcall(ring_set, "I", "getAtomContainerCount")
if (n_rings == 0) return(0L)
# 目的:逐个环检查“该环是否为芳香环”
aromatic_n <- 0L
for (i in seq_len(n_rings)) {
# 目的:取出第 i 个环;Java 下标从 0 开始,所以要减 1
ring <- rJava::.jcall(
ring_set,
"Lorg/openscience/cdk/interfaces/IAtomContainer;",
"getAtomContainer",
as.integer(i - 1)
)
# 目的:获取该环中的全部原子
ring_atoms <- get.atoms(ring)
# 目的:如果环中所有原子都被标记为 aromatic,则记作一个芳香环
# 这里直接依赖 rcdk::is.aromatic(atom)
if (all(vapply(ring_atoms, function(a) isTRUE(is.aromatic(a)), logical(1)))) {
aromatic_n <- aromatic_n + 1L
}
}
aromatic_n
}
check_druglikeness <- function(
smiles,
keep_largest_fragment = TRUE,
allowed_elements = c("C", "H", "O", "N", "S", "F", "Cl", "Br", "I", "P"),
aromatic_ring_method = c("sssr", "relevant")
) {
aromatic_ring_method <- match.arg(aromatic_ring_method)
analyze_one <- function(smi) {
# 目的:构造统一的 NA 输出,保证报错时也能返回相同列结构
na_row <- function(msg = NA_character_) {
data.frame(
SMILES = smi,
Error = msg,
PresentElements = NA_character_,
pass_elements = NA,
ExactMW = NA_real_,
XLogP = NA_real_,
HBD = NA_integer_,
HBA = NA_integer_,
HBD_HBA = NA_integer_,
TPSA = NA_real_,
RotBonds = NA_integer_,
AromaticAtoms = NA_integer_,
AromaticRings = NA_integer_,
Fsp3 = NA_real_,
FormalCharge = NA_integer_,
TotalCharge = NA_real_,
IsNeutral = NA,
Ro5_violations = NA_integer_,
pass_Ro5 = NA,
pass_Veber_TPSA = NA,
pass_Veber_HB = NA,
pass_Veber = NA,
Pfizer_3_75_alert = NA,
pass_GSK_4_400 = NA,
stringsAsFactors = FALSE
)
}
# 目的:检查输入是否为空
if (is.na(smi) || !nzchar(trimws(smi))) {
return(na_row("Empty SMILES"))
}
# 目的:把 SMILES 解析成分子对象
mol <- parse.smiles(smi, omit.nulls = FALSE)[[1]]
if (is.null(mol)) {
return(na_row("Invalid SMILES"))
}
out <- tryCatch({
# 目的:如果是盐型/多组分结构,只保留最大连通组分
if (keep_largest_fragment && !is.connected(mol)) {
mol <- get.largest.component(mol)
}
# 目的:分子预处理,为后续 aromaticity / descriptor 计算做准备
set.atom.types(mol)
do.isotopes(mol)
do.aromaticity(mol)
# 目的:将隐式氢转为显式氢,提升部分描述符计算稳定性
convert.implicit.to.explicit(mol)
# 目的:显式加氢后重新配置
set.atom.types(mol)
do.isotopes(mol)
do.aromaticity(mol)
# 目的:提取原子与元素信息,用于元素过滤
atoms <- get.atoms(mol)
symbols <- vapply(atoms, get.symbol, character(1))
present_elements <- paste(sort(unique(symbols)), collapse = ",")
pass_elements <- all(symbols %in% allowed_elements)
# 目的:集中计算常用描述符
desc_names <- c(
XLogP = "org.openscience.cdk.qsar.descriptors.molecular.XLogPDescriptor",
HBD = "org.openscience.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor",
HBA = "org.openscience.cdk.qsar.descriptors.molecular.HBondAcceptorCountDescriptor",
TPSA = "org.openscience.cdk.qsar.descriptors.molecular.TPSADescriptor",
RotBonds = "org.openscience.cdk.qsar.descriptors.molecular.RotatableBondsCountDescriptor",
AromaticAtoms = "org.openscience.cdk.qsar.descriptors.molecular.AromaticAtomsCountDescriptor",
Fsp3 = "org.openscience.cdk.qsar.descriptors.molecular.FractionalCSP3Descriptor"
)
desc <- eval.desc(list(mol), unname(desc_names))
names(desc) <- names(desc_names)
# 目的:转成普通数值,便于规则判断
mw <- as.numeric(get.mol.weight(mol))
xlogp <- as.numeric(desc$XLogP[1])
hbd <- as.integer(desc$HBD[1])
hba <- as.integer(desc$HBA[1])
tpsa <- as.numeric(desc$TPSA[1])
rotb <- as.integer(desc$RotBonds[1])
aromatm <- as.integer(desc$AromaticAtoms[1])
fsp3 <- as.numeric(desc$Fsp3[1])
hbsum <- if (all(!is.na(c(hbd, hba)))) hbd + hba else NA_integer_
# 目的:真正计算“芳香环数”
aromring <- aromatic_ring_count(mol, method = aromatic_ring_method)
# 目的:计算电荷信息
formal_charge <- as.integer(get.total.formal.charge(mol))
total_charge <- as.numeric(get.total.charge(mol))
is_neutral <- isTRUE(is.neutral(mol))
# 目的:统计 Ro5 违反项数量
ro5_violations <- if (all(!is.na(c(mw, xlogp, hbd, hba)))) {
sum(c(mw > 500, xlogp > 5, hbd > 5, hba > 10))
} else {
NA_integer_
}
# 目的:Lipinski Rule of Five
pass_Ro5 <- if (!is.na(ro5_violations)) {
ro5_violations == 0
} else {
NA
}
# 目的:Veber(TPSA 版本)
pass_Veber_TPSA <- if (all(!is.na(c(rotb, tpsa)))) {
rotb <= 10 && tpsa <= 140
} else {
NA
}
# 目的:Veber(HBD+HBA 替代表达)
pass_Veber_HB <- if (all(!is.na(c(rotb, hbsum)))) {
rotb <= 10 && hbsum <= 12
} else {
NA
}
# 目的:给出总的 Veber 判断
pass_Veber <- if (all(!is.na(c(rotb, tpsa, hbsum)))) {
rotb <= 10 && (tpsa <= 140 || hbsum <= 12)
} else {
NA
}
# 目的:Pfizer 3/75 预警
Pfizer_3_75_alert <- if (all(!is.na(c(xlogp, tpsa)))) {
xlogp > 3 && tpsa < 75
} else {
NA
}
# 目的:GSK 4/400
pass_GSK_4_400 <- if (all(!is.na(c(mw, xlogp)))) {
mw <= 400 && xlogp <= 4
} else {
NA
}
# 目的:汇总输出
data.frame(
SMILES = smi,
Error = NA_character_,
PresentElements = present_elements,
pass_elements = pass_elements,
ExactMW = round(mw, 4),
XLogP = round(xlogp, 3),
HBD = hbd,
HBA = hba,
HBD_HBA = hbsum,
TPSA = round(tpsa, 2),
RotBonds = rotb,
AromaticAtoms = aromatm,
AromaticRings = aromring,
Fsp3 = round(fsp3, 3),
FormalCharge = formal_charge,
TotalCharge = round(total_charge, 3),
IsNeutral = is_neutral,
Ro5_violations = ro5_violations,
pass_Ro5 = pass_Ro5,
pass_Veber_TPSA = pass_Veber_TPSA,
pass_Veber_HB = pass_Veber_HB,
pass_Veber = pass_Veber,
Pfizer_3_75_alert = Pfizer_3_75_alert,
pass_GSK_4_400 = pass_GSK_4_400,
stringsAsFactors = FALSE
)
}, error = function(e) {
na_row(conditionMessage(e))
})
out
}
# 目的:支持单个或多个 SMILES 批量输入
if (!is.character(smiles) || length(smiles) < 1) {
stop("Please provide one or more SMILES strings.")
}
res <- lapply(smiles, analyze_one)
do.call(rbind, res)
}
######################################################################
check_druglikeness("c1ccccc1") # benzene
## SMILES Error PresentElements
## 1 c1ccccc1 could not find function "get.mol.weight" <NA>
## pass_elements ExactMW XLogP HBD HBA HBD_HBA TPSA RotBonds AromaticAtoms
## 1 NA NA NA NA NA NA NA NA NA
## AromaticRings Fsp3 FormalCharge TotalCharge IsNeutral Ro5_violations pass_Ro5
## 1 NA NA NA NA NA NA NA
## pass_Veber_TPSA pass_Veber_HB pass_Veber Pfizer_3_75_alert pass_GSK_4_400
## 1 NA NA NA NA NA
check_druglikeness("c1ccc2ccccc2c1") # naphthalene
## SMILES Error PresentElements
## 1 c1ccc2ccccc2c1 could not find function "get.mol.weight" <NA>
## pass_elements ExactMW XLogP HBD HBA HBD_HBA TPSA RotBonds AromaticAtoms
## 1 NA NA NA NA NA NA NA NA NA
## AromaticRings Fsp3 FormalCharge TotalCharge IsNeutral Ro5_violations pass_Ro5
## 1 NA NA NA NA NA NA NA
## pass_Veber_TPSA pass_Veber_HB pass_Veber Pfizer_3_75_alert pass_GSK_4_400
## 1 NA NA NA NA NA
check_druglikeness(
c("c1ccccc1", "c1ccc2ccccc2c1"),
aromatic_ring_method = "sssr"
)
## SMILES Error PresentElements
## 1 c1ccccc1 could not find function "get.mol.weight" <NA>
## 2 c1ccc2ccccc2c1 could not find function "get.mol.weight" <NA>
## pass_elements ExactMW XLogP HBD HBA HBD_HBA TPSA RotBonds AromaticAtoms
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## AromaticRings Fsp3 FormalCharge TotalCharge IsNeutral Ro5_violations pass_Ro5
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## pass_Veber_TPSA pass_Veber_HB pass_Veber Pfizer_3_75_alert pass_GSK_4_400
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
# 如果你更想要“更唯一”的环集合定义:
check_druglikeness(
"c1ccc2ccccc2c1",
aromatic_ring_method = "relevant"
)
## SMILES Error PresentElements
## 1 c1ccc2ccccc2c1 could not find function "get.mol.weight" <NA>
## pass_elements ExactMW XLogP HBD HBA HBD_HBA TPSA RotBonds AromaticAtoms
## 1 NA NA NA NA NA NA NA NA NA
## AromaticRings Fsp3 FormalCharge TotalCharge IsNeutral Ro5_violations pass_Ro5
## 1 NA NA NA NA NA NA NA
## pass_Veber_TPSA pass_Veber_HB pass_Veber Pfizer_3_75_alert pass_GSK_4_400
## 1 NA NA NA NA NA
smiles_list <- c("CC(=O)OC1=CC=CC=C1C(=O)O", "CC(=O)OC1", "CCO")
check_druglikeness(
smiles_list,
aromatic_ring_method = "relevant"
)
## Warning in parse.smiles(smi, omit.nulls = FALSE): 1 out of 1 SMILES were not
## successfully parsed, resulting in NULLs.
## SMILES Error
## 1 CC(=O)OC1=CC=CC=C1C(=O)O could not find function "get.mol.weight"
## 2 CC(=O)OC1 Invalid SMILES
## 3 CCO could not find function "get.mol.weight"
## PresentElements pass_elements ExactMW XLogP HBD HBA HBD_HBA TPSA RotBonds
## 1 <NA> NA NA NA NA NA NA NA NA
## 2 <NA> NA NA NA NA NA NA NA NA
## 3 <NA> NA NA NA NA NA NA NA NA
## AromaticAtoms AromaticRings Fsp3 FormalCharge TotalCharge IsNeutral
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
## Ro5_violations pass_Ro5 pass_Veber_TPSA pass_Veber_HB pass_Veber
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## Pfizer_3_75_alert pass_GSK_4_400
## 1 NA NA
## 2 NA NA
## 3 NA NA
###################################################
#Rule ColumnQuestion Answered
#Element Filter pass_elements Does the molecule contain only conventional medicinal elements? Molecules containing rare or heavy metals are generally unsuitable for drug development.
#Lipinski Rule of Five pass_Ro5 Can the molecule be passively absorbed in the gastrointestinal tract? The classic predictor of oral bioavailability.
#Veber Rules pass_Veber Does the molecule have sufficient membrane permeability and metabolic stability in vivo? Complements Ro5, especially for macrocycles and peptide-like structures.
#Pfizer 3/75 AlertPfizer_3_75_alert Does the molecule fall into the "high lipophilicity + low polarity" danger zone? This region is strongly associated with elevated cardiotoxicity and hepatotoxicity risk.
#GSK 4/400 Rule pass_GSK_4_400 Is the molecule sufficiently small and light? Lighter molecules tend to exhibit better selectivity and lower toxicity risk.
####################################################