library(stringr)
# Location of the exported feature list (one column of feature names).
input_file <- "C:\\Users\\liyix\\OneDrive\\Desktop\\729 features.csv"

# The export starts with a UTF-8 byte-order mark. Read without BOM handling,
# the mark leaks into the first column header and shows up as garbage in front
# of "M_NAME". The "UTF-8-BOM" encoding strips it while reading.
df <- read.csv(
  input_file,
  fileEncoding = "UTF-8-BOM",
  stringsAsFactors = FALSE
)
dim(df)
## [1] 729 1
head(df)
## M_NAME
## 1 atom:element_main_group
## 2 atom:element_metal_group_I_II
## 3 atom:element_metal_group_III
## 4 atom:element_metal_metalloid
## 5 atom:element_metal_poor_metal
## 6 atom:element_metal_transistion_metal

# Defensive: the rest of the script addresses the first column as `M_NAME`,
# so pin the name regardless of how the header was written.
colnames(df)[1] <- "M_NAME"
# Keyword table for the top-level category of a feature name.
#
# Each element maps a category label to the keywords that select it.
# categorize_entry() walks the elements in the order written here and returns
# the first label with a keyword contained in the lower-cased feature name,
# so the order of the elements is significant.
categories <- list(
  "Functional Groups" = c(
    "hydroxyl", "carbonyl", "carboxyl", "amine", "amide", "ester", "ether",
    "sulfide", "sulfoxide", "sulfone", "phosphate", "thiol", "halide"
  ),
  "Metal Chains" = c(
    "alkali", "alkaline", "transition metal", "metalloid", "lanthanide",
    "actinide"
  ),
  "Nucleobase/Nucleotide Analogs" = c(
    "purine", "pyrimidine", "nucleobase", "nucleotide", "nucleoside"
  ),
  "Heterocyclic Systems" = c(
    "heterocycle", "heteroaromatic", "imidazole", "pyrazole", "pyridine",
    "pyrimidine", "thiazole", "oxazole", "indole", "quinoline"
  ),
  "Aromatic Systems" = c(
    "benzene", "phenyl", "naphthalene", "polycyclic", "aromatic ring"
  ),
  # No keywords: this label is only ever used as the fallback result.
  "Uncategorized" = NULL
)
#' Assign a feature name to its top-level category.
#'
#' Categories are tried in the order they appear in `category_keywords`; the
#' first one with a keyword contained in the feature name wins. Matching is a
#' literal substring test that ignores case and treats `_` and a space as the
#' same separator, so a keyword written as "transition metal" also matches a
#' feature name written as "transition_metal".
#'
#' @param entry A single feature name. Anything that is not exactly one
#'   non-missing, non-empty string yields "Uncategorized".
#' @param category_keywords Named list mapping a category label to a character
#'   vector of keywords (may be empty or NULL). Defaults to the script-level
#'   `categories` table.
#' @return A single string: the matching category label, or "Uncategorized".
categorize_entry <- function(entry, category_keywords = categories) {
  # Check type and length first so the scalar `||` chain never sees a
  # zero-length or multi-element condition.
  if (!is.character(entry) || length(entry) != 1L ||
        is.na(entry) || !nzchar(entry)) {
    return("Uncategorized")
  }

  # Same normalisation for the entry and the keywords: lower-case, and
  # underscores turned into spaces.
  normalize <- function(x) gsub("_", " ", tolower(x), fixed = TRUE)
  entry_norm <- normalize(entry)

  for (category in names(category_keywords)) {
    keywords <- normalize(as.character(category_keywords[[category]]))
    # vapply() returns logical(0) for an empty keyword set, unlike sapply(),
    # which would return an empty list.
    hits <- vapply(
      keywords,
      function(keyword) grepl(keyword, entry_norm, fixed = TRUE),
      logical(1),
      USE.NAMES = FALSE
    )
    if (any(hits, na.rm = TRUE)) {
      return(category)
    }
  }

  "Uncategorized"
}
str(df)
## 'data.frame': 729 obs. of 1 variable:
## $ M_NAME: chr "atom:element_main_group" "atom:element_metal_group_I_II" "atom:element_metal_group_III" "atom:element_metal_metalloid" ...

# Spot-check the classifier on the first feature name.
categorize_entry(df$M_NAME[1])
## [1] "Uncategorized"

# One category label per row. vapply() enforces exactly one string per
# feature name (and returns character(0) for an empty table), and
# USE.NAMES = FALSE keeps the feature names from being attached as element
# names of the new column.
df$Category <- vapply(
  df$M_NAME,
  categorize_entry,
  character(1),
  USE.NAMES = FALSE
)
# Keyword table for the subcategory within each top-level category.
#
# Outer names are the top-level category labels; inner names are subcategory
# labels mapped to their keywords. assign_subcategory() lower-cases the
# feature name and returns the FIRST subcategory with a keyword it contains,
# so:
#   * keywords are written in lower case;
#   * when one keyword is a substring of another ("alkali" / "alkaline",
#     "imidazole" / "benzimidazole"), the subcategory holding the longer
#     keyword is listed first, otherwise it could never be selected.
subcategories <- list(
  "Functional Groups" = list(
    "Hydroxyl" = c("hydroxyl"),
    "Carbonyl" = c("carbonyl", "aldehyde", "ketone"),
    "Carboxyl" = c("carboxyl", "acid"),
    "Amine" = c("amine", "amino"),
    "Amide" = c("amide", "carbamoyl", "urea"),
    "Ester" = c("ester", "lactone"),
    "Ether" = c("ether", "epoxide"),
    "Sulfide" = c("sulfide", "thiol"),
    "Sulfoxide/Sulfone" = c("sulfoxide", "sulfone"),
    "Phosphate" = c("phosphate", "phosphonate"),
    # "halide" is the keyword that selects the main category, so it has to
    # be recognised here as well.
    "Halide" = c("halide", "fluoro", "chloro", "bromo", "iodo", "halogen")
  ),
  "Metal Chains" = list(
    # Listed before "Alkali Metals": "alkali" is a substring of "alkaline".
    "Alkaline Earth Metals" = c("alkaline"),
    "Alkali Metals" = c("alkali"),
    "Transition Metals" = c("transition metal"),
    # "metalloid" is the keyword that selects the main category; without it
    # a name such as "atom:element_metal_metalloid" ends up as "Other".
    "Metalloids" = c(
      "metalloid", "boron", "silicon", "germanium", "arsenic", "antimony",
      "tellurium"
    ),
    "Lanthanides" = c("lanthanide"),
    "Actinides" = c("actinide")
  ),
  "Nucleobase/Nucleotide Analogs" = list(
    "Purine" = c("purine", "adenine", "guanine", "hypoxanthine", "xanthine"),
    "Pyrimidine" = c("pyrimidine", "cytosine", "thymine", "uracil"),
    "Nucleosides" = c("nucleoside"),
    "Modified Nucleotides" = c("nucleotide", "ribonucleotide", "deoxynucleotide")
  ),
  "Heterocyclic Systems" = list(
    # Listed first: "benzimidazole", "benzothiazole" and "benzofuran" contain
    # the five-membered-ring keywords "imidazole", "thiazole" and "furan".
    "Fused Systems" = c(
      "benzimidazole", "benzothiazole", "benzofuran", "indole", "quinoxaline"
    ),
    "Five-Membered Rings" = c(
      "imidazole", "pyrazole", "pyrrole", "thiazole", "oxazole", "furan",
      "thiophene"
    ),
    "Six-Membered Rings" = c("pyridine", "pyrimidine", "pyrazine", "quinoline")
  ),
  "Aromatic Systems" = list(
    "Single-Ring Aromatics" = c("benzene", "phenyl"),
    # Lower-case "pah": the feature name is lower-cased before matching.
    "Polycyclic Aromatics" = c("naphthalene", "polycyclic", "pah")
  )
)
#' Assign a feature name to a subcategory of its top-level category.
#'
#' Subcategories of `main_category` are tried in the order they appear in
#' `subcategory_keywords`; the first one with a keyword contained in the
#' feature name wins. Matching is a literal substring test that ignores case
#' and treats `_` and a space as the same separator.
#'
#' @param entry A single feature name. Anything that is not exactly one
#'   non-missing string yields "Other".
#' @param main_category The top-level category label assigned to `entry`.
#'   Labels without an entry in `subcategory_keywords` yield "Other".
#' @param subcategory_keywords Named list of named lists: category label ->
#'   subcategory label -> character vector of keywords. Defaults to the
#'   script-level `subcategories` table.
#' @return A single string: the matching subcategory label, or "Other".
assign_subcategory <- function(entry, main_category,
                               subcategory_keywords = subcategories) {
  if (!is.character(entry) || length(entry) != 1L || is.na(entry)) {
    return("Other")
  }
  if (length(main_category) != 1L || is.na(main_category)) {
    return("Other")
  }
  main_category <- as.character(main_category)
  if (!main_category %in% names(subcategory_keywords)) {
    return("Other")
  }

  # Same normalisation for the entry and the keywords, so upper-case or
  # space-separated keywords can still match.
  normalize <- function(x) gsub("_", " ", tolower(x), fixed = TRUE)
  entry_norm <- normalize(entry)

  groups <- subcategory_keywords[[main_category]]
  for (subcat in names(groups)) {
    keywords <- normalize(as.character(groups[[subcat]]))
    # Literal (fixed) matching: keywords are plain words, not patterns.
    hits <- vapply(
      keywords,
      function(keyword) grepl(keyword, entry_norm, fixed = TRUE),
      logical(1),
      USE.NAMES = FALSE
    )
    if (any(hits, na.rm = TRUE)) {
      return(subcat)
    }
  }

  "Other"
}
# One subcategory label per row, derived from the feature name and the
# category already assigned to it. vapply() over the row indices enforces
# exactly one string per row and does not attach the feature names as
# element names of the new column.
df$Subcategory <- vapply(
  seq_len(nrow(df)),
  function(i) assign_subcategory(df$M_NAME[[i]], df$Category[[i]]),
  character(1)
)

# NOTE: the "##" lines below are output captured from an earlier run of this
# script; re-run to refresh them after changing the keyword tables.
head(df)
## M_NAME Category Subcategory
## 1 atom:element_main_group Uncategorized Other
## 2 atom:element_metal_group_I_II Uncategorized Other
## 3 atom:element_metal_group_III Uncategorized Other
## 4 atom:element_metal_metalloid Metal Chains Other
## 5 atom:element_metal_poor_metal Uncategorized Other
## 6 atom:element_metal_transistion_metal Uncategorized Other
unique(df$Category)
## [1] "Uncategorized" "Metal Chains"
## [3] "Functional Groups" "Aromatic Systems"
## [5] "Nucleobase/Nucleotide Analogs" "Heterocyclic Systems"
dim(df)
## [1] 729 3

# Write the annotated table to the working directory, prefixed with today's
# date (e.g. "<YYYY-MM-DD>-729_features.csv").
write.csv(df, paste0(Sys.Date(), "-", "729_features.csv"), row.names = FALSE)