# Load necessary libraries
#install.packages("stringr")    # Install if not already installed (the only package loaded below)
library(stringr)
# Read the original CSV file (modify the file path accordingly).
input_file <- "C:\\Users\\liyix\\OneDrive\\Desktop\\729 features.csv"  # Replace with the actual file path
# The file begins with a UTF-8 byte-order mark. Read with the default
# encoding, the BOM is glued onto the first header and shows up as mojibake
# (e.g. "锘縈_NAME"). "UTF-8-BOM" tells R to strip it while reading.
# stringsAsFactors = FALSE keeps M_NAME a character column on R < 4.0 too;
# the categorization function only matches character input.
df <- read.csv(input_file, fileEncoding = "UTF-8-BOM", stringsAsFactors = FALSE)
dim(df)
## [1] 729   1
head(df)
##                                 M_NAME
## 1              atom:element_main_group
## 2        atom:element_metal_group_I_II
## 3         atom:element_metal_group_III
## 4         atom:element_metal_metalloid
## 5        atom:element_metal_poor_metal
## 6 atom:element_metal_transistion_metal
# Normalize the first header explicitly so the rest of the script can rely on
# the column being called M_NAME regardless of how the header was spelled.
colnames(df)[1] <- "M_NAME"
# Main-category classification rules.
#
# Each element maps a category name to lower-case keywords. A feature name is
# assigned to the FIRST category (in the order listed here) that has a keyword
# occurring as a literal substring of the lower-cased name, so the order of the
# elements matters (e.g. "pyrimidine" is claimed by the nucleobase category
# before the heterocyclic one).
categories <- list(
  "Functional Groups" = c("hydroxyl", "carbonyl", "carboxyl", "amine", "amide", "ester", "ether",
                          "sulfide", "sulfoxide", "sulfone", "phosphate", "thiol", "halide"),
  # Feature names in the input are underscore-delimited, so the space form
  # "transition metal" alone never matches; the underscore form and the
  # dataset's own misspelling ("transistion") are listed as well.
  "Metal Chains" = c("alkali", "alkaline", "transition metal", "transition_metal",
                     "transistion_metal", "metalloid", "lanthanide", "actinide"),
  "Nucleobase/Nucleotide Analogs" = c("purine", "pyrimidine", "nucleobase", "nucleotide", "nucleoside"),
  "Heterocyclic Systems" = c("heterocycle", "heteroaromatic", "imidazole", "pyrazole", "pyridine",
                             "pyrimidine", "thiazole", "oxazole", "indole", "quinoline"),
  "Aromatic Systems" = c("benzene", "phenyl", "naphthalene", "polycyclic", "aromatic ring"),
  # Fallback bucket: no keywords, so it is never matched directly.
  # character(0) (rather than c(), which is NULL) keeps every element a
  # character vector.
  "Uncategorized" = character(0)
)

# Assign the main category for one feature name.
#
# Args:
#   entry: a single feature name (character scalar).
#
# Returns:
#   The name of the first element of the global `categories` list that has a
#   keyword occurring as a literal substring of the lower-cased `entry`.
#   Returns "Uncategorized" when nothing matches or when `entry` is not a
#   single, non-missing, non-empty string (NULL, NA, "", zero-length,
#   multi-element or non-character input).
categorize_entry <- function(entry) {
  # Type and length are checked first: is.na() / == on zero-length or
  # non-scalar input would otherwise feed a non-TRUE/FALSE value into `if`.
  if (!is.character(entry) || length(entry) != 1L || is.na(entry) || !nzchar(entry)) {
    return("Uncategorized")
  }

  entry_lower <- tolower(entry)  # Keywords are lower-case; match case-insensitively

  for (category in names(categories)) {
    # vapply always yields a logical vector (logical(0) for an empty keyword
    # set), so any() is well defined for every category.
    hits <- vapply(
      categories[[category]], grepl, logical(1),
      x = entry_lower, fixed = TRUE, USE.NAMES = FALSE
    )
    if (any(hits)) {
      return(category)
    }
  }

  "Uncategorized"  # Default category if no keyword matched
}



str(df)
## 'data.frame':    729 obs. of  1 variable:
##  $ M_NAME: chr  "atom:element_main_group" "atom:element_metal_group_I_II" "atom:element_metal_group_III" "atom:element_metal_metalloid" ...
categorize_entry(df$M_NAME[1])
## [1] "Uncategorized"
# Categorize every feature name. vapply guarantees a plain character vector
# with one value per row (character(0) for an empty data frame, where
# unlist(sapply(...)) would give NULL), and USE.NAMES = FALSE keeps the
# feature names from being attached to the new column as element names.
# as.character() makes the call work even if M_NAME was read as a factor.
df$Category <- vapply(
  as.character(df$M_NAME), categorize_entry, character(1),
  USE.NAMES = FALSE
)

# Subcategory rules within each main category.
#
# For a given main category, a feature name gets the FIRST subcategory (in the
# order listed) that has a keyword occurring in the lower-cased name. Because
# the first match wins, a subcategory whose keyword contains another
# subcategory's keyword must be listed before it. All keywords must be
# lower-case, since they are compared against lower-cased names.
subcategories <- list(
  "Functional Groups" = list(
    "Hydroxyl" = c("hydroxyl"),
    "Carbonyl" = c("carbonyl", "aldehyde", "ketone"),
    "Carboxyl" = c("carboxyl", "acid"),
    "Amine" = c("amine", "amino"),
    "Amide" = c("amide", "carbamoyl", "urea"),
    "Ester" = c("ester", "lactone"),
    "Ether" = c("ether", "epoxide"),
    "Sulfide" = c("sulfide", "thiol"),
    "Sulfoxide/Sulfone" = c("sulfoxide", "sulfone"),
    "Phosphate" = c("phosphate", "phosphonate"),
    # "halide" is the keyword that puts a name into this main category, so it
    # must also resolve to a subcategory here.
    "Halide" = c("halide", "fluoro", "chloro", "bromo", "iodo", "halogen")
  ),
  "Metal Chains" = list(
    # "alkali" is a substring of "alkaline": the more specific keyword has to
    # be tried first or this subcategory can never be reached.
    "Alkaline Earth Metals" = c("alkaline"),
    "Alkali Metals" = c("alkali"),
    # Underscore-delimited names (and the dataset's "transistion" misspelling)
    # never contain the space form.
    "Transition Metals" = c("transition metal", "transition_metal", "transistion_metal"),
    "Metalloids" = c("metalloid", "boron", "silicon", "germanium", "arsenic", "antimony", "tellurium"),
    "Lanthanides" = c("lanthanide"),
    "Actinides" = c("actinide")
  ),
  "Nucleobase/Nucleotide Analogs" = list(
    "Purine" = c("purine", "adenine", "guanine", "hypoxanthine", "xanthine"),
    "Pyrimidine" = c("pyrimidine", "cytosine", "thymine", "uracil"),
    "Nucleosides" = c("nucleoside"),
    "Modified Nucleotides" = c("nucleotide", "ribonucleotide", "deoxynucleotide")
  ),
  "Heterocyclic Systems" = list(
    # Fused names embed the single-ring keywords ("benzimidazole" contains
    # "imidazole", "benzofuran" contains "furan"), so they are tried first.
    "Fused Systems" = c("benzimidazole", "benzothiazole", "benzofuran", "indole", "quinoxaline"),
    "Five-Membered Rings" = c("imidazole", "pyrazole", "pyrrole", "thiazole", "oxazole", "furan", "thiophene"),
    "Six-Membered Rings" = c("pyridine", "pyrimidine", "pyrazine", "quinoline")
  ),
  "Aromatic Systems" = list(
    "Single-Ring Aromatics" = c("benzene", "phenyl"),
    # Lower-case "pah": an upper-case keyword can never match a lower-cased name.
    "Polycyclic Aromatics" = c("naphthalene", "polycyclic", "pah")
  )
)

# Assign a subcategory to one feature name within its main category.
#
# Args:
#   entry: the feature name (normally a single string).
#   main_category: the main category already assigned to `entry`.
#
# Returns:
#   The name of the first subcategory of `main_category` (in the order listed
#   in the global `subcategories` list) that has a keyword occurring as a
#   literal substring of the lower-cased `entry`. Returns "Other" when
#   `main_category` has no subcategory rules, when no keyword matches, or when
#   `entry` is missing.
assign_subcategory <- function(entry, main_category) {
  # isTRUE() turns an NA / zero-length / multi-element category into a clean
  # "no rules" result instead of an error inside `if`.
  if (!isTRUE(as.character(main_category) %in% names(subcategories))) {
    return("Other")
  }

  # as.character() keeps factor input working; missing names cannot match.
  entry_lower <- tolower(as.character(entry))
  entry_lower <- entry_lower[!is.na(entry_lower)]
  if (length(entry_lower) == 0L) {
    return("Other")
  }

  rules <- subcategories[[as.character(main_category)]]
  for (subcat in names(rules)) {
    # Keywords are matched literally (fixed = TRUE), the same way the main
    # category is assigned, so a keyword is never interpreted as a regex.
    hits <- vapply(
      rules[[subcat]],
      function(keyword) any(grepl(keyword, entry_lower, fixed = TRUE)),
      logical(1),
      USE.NAMES = FALSE
    )
    if (any(hits)) {
      return(subcat)
    }
  }

  "Other"  # Default when no specific subcategory keyword matched
}

# Assign a subcategory to every row from its feature name and main category.
# vapply guarantees exactly one unnamed string per row (and character(0) for
# an empty data frame, where unlist(mapply(...)) would give NULL).
df$Subcategory <- vapply(
  seq_len(nrow(df)),
  function(i) assign_subcategory(df$M_NAME[i], df$Category[i]),
  character(1)
)

# Inspect the categorized data.
# NOTE: the "##" lines below are console output recorded from an earlier run;
# they will differ if the keyword lists have been changed since.
head(df)
##                                 M_NAME      Category Subcategory
## 1              atom:element_main_group Uncategorized       Other
## 2        atom:element_metal_group_I_II Uncategorized       Other
## 3         atom:element_metal_group_III Uncategorized       Other
## 4         atom:element_metal_metalloid  Metal Chains       Other
## 5        atom:element_metal_poor_metal Uncategorized       Other
## 6 atom:element_metal_transistion_metal Uncategorized       Other
unique(df$Category)
## [1] "Uncategorized"                 "Metal Chains"                 
## [3] "Functional Groups"             "Aromatic Systems"             
## [5] "Nucleobase/Nucleotide Analogs" "Heterocyclic Systems"
dim(df)
## [1] 729   3
#View(df)
# Save the categorized data as a CSV file in the working directory; the file
# name is prefixed with today's date (e.g. "2024-01-31-729_features.csv").
write.csv(df, paste0(Sys.Date(), "-", "729_features.csv"), row.names = FALSE)