rm(list = ls()); ls()
## character(0)
###############################input data_1
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\2023\\2023\\2023_DILI_DICT\\20230606_DILI\\"
dir_path_name <- dir(dir_path,pattern = ".*.csv",full.names = T, recursive = F)
#dir_path_name
########
data_1 <- read.csv(grep("2023-06-05-dili_cadidate-step_1_1085_compounds.csv",dir_path_name,value = T),header = T,stringsAsFactors = F)
dim(data_1) #[1] 1085 11
## [1] 1085 11
colnames(data_1)
## [1] "Mapping.ID" "Sample.ID" "TOX21_ID"
## [4] "SMILES" "probability_0_NB" "probability_1_NB"
## [7] "class_NB" "probability_1_XGBOOST" "class_XGBOOST"
## [10] "sum_class" "score"
data_ecfp4 <- read.csv(grep("tox21_ecfp4.csv",dir_path_name,value = T),header = T,stringsAsFactors = F)
dim(data_ecfp4) #[1] 8016 1025
## [1] 8016 1025
colnames(data_ecfp4)[1]
## [1] "Mapping.ID"
data_k <- data_ecfp4[data_ecfp4$Mapping.ID %in% data_1$Mapping.ID, ]
dim(data_k) #[1] 1085 1025
## [1] 1085 1025
length(unique(data_k$Mapping.ID)) #1085
## [1] 1085
#str(data_k)
#library(factoextra)
#install.packages("factoextra")
#fviz_nbclust(df[1:100,], FUN = kmeans, method = "wss")
#ggsave(filename = paste0(Sys.Date(),"-sars_optimal_number.tif"),
## plot = last_plot(),
# device = "tiff", path = dir_path,
# scale = 1, width = 30, height = 15,
# units = "cm",
# dpi = 300, limitsize = TRUE,
# compression = "lzw")
############################
#wss <- (nrow(df)-1)*sum(apply(df,2,var))
#for (i in 2:1000) wss[i] <- sum(kmeans(df,centers=i)$withinss)
#plot(1:622, wss, type="b", xlab="Number of Clusters",
# ylab="Within groups sum of squares")
#View(wss)
#write.csv(wss,paste0(dir_path,Sys.Date(),"-sars_ECFP4_11314_wss.csv"),row.names = F)
set.seed(123)
#?kmeans
rownames(data_k) <- data_k$Mapping.ID
data_k$Mapping.ID <- NULL
####################################################2____kmeans
km.res <- kmeans(data_k, centers = 50, algorithm = c("Hartigan-Wong"))
# Print the results
#print(km.res)
class(km.res$cluster)
## [1] "integer"
dd <- data.frame(km.res$cluster)
head(dd)
## km.res.cluster
## AADCDMQTJNYOSS 15
## AAEVYOVXGOFMJO 21
## AAIBYZBZXNWTPP 30
## AAXVEMMRQDVLJB 42
## ABLACSIRCKEUOB 5
## ABRIMXGLNHCLIP 2
#plot(data_k, col = km.res$cluster)
table(dd$cluster)
## < table of extent 0 >
table(dd$km.res.cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 20 39 7 9 17 5 18 47 9 12 25 15 26 45 6 10 3 2 35 8 9 47 8 34 5 52
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 87 28 8 59 39 10 20 3 27 9 42 2 24 38 25 11 10 6 9 13 16 15 39 32
dim(dd) #[1] 1085 1
## [1] 1085 1
dd$Mapping.ID <- row.names(dd)
head(dd)
## km.res.cluster Mapping.ID
## AADCDMQTJNYOSS 15 AADCDMQTJNYOSS
## AAEVYOVXGOFMJO 21 AAEVYOVXGOFMJO
## AAIBYZBZXNWTPP 30 AAIBYZBZXNWTPP
## AAXVEMMRQDVLJB 42 AAXVEMMRQDVLJB
## ABLACSIRCKEUOB 5 ABLACSIRCKEUOB
## ABRIMXGLNHCLIP 2 ABRIMXGLNHCLIP
data_k_result <- merge(data_1, dd, by = "Mapping.ID")
dim(data_k_result) #[1] 1085 12
## [1] 1085 12
head(data_k_result)
## Mapping.ID Sample.ID TOX21_ID
## 1 AADCDMQTJNYOSS NCGC00016963-01 Tox21_113354
## 2 AAEVYOVXGOFMJO NCGC00254711-01 Tox21_300807
## 3 AAIBYZBZXNWTPP NCGC00255584-01 Tox21_301942
## 4 AAXVEMMRQDVLJB NCGC00181081-01 Tox21_112700
## 5 ABLACSIRCKEUOB NCGC00160275-02 Tox21_111753
## 6 ABRIMXGLNHCLIP NCGC00256894-01 Tox21_302532
## SMILES
## 1 Cl.CCN2CCC[C@H]2CNC(=O)c1c(O)c(CC)cc(Cl)c1OC
## 2 CSc1nc(NC(C)C)nc(NC(C)C)n1
## 3 OC2CCCCC2c1ccccc1
## 4 OCC(=O)[C@@]4(O)CC[C@@H]2[C@]4(C)C[C@H](O)[C@]1(F)[C@@]3(C)CCC(=O)C=C3CC[C@H]12
## 5 Oc5cc1c3c4c(C(=O)c2c(O)cc(O)c(C(=O)C1(C)C)c23)c(O)cc(C)c45
## 6 O=C1CCCCCCCCCCC=CCCC1
## probability_0_NB probability_1_NB class_NB probability_1_XGBOOST
## 1 2.241933e-04 0.9997758 1 0.9817706
## 2 1.204430e-07 0.9999999 1 0.9974270
## 3 1.180512e-02 0.9881949 1 0.7877684
## 4 9.800258e-03 0.9901997 1 0.7297937
## 5 9.354090e-06 0.9999906 1 0.9736186
## 6 6.599148e-01 0.3400852 1 0.9333614
## class_XGBOOST sum_class score km.res.cluster
## 1 1 2 1.4463488 15
## 2 1 2 1.4580959 21
## 3 1 2 1.2944489 30
## 4 1 2 1.2529912 42
## 5 1 2 1.4404710 5
## 6 1 2 0.9355488 2
############################
dd1 <- data_k_result
str(dd1)
## 'data.frame': 1085 obs. of 12 variables:
## $ Mapping.ID : chr "AADCDMQTJNYOSS" "AAEVYOVXGOFMJO" "AAIBYZBZXNWTPP" "AAXVEMMRQDVLJB" ...
## $ Sample.ID : chr "NCGC00016963-01" "NCGC00254711-01" "NCGC00255584-01" "NCGC00181081-01" ...
## $ TOX21_ID : chr "Tox21_113354" "Tox21_300807" "Tox21_301942" "Tox21_112700" ...
## $ SMILES : chr "Cl.CCN2CCC[C@H]2CNC(=O)c1c(O)c(CC)cc(Cl)c1OC" "CSc1nc(NC(C)C)nc(NC(C)C)n1" "OC2CCCCC2c1ccccc1" "OCC(=O)[C@@]4(O)CC[C@@H]2[C@]4(C)C[C@H](O)[C@]1(F)[C@@]3(C)CCC(=O)C=C3CC[C@H]12" ...
## $ probability_0_NB : num 2.24e-04 1.20e-07 1.18e-02 9.80e-03 9.35e-06 ...
## $ probability_1_NB : num 1 1 0.988 0.99 1 ...
## $ class_NB : int 1 1 1 1 1 1 1 1 1 1 ...
## $ probability_1_XGBOOST: num 0.982 0.997 0.788 0.73 0.974 ...
## $ class_XGBOOST : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sum_class : int 2 2 2 2 2 2 2 2 2 2 ...
## $ score : num 1.45 1.46 1.29 1.25 1.44 ...
## $ km.res.cluster : int 15 21 30 42 5 2 30 7 14 21 ...
df2 <- lapply(split(dd1, dd1$km.res.cluster),
function(x) x[order(x$score,decreasing = T),][1:round(nrow(x)*0.23),])
df3 <- do.call('rbind', df2)
dim(df3) #[1] 251 12
## [1] 252 12
#View(df3)
df3 <- na.omit(df3)
dim(df3) #[1] 251 12
## [1] 252 12
write.csv(df3,paste0(dir_path,Sys.Date(),"-dili_ECFP4_251_50kmeans_step_2.csv"),row.names = F)