rm(list = ls())
###############################input data_1 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
dir_path <- "C:\\Users\\xut2\\Desktop\\"
dir_path_name <- dir(dir_path,pattern = ".*.csv",full.names = T)
#dir_path_name
###############################1.1 bath input data 
data_1 <- read.csv(grep("2025-09-30-582_predict_merge_all.csv",dir_path_name,value = T),header = T,stringsAsFactors = F)
# dim(data_1) #[1] 582  16
# table(data_1$antagonist)
# head(data_1)
#################################################
data_list <- list()
# head(data_1)
# colnames(data_1)

df1 <- data_1[, c( "Molecules", grep("class_1", colnames(data_1), value = T), "antagonist" )]
head(df1)
##        Molecules class_1_NB class_1_NNET class_1_RF class_1_SVM class_1_XGB
## 1 FZYUKBLYECHAGN          1            0          0           0           0
## 2 FNKBVTBXFLSTPB          1            0          1           1           1
## 3 HTSNFXAICLXZMA          1            1          0           1           0
## 4 KKZGFVAZUKHFAC          1            1          1           1           1
## 5 RKXSQHTWNMDLJM          0            1          0           1           1
## 6 GHWJEDJMOVUXEC          1            1          1           1           1
##   class_1_GCN antagonist
## 1           0          0
## 2           0          0
## 3           0          0
## 4           1          1
## 5           0          0
## 6           1          1
df2 <- gather(df1, key = key, value = value, -c(1, 8))
# head(df2)
# unique(df2$key)

for (i in 1:6) {
  #i = 1
  #i = 1
  print(i)
  tryCatch({
    df_one <- df2[df2$key == unique(df2$key)[i], ]
    
    dim(df_one[df_one$antagonist == 1 & df_one$value == 0, ])
    #View(data_2)
    data_3 <- data.frame(table(df_one$antagonist, df_one$value))
    colnames(data_3) <- c("reality", "predict", "Freq")
    #head(data_3)
    
    data_3$geneid <- unique(df_one$key)
    data_3$label[data_3$reality == 1 & data_3$predict == 1] <- "TP"
    data_3$label[data_3$reality == 1 & data_3$predict == 0] <- "FN"
    data_3$label[data_3$reality == 0 & data_3$predict == 1] <- "FP"
    data_3$label[data_3$reality == 0 & data_3$predict == 0] <- "TN"
    #data_list[[i]] <- data_3
    result_1 <-  data.frame(TP = data_3$Freq[data_3$label == "TP"], 
                            TN = data_3$Freq[data_3$label == "TN"], 
                            FP = data_3$Freq[data_3$label == "FP"], 
                            FN = data_3$Freq[data_3$label == "FN"])
    ################calculate  
    #result_1$MCC <- calc_mcc(tp = result_1$TP, tn = result_1$TN, fp = result_1$FP, fn = result_1$FN)
    result_1$sensitivity <- result_1$TP/c(result_1$TP + result_1$FN)
    result_1$specificity <- result_1$TN/c(result_1$TN + result_1$FP)
    result_1$accuracy <- c(result_1$TP + result_1$TN)/
      c(result_1$TP + result_1$FN + result_1$TN + result_1$FP)
    result_1$PPV <- result_1$TP/c(result_1$TP + result_1$FP)
    result_1$NPV <- result_1$TN/c(result_1$TN + result_1$FN)
    result_1$BA <- c(result_1$sensitivity + result_1$specificity)/2
    result_1$F1_score <- 2*result_1$TP/c(2*result_1$TP + result_1$FP + result_1$FN)
    result_1$model <- unique(df_one$key)
    data_list[[i]] <- result_1
    
  }, error = function(e) {
    cat("ERROR :",conditionMessage(e), "\n")
    cat("ERROR :", conditionMessage(e),"---",i,"---",gsub("\\:","-",Sys.time()),file = "error.txt", append = TRUE, "\n")
    #cat(message('** ERR at ', Sys.time(), " **"),file = "test_1.txt", append = TRUE)
    #print(e)
  })
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
df_pre <- do.call("rbind", data_list)
df_pre
##    TP  TN  FP FN sensitivity specificity  accuracy       PPV       NPV
## 1  76 301 114 91   0.4550898   0.7253012 0.6477663 0.4000000 0.7678571
## 2  74 316  99 93   0.4431138   0.7614458 0.6701031 0.4277457 0.7726161
## 3 113 309 106 54   0.6766467   0.7445783 0.7250859 0.5159817 0.8512397
## 4 124 276 139 43   0.7425150   0.6650602 0.6872852 0.4714829 0.8652038
## 5 104 301 114 63   0.6227545   0.7253012 0.6958763 0.4770642 0.8269231
## 6 138 238 177 29   0.8263473   0.5734940 0.6460481 0.4380952 0.8913858
##          BA  F1_score        model
## 1 0.5901955 0.4257703   class_1_NB
## 2 0.6022798 0.4352941 class_1_NNET
## 3 0.7106125 0.5854922   class_1_RF
## 4 0.7037876 0.5767442  class_1_SVM
## 5 0.6740278 0.5402597  class_1_XGB
## 6 0.6999206 0.5726141  class_1_GCN
write.csv(df_pre, paste0(dir_path, Sys.Date(),"-ppv-", "step_1.csv"),row.names = T)