rm(list = ls())
################# ##############input data 
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
dir_path_name <- dir(dir_path,pattern = "*.xlsx",full.names = T)
dir_path_name
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\data.xlsx"
library(openxlsx)
getSheetNames(grep("data.xlsx",dir_path_name,value = T))
## [1] "data"
data_1 <- read.xlsx(grep("data.xlsx",dir_path_name,value = T), sheet = 1)
dim(data_1) #[1] 8781    9
## [1] 8781    9
head(data_1)
##       Mapping.ID a b c d cpe  e  f     g
## 1 AAALVYBICLMAMA 0 x x x   2  x  x    -2
## 2 AAAQFGUYHFJNHI 0 0 0 0   0 -2 -1 -5.75
## 3 AABFWJDLCCDJJN 0 x x x   0  x  x    -3
## 4 AACFPJSJOWQNBN x x x x   0  x  x     0
## 5 AACVPYUISGWNOU 0 x x x   0  x  x     0
## 6 AAEQQXGDRKJWJW x x x x   0  x  x     0
#View(head(data_1))
length(unique(data_1$Mapping.ID)) #[1] 8781
## [1] 8781
#################################
row.names(data_1) <- data_1$Mapping.ID
data_1$Mapping.ID <- NULL
colnames(data_1)
## [1] "a"   "b"   "c"   "d"   "cpe" "e"   "f"   "g"
dim(data_1) #[1] 8781    8
## [1] 8781    8
##############################predict cpe
non_cpe <- c(1:ncol(data_1))[-grep("cpe",colnames(data_1))]
non_cpe
## [1] 1 2 3 4 6 7 8
####################################corr analysis
data_list <- list()
for (i in 1:length(non_cpe)) {
  data_2 <- data_1[,c(non_cpe[i],grep("cpe",colnames(data_1)))]
  print(colnames(data_2)) #
  #str(data_2)
  data_2[data_2 == "x"] <- NA
  data_2[] <- sapply(data_2[], as.numeric)
  #head(data_2)
  data_2[data_2 == 0] <- NA
  #str(data_2)
  data_3 <- na.omit(data_2)
  #dim(data_3) #[1] 53  2
  #head(data_3)
  #View(data_3)
  #str(data_3)
  #predict  colnames(data_3)[2]
  data_3$label <- NA
  for (j in 1:nrow(data_3)) {
    #j= 1
    if(data_3[j,1] > 0 & data_3[j,2] > 0) {
      data_3$label[j] <- "TP"
    } else if (data_3[j,1] < 0 & data_3[j,2] < 0){
      data_3$label[j] <- "TN" 
    } else if (data_3[j,1] > 0 & data_3[j,2] < 0){
      data_3$label[j] <- "FN" 
    } else {data_3$label[j] <- "FP"}
  }
  ####################
  result_1 <-  data.frame(actual = colnames(data_3)[1], 
                          predicted = colnames(data_3)[2], 
                          TP = length(grep("TP",data_3$label)),
                          TN = length(grep("TN",data_3$label)),
                          FP = length(grep("FP",data_3$label)),
                          FN = length(grep("FN",data_3$label)))
  ################calculate  
  result_1$sensitivity <- result_1$TP/c(result_1$TP + result_1$FN)
  result_1$specificity <- result_1$TN/c(result_1$TN + result_1$FP)
  result_1$accuracy <- c(result_1$TP + result_1$TN)/
    c(result_1$TP + result_1$FN + result_1$TN + result_1$FP)
  result_1$PPV <- result_1$TP/c(result_1$TP + result_1$FP)
  result_1$NPV <- result_1$TN/c(result_1$TN + result_1$FN)
  result_1$BA <- c(result_1$sensitivity + result_1$specificity)/2
  result_1$F1_score <- 2*result_1$TP/c(2*result_1$TP + result_1$FP + result_1$FN)
  #ref https://en.wikipedia.org/wiki/Sensitivity_and_specificity
  #sensitivity, recall, hit rate, or true positive rate (TPR)
  #specificity, selectivity or true negative rate (TNR)
  #precision or positive predictive value (PPV)
  #negative predictive value (NPV)
  #false discovery rate (FDR)
  #F1 score
  #Matthews correlation coefficient (MCC)
  #
  data_list[[i]] <- result_1
}
## [1] "a"   "cpe"
## [1] "b"   "cpe"
## [1] "c"   "cpe"
## [1] "d"   "cpe"
## [1] "e"   "cpe"
## [1] "f"   "cpe"
## [1] "g"   "cpe"
data_5 <- do.call("rbind",data_list)
data_5
##   actual predicted TP TN  FP FN sensitivity specificity    accuracy         PPV
## 1      a       cpe  0  0  53  0         NaN           0 0.000000000 0.000000000
## 2      b       cpe  0  0 158  0         NaN           0 0.000000000 0.000000000
## 3      c       cpe  3  0  19  0           1           0 0.136363636 0.136363636
## 4      d       cpe  0  0 185  0         NaN           0 0.000000000 0.000000000
## 5      e       cpe  5  0 365  0           1           0 0.013513514 0.013513514
## 6      f       cpe  1  0 208  0           1           0 0.004784689 0.004784689
## 7      g       cpe  0  0 347  0         NaN           0 0.000000000 0.000000000
##   NPV  BA   F1_score
## 1 NaN NaN 0.00000000
## 2 NaN NaN 0.00000000
## 3 NaN 0.5 0.24000000
## 4 NaN NaN 0.00000000
## 5 NaN 0.5 0.02666667
## 6 NaN 0.5 0.00952381
## 7 NaN NaN 0.00000000
write.csv(data_5, paste0(dir_path,Sys.Date(),"-","sensitivity_specificity_calculate.csv"),row.names = T)