rm(list = ls())
################# ##############input data
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
dir_path_name <- dir(dir_path,pattern = "*.xlsx",full.names = T)
dir_path_name
## [1] "C:\\Users\\liyix\\OneDrive\\Desktop\\data.xlsx"
library(openxlsx)
getSheetNames(grep("data.xlsx",dir_path_name,value = T))
## [1] "data"
data_1 <- read.xlsx(grep("data.xlsx",dir_path_name,value = T), sheet = 1)
dim(data_1) #[1] 8781 9
## [1] 8781 9
head(data_1)
## Mapping.ID a b c d cpe e f g
## 1 AAALVYBICLMAMA 0 x x x 2 x x -2
## 2 AAAQFGUYHFJNHI 0 0 0 0 0 -2 -1 -5.75
## 3 AABFWJDLCCDJJN 0 x x x 0 x x -3
## 4 AACFPJSJOWQNBN x x x x 0 x x 0
## 5 AACVPYUISGWNOU 0 x x x 0 x x 0
## 6 AAEQQXGDRKJWJW x x x x 0 x x 0
#View(head(data_1))
length(unique(data_1$Mapping.ID)) #[1] 8781
## [1] 8781
#################################
row.names(data_1) <- data_1$Mapping.ID
data_1$Mapping.ID <- NULL
colnames(data_1)
## [1] "a" "b" "c" "d" "cpe" "e" "f" "g"
dim(data_1) #[1] 8781 8
## [1] 8781 8
##############################predict cpe
non_cpe <- c(1:ncol(data_1))[-grep("cpe",colnames(data_1))]
non_cpe
## [1] 1 2 3 4 6 7 8
####################################corr analysis
data_list <- list()
for (i in 1:length(non_cpe)) {
data_2 <- data_1[,c(non_cpe[i],grep("cpe",colnames(data_1)))]
print(colnames(data_2)) #
#str(data_2)
data_2[data_2 == "x"] <- NA
data_2[] <- sapply(data_2[], as.numeric)
#head(data_2)
data_2[data_2 == 0] <- NA
#str(data_2)
data_3 <- na.omit(data_2)
#dim(data_3) #[1] 53 2
#head(data_3)
#View(data_3)
#str(data_3)
#predict colnames(data_3)[2]
data_3$label <- NA
for (j in 1:nrow(data_3)) {
#j= 1
if(data_3[j,1] > 0 & data_3[j,2] > 0) {
data_3$label[j] <- "TP"
} else if (data_3[j,1] < 0 & data_3[j,2] < 0){
data_3$label[j] <- "TN"
} else if (data_3[j,1] > 0 & data_3[j,2] < 0){
data_3$label[j] <- "FN"
} else {data_3$label[j] <- "FP"}
}
####################
result_1 <- data.frame(actual = colnames(data_3)[1],
predicted = colnames(data_3)[2],
TP = length(grep("TP",data_3$label)),
TN = length(grep("TN",data_3$label)),
FP = length(grep("FP",data_3$label)),
FN = length(grep("FN",data_3$label)))
################calculate
result_1$sensitivity <- result_1$TP/c(result_1$TP + result_1$FN)
result_1$specificity <- result_1$TN/c(result_1$TN + result_1$FP)
result_1$accuracy <- c(result_1$TP + result_1$TN)/
c(result_1$TP + result_1$FN + result_1$TN + result_1$FP)
result_1$PPV <- result_1$TP/c(result_1$TP + result_1$FP)
result_1$NPV <- result_1$TN/c(result_1$TN + result_1$FN)
result_1$BA <- c(result_1$sensitivity + result_1$specificity)/2
result_1$F1_score <- 2*result_1$TP/c(2*result_1$TP + result_1$FP + result_1$FN)
#ref https://en.wikipedia.org/wiki/Sensitivity_and_specificity
#sensitivity, recall, hit rate, or true positive rate (TPR)
#specificity, selectivity or true negative rate (TNR)
#precision or positive predictive value (PPV)
#negative predictive value (NPV)
#false discovery rate (FDR)
#F1 score
#Matthews correlation coefficient (MCC)
#
data_list[[i]] <- result_1
}
## [1] "a" "cpe"
## [1] "b" "cpe"
## [1] "c" "cpe"
## [1] "d" "cpe"
## [1] "e" "cpe"
## [1] "f" "cpe"
## [1] "g" "cpe"
data_5 <- do.call("rbind",data_list)
data_5
## actual predicted TP TN FP FN sensitivity specificity accuracy PPV
## 1 a cpe 0 0 53 0 NaN 0 0.000000000 0.000000000
## 2 b cpe 0 0 158 0 NaN 0 0.000000000 0.000000000
## 3 c cpe 3 0 19 0 1 0 0.136363636 0.136363636
## 4 d cpe 0 0 185 0 NaN 0 0.000000000 0.000000000
## 5 e cpe 5 0 365 0 1 0 0.013513514 0.013513514
## 6 f cpe 1 0 208 0 1 0 0.004784689 0.004784689
## 7 g cpe 0 0 347 0 NaN 0 0.000000000 0.000000000
## NPV BA F1_score
## 1 NaN NaN 0.00000000
## 2 NaN NaN 0.00000000
## 3 NaN 0.5 0.24000000
## 4 NaN NaN 0.00000000
## 5 NaN 0.5 0.02666667
## 6 NaN 0.5 0.00952381
## 7 NaN NaN 0.00000000
write.csv(data_5, paste0(dir_path,Sys.Date(),"-","sensitivity_specificity_calculate.csv"),row.names = T)