# Load the raw curve data, keep the columns needed for curve ranking, and
# tabulate how many rows each protocol has per sample data type.
#
# NOTE(review): clearing the global workspace is kept from the original script;
# it wipes every object of whoever sources this file, so consider removing it.
rm(list = ls())
###############################input data_1
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\curve rank_cal\\"
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end: only paths that really end in ".txt" are listed.
dir_path_name <- dir(dir_path, pattern = "\\.txt$", full.names = TRUE, recursive = TRUE)
# Fail early with a readable message instead of letting read.delim() choke on
# an NA path when the folder contains no input file.
if (length(dir_path_name) == 0) {
  stop("No .txt files found under ", dir_path, call. = FALSE)
}
#dir_path_name
data_list <- list()
# Only the first listed file is read (the loop bound is deliberately 1).
for (i in seq_len(1)) {
  data_1 <- read.delim(dir_path_name[i], header = TRUE, stringsAsFactors = FALSE)
  #print(dim(data_1)) #[1] 65982 117
  data_list[[i]] <- data_1
}
#View(data_1)
# Stack everything that was read into a single data frame.
data_2 <- do.call("rbind", data_list)
#data_2 <- unique(data_2)
#dim(data_2) #[1] 65982 117
#colnames(data_2)
# Columns are picked by position. Judging by the head() output recorded further
# down in this script they are Protocol.Name, Sample.ID, Sample.Data.Type,
# AC50..uM., CC.v2, Efficacy, Structure, Sample.Name, Primary.MOA and
# Mapping.ID -- TODO confirm against colnames(data_2) if the export changes.
data_3 <- data_2[, c(1:6, 8, 19, 57, 52)]
#data_3 <- unique(data_3)
#dim(data_3) #[1] 65982 10
#View(data_3)
#data_3$NCGC.Unique.ID <- NULL
#length(unique(data_3$Mapping.ID)) #[1] 435970
#table(data_3$Protocol.Name) #
# Long-format counts: Var1 = Protocol.Name, Var2 = Sample.Data.Type, Freq = n.
data_4 <- data.frame(table(data_3$Protocol.Name, data_3$Sample.Data.Type))
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Reshape the long count table to one row per protocol (Var1) with one column
# per sample data type (Var2). pivot_wider() replaces the superseded spread();
# as.data.frame() keeps data_4 a plain data frame as before.
data_4 <- as.data.frame(pivot_wider(data_4, names_from = Var2, values_from = Freq))
# The file name keeps its historical "<date>--GPCR_65982_stat.csv" form.
write.csv(data_4, paste0(dir_path, Sys.Date(), "-", "-GPCR_65982_stat.csv"), row.names = FALSE)
# Drop rows without a structure. The is.na() guard matters: `NA != ""` is NA,
# and indexing rows with NA would insert all-NA phantom rows instead of
# removing the row.
data_3 <- data_3[!is.na(data_3$Structure) & data_3$Structure != "", ]
#sum(data_3$Structure == "") #0
####################################cal curve
#head(data_3)
# Work on a copy of the structure-filtered table.
data <- data_3
#################################
# Restrict the ranking run to the first `max_rows` rows. head() stops at the
# end of the table, whereas indexing with 1:10000 would pad a shorter table
# with all-NA rows.
max_rows <- 10000
data <- head(data, max_rows)
# Numeric placeholder column that the ranking step fills in; it must stay the
# last column of `data`.
data$curve_rank <- NA_real_
head(data)
## Protocol.Name Sample.ID Sample.Data.Type AC50..uM. CC.v2 Efficacy
## 109 tox21-htr2a-v1 NCGC00261584-01 agonist NA 4 0.000
## 110 tox21-htr2a-v1 NCGC00261584-01 agonist NA 4 0.000
## 111 tox21-htr2a-v1 NCGC00261584-01 agonist NA 4 0.000
## 112 tox21-htr2a-v1 NCGC00261400-01 agonist NA 4 0.000
## 113 tox21-htr2a-v1 NCGC00261400-01 agonist NA 4 0.000
## 114 tox21-htr2a-v1 NCGC00261400-01 agonist NA 4 -5.428
## Structure
## 109 Cl.CC(CN1C2=C(SC3=C1C=CC=C3)C=CC=C2)N(C)C
## 110 Cl.CC(CN1C2=C(SC3=C1C=CC=C3)C=CC=C2)N(C)C
## 111 Cl.CC(CN1C2=C(SC3=C1C=CC=C3)C=CC=C2)N(C)C
## 112 [H]N(O)C(=O)C(C)(C)N([H])S(=O)(=O)C1=CC=C(OC2=CC=C(F)C=C2)C=C1
## 113 [H]N(O)C(=O)C(C)(C)N([H])S(=O)(=O)C1=CC=C(OC2=CC=C(F)C=C2)C=C1
## 114 [H]N(O)C(=O)C(C)(C)N([H])S(=O)(=O)C1=CC=C(OC2=CC=C(F)C=C2)C=C1
## Sample.Name Primary.MOA Mapping.ID curve_rank
## 109 Promethazine hydrochloride PWWVAXIEGOYWEE NA
## 110 Promethazine hydrochloride PWWVAXIEGOYWEE NA
## 111 Promethazine hydrochloride PWWVAXIEGOYWEE NA
## 112 CP-471474 QCOQJYRPDUMCNP NA
## 113 CP-471474 QCOQJYRPDUMCNP NA
## 114 CP-471474 QCOQJYRPDUMCNP NA
#View(data)
# Translate curve classes (CC.v2) and efficacies into signed curve ranks.
#
#   class   condition          rank  |  class   condition           rank
#   1.1                          9   |  -1.1                         -9
#   1.2     efficacy >  50       8   |  -1.2    efficacy <  -50      -8
#   2.1                          7   |  -2.1                         -7
#   1.2     efficacy <= 50       6   |  -1.2    efficacy >= -50      -6
#   2.2     efficacy >  50       5   |  -2.2    efficacy <  -50      -5
#   2.2     efficacy <= 50       4   |  -2.2    efficacy >= -50      -4
#   1.3, 1.4                     3   |  -1.3, -1.4                   -3
#   2.3, 2.4, 3                  2   |  -2.3, -2.4, -3               -2
#   5                            1   |
#   4                            0   |
#
# Arguments:
#   curve_class  numeric vector of curve classes.
#   efficacy     numeric vector of efficacies, same length as `curve_class`.
# Returns:
#   A numeric vector of ranks, one per input element. The rank is NA when the
#   class or the efficacy is missing, or when the class is not in the table.
#
# The previous row-by-row loop stopped with "missing value where TRUE/FALSE
# needed" as soon as a row had a missing class (or a missing efficacy for an
# efficacy-dependent class), and it ranked efficacy-independent classes even
# when the efficacy was missing. Here every row with a missing input gets NA.
curve_rank_from_class <- function(curve_class, efficacy) {
  stopifnot(length(curve_class) == length(efficacy))
  curve_class <- as.numeric(curve_class)
  efficacy <- as.numeric(efficacy)

  # Rows where both inputs are present; all others keep rank NA.
  usable <- !is.na(curve_class) & !is.na(efficacy)

  # Classes whose rank does not depend on efficacy. Matching is exact, like
  # the `==` comparisons it replaces: the classes are short decimal literals.
  fixed_class <- c(1.1, 2.1, 1.3, 1.4, 2.3, 2.4, 3, 5, 4,
                   -2.3, -2.4, -3, -1.3, -1.4, -2.1, -1.1)
  fixed_rank <- c(9, 7, 3, 3, 2, 2, 2, 1, 0,
                  -2, -2, -2, -3, -3, -7, -9)

  curve_rank <- rep(NA_real_, length(curve_class))
  curve_rank[usable] <- fixed_rank[match(curve_class[usable], fixed_class)]

  # `usable &` comes first so the mask is FALSE (never NA) for incomplete rows,
  # which keeps the logical subscripts below free of NA.
  is_class <- function(value) usable & curve_class == value

  # Classes whose rank is split at an efficacy of +/-50.
  curve_rank[is_class(1.2) & efficacy > 50] <- 8
  curve_rank[is_class(1.2) & efficacy <= 50] <- 6
  curve_rank[is_class(2.2) & efficacy > 50] <- 5
  curve_rank[is_class(2.2) & efficacy <= 50] <- 4
  curve_rank[is_class(-2.2) & efficacy >= -50] <- -4
  curve_rank[is_class(-2.2) & efficacy < -50] <- -5
  curve_rank[is_class(-1.2) & efficacy >= -50] <- -6
  curve_rank[is_class(-1.2) & efficacy < -50] <- -8

  curve_rank
}

# Columns are addressed by exact name rather than by regex lookup, so a second
# column whose name merely contains "Efficacy" or "CC.v2" cannot interfere.
stopifnot(all(c("CC.v2", "Efficacy", "curve_rank") %in% colnames(data)))
data$curve_rank <- curve_rank_from_class(data[["CC.v2"]], data[["Efficacy"]])
#sum(is.na(data[,grep("CC.v2", colnames(data))]))
# Count the rows per curve rank; table() leaves NA ranks out by default.
table(data[["curve_rank"]])
##
## -9 -8 -7 -5 -4 -3 -2 0 1 2 3 4 5 6 7 8
## 3 1 7 3 1 33 71 8675 3 653 66 59 49 46 230 21
## 9
## 79
#colnames(data)
#data[, c(5,6,11)]
#View(data)
# Save the ranked table next to the input as "<date>--data_curverank.csv".
write.csv(
  x = data,
  file = paste0(dir_path, Sys.Date(), "--data_curverank.csv"),
  row.names = FALSE
)