rm(list = ls())
###############################input data 
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
dir_path_name <- dir(dir_path,pattern = ".*.",full.names = T)
#dir_path_name
data_1 <- read.delim(grep("data.txt",dir_path_name,value = T),header = T,stringsAsFactors = F)
#View(head(data_1))
data_1 <- data_1[1:100,]
#data_1[] <- lapply(data_1, as.numeric)
dim(data_1) #[1] 100 731
## [1] 100 731
table(data_1$Outcome) # 0  1 87 13 
## 
##  0  1 
## 87 13
#View(data_1)
colnames(data_1)[1:10]
##  [1] "Mapping.ID"                          
##  [2] "atom.element_main_group"             
##  [3] "atom.element_metal_group_I_II"       
##  [4] "atom.element_metal_group_III"        
##  [5] "atom.element_metal_metalloid"        
##  [6] "atom.element_metal_poor_metal"       
##  [7] "atom.element_metal_transistion_metal"
##  [8] "atom.element_noble_gas"              
##  [9] "bond.C.N_cyano_acylcyanide"          
## [10] "bond.C.N_cyano_cyanamide"
data_1 <- data_1[,-1]
colnames(data_1)[730] #[1] "Outcome"
## [1] "Outcome"
colnames(data_1)[730] <- "M_NAME"
###########################################################split
data_active <- data_1[grep("1",data_1$M_NAME),]
data_nonactive <- data_1[grep("0",data_1$M_NAME),]
dim(data_active);dim(data_nonactive) #[1]  13 730  [1]  87 730
## [1]  13 730
## [1]  87 730
data_active$M_NAME
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1
##both
data_all <- data_1
data_all$M_NAME[grep("0",data_1$M_NAME)] <- paste0("nonactive_",1:nrow(data_nonactive))
data_all$M_NAME[grep("1",data_1$M_NAME)] <- paste0("active_",1:nrow(data_active))
length(grep("nonactive",data_all$M_NAME)) #[1] 87
## [1] 87
dim(data_all) #[1] 100 730
## [1] 100 730
rownames(data_all) <- data_all$M_NAME
#View(head(data_all))
data_all$M_NAME <- NULL
dim(data_all) #[1] 729 100
## [1] 100 729
data_all <- data.frame(t(data_all))
dim(data_all) #[1]  729 7214
## [1] 729 100
#View(head(data_all))
length(grep("^active",colnames(data_all))) #[1] 13
## [1] 13
#######################################################method_EXAMPLE
tanimoto <- function(x, similarity=F) {
  res<-sapply(x, function(x1){
    sapply(x, function(x2) {i=length(which(x1 & x2)) / length(which(x1 | x2)); ifelse(is.na(i), 0, i)})
  })
  if(similarity==T) return(res)
  else return(1-res)
}

x <- data.frame(Samp1=c(0,0,0,1,0,0,1,1,1), Samp2=c(1,1,1,1,1,1,0,0,1))
tanimoto(x,similarity=T) #
##           Samp1     Samp2
## Samp1 1.0000000 0.2222222
## Samp2 0.2222222 1.0000000
######################################################################TONIMOTO
str(data_all)
## 'data.frame':    729 obs. of  100 variables:
##  $ nonactive_1 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_2 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_3 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_1    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_4 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_5 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_6 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_7 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_8 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_9 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_10: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_11: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_12: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_13: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_14: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_2    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_15: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_16: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_17: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nonactive_18: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_19: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_20: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_21: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_22: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_23: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_24: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_25: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_26: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_27: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_28: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_29: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_3    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_30: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_31: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_32: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nonactive_33: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_34: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_35: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_36: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_37: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_38: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_39: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_40: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_41: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nonactive_42: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_43: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_44: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_45: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_46: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_47: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_48: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_49: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_50: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_4    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_51: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_52: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_5    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_53: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_54: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_55: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_56: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_6    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_7    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_57: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_58: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_59: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_60: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_61: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_62: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_63: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_64: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_65: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_66: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_67: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_8    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_68: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_9    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_69: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_70: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_71: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_72: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_73: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_10   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_74: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_75: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_76: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_77: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_78: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_79: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_80: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_81: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_82: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_11   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_83: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_84: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ active_12   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_85: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_86: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonactive_87: int  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
data_tanimoto <- tanimoto(data_all,similarity=T) #
dim(data_tanimoto) #[1] 100 100
## [1] 100 100
#View(data_tanimoto)
#View(data_tanimoto)
#data_tanimoto[upper.tri(data_tanimoto,diag = T)] <- NA ##do not use
diag(data_tanimoto) = NA
str(data_tanimoto)
##  num [1:100, 1:100] NA 0.25 0.0204 0.1944 0.1071 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:100] "nonactive_1" "nonactive_2" "nonactive_3" "active_1" ...
##   ..$ : chr [1:100] "nonactive_1" "nonactive_2" "nonactive_3" "active_1" ...
class(data_tanimoto)
## [1] "matrix" "array"
data_tanimoto_1 <- data.frame(data_tanimoto)
str(data_tanimoto_1)
## 'data.frame':    100 obs. of  100 variables:
##  $ nonactive_1 : num  NA 0.25 0.0204 0.1944 0.1071 ...
##  $ nonactive_2 : num  0.25 NA 0.0192 0.2432 0.0968 ...
##  $ nonactive_3 : num  0.0204 0.0192 NA 0.0517 0.1395 ...
##  $ active_1    : num  0.1944 0.2432 0.0517 NA 0.2 ...
##  $ nonactive_4 : num  0.1071 0.0968 0.1395 0.2 NA ...
##  $ nonactive_5 : num  0.1818 0.2727 0.0364 0.1905 0.1515 ...
##  $ nonactive_6 : num  0.1034 0.0606 0.1111 0.1026 0.1071 ...
##  $ nonactive_7 : num  0.2941 0.175 0.0877 0.3095 0.1316 ...
##  $ nonactive_8 : num  0 0 0.25 0.132 0.292 ...
##  $ nonactive_9 : num  0.1379 0.125 0.0408 0.1 0.0667 ...
##  $ nonactive_10: num  0.0606 0.027 0.325 0.0952 0.2593 ...
##  $ nonactive_11: num  0.2 0.258 0.149 0.306 0.129 ...
##  $ nonactive_12: num  0.25 0.5 0.0213 0.1714 0.16 ...
##  $ nonactive_13: num  0.0851 0.08 0.3529 0.2157 0.0638 ...
##  $ nonactive_14: num  0.2424 0.2222 0.0536 0.3 0.1429 ...
##  $ active_2    : num  0.0645 0.1613 0.02 0.2222 0.0323 ...
##  $ nonactive_15: num  0.025 0.0476 0.4048 0.0833 0.1111 ...
##  $ nonactive_16: num  0.1562 0.2121 0.0577 0.2308 0.3846 ...
##  $ nonactive_17: num  0.0976 0.0909 0.125 0.098 0.0233 ...
##  $ nonactive_18: num  0.3043 0.2692 0.0435 0.2812 0.16 ...
##  $ nonactive_19: num  0.0909 0.1143 0.125 0.0682 0.1667 ...
##  $ nonactive_20: num  0.1 0.0909 0.0851 0.2222 0.0323 ...
##  $ nonactive_21: num  0.105 0.184 0.111 0.325 0.414 ...
##  $ nonactive_22: num  0.0625 0.0385 0.4375 0.1273 0.0638 ...
##  $ nonactive_23: num  0.025 0.0233 0.6857 0.0612 0.1765 ...
##  $ nonactive_24: num  0.0465 0.0667 0.4318 0.1429 0.0732 ...
##  $ nonactive_25: num  0 0 0.0513 0 0.1579 ...
##  $ nonactive_26: num  0.219 0.135 0.188 0.136 0.118 ...
##  $ nonactive_27: num  0.4 0.24 0.0222 0.2581 0.125 ...
##  $ nonactive_28: num  0.167 0.118 0.152 0.122 0.259 ...
##  $ nonactive_29: num  0.075 0.0698 0.1091 0.3171 0.2727 ...
##  $ active_3    : num  0.36 0.1562 0.0196 0.2857 0.1379 ...
##  $ nonactive_30: num  0.0714 0.0645 0.0667 0.1714 0.16 ...
##  $ nonactive_31: num  0.1739 0.1111 0.0227 0.1176 0.1304 ...
##  $ nonactive_32: num  0.1304 0.0784 0.1864 0.1455 0.2439 ...
##  $ nonactive_33: num  0.0455 0.0833 0 0.1724 0 ...
##  $ nonactive_34: num  0 0.0333 0.0222 0.1818 0.1739 ...
##  $ nonactive_35: num  0.194 0.122 0.173 0.2 0.273 ...
##  $ nonactive_36: num  0.0357 0.1429 0 0.1765 0 ...
##  $ nonactive_37: num  0.1351 0.0465 0.3333 0.1277 0.2059 ...
##  $ nonactive_38: num  0.16 0.1034 0.0444 0.0526 0.037 ...
##  $ nonactive_39: num  0.0645 0.1613 0.0851 0.2571 0.0667 ...
##  $ nonactive_40: num  0.0435 0.0385 0.1667 0.0606 0.2105 ...
##  $ nonactive_41: num  0 0.0357 0 0.0278 0 ...
##  $ nonactive_42: num  0.025 0.0233 0.4048 0.1556 0.2903 ...
##  $ nonactive_43: num  0.05 0.0435 0.0263 0.0323 0.0526 ...
##  $ nonactive_44: num  0.1034 0.1667 0.0417 0.1316 0.069 ...
##  $ nonactive_45: num  0.346 0.188 0.06 0.15 0.308 ...
##  $ nonactive_46: num  0.1176 0.1389 0.0182 0.225 0.1212 ...
##  $ nonactive_47: num  0.2759 0.25 0.0784 0.2632 0.0909 ...
##  $ nonactive_48: num  0.037 0.0333 0.2105 0.0833 0.08 ...
##  $ nonactive_49: num  0.167 0.226 0.06 0.211 0.259 ...
##  $ nonactive_50: num  0.0465 0.0213 0.2857 0.098 0.1282 ...
##  $ active_4    : num  0.1053 0.125 0.0909 0.325 0.2812 ...
##  $ nonactive_51: num  0.0769 0.1071 0.122 0.1818 0.1739 ...
##  $ nonactive_52: num  0 0 0.2821 0.0488 0.1071 ...
##  $ active_5    : num  0.0968 0.0571 0.0196 0.2162 0.1379 ...
##  $ nonactive_53: num  0.04 0.0357 0.0233 0.0571 0.0417 ...
##  $ nonactive_54: num  0.125 0.114 0.08 0.205 0.346 ...
##  $ nonactive_55: num  0.29 0.194 0.16 0.186 0.219 ...
##  $ nonactive_56: num  0.0588 0.0364 0.4694 0.1207 0.1042 ...
##  $ active_6    : num  0.1714 0.0732 0.0351 0.3 0.1429 ...
##  $ active_7    : num  0.2059 0.1579 0.0536 0.3 0.2121 ...
##  $ nonactive_57: num  0.0213 0.02 0.3469 0.0727 0.093 ...
##  $ nonactive_58: num  0.0714 0.0213 0.3404 0.0769 0.1 ...
##  $ nonactive_59: num  0.1304 0.1154 0.0233 0.1562 0.087 ...
##  $ nonactive_60: num  0.0323 0.0294 0.2821 0.0488 0.1481 ...
##  $ nonactive_61: num  0 0.0385 0.05 0.0606 0.15 ...
##  $ nonactive_62: num  0.0345 0.0645 0.0667 0.1714 0 ...
##  $ nonactive_63: num  0.0698 0.0426 0.4545 0.14 0.125 ...
##  $ nonactive_64: num  0.1053 0.0465 0.4286 0.1042 0.0789 ...
##  $ nonactive_65: num  0.2069 0.2258 0.0192 0.3529 0.0968 ...
##  $ nonactive_66: num  0 0.08 0 0.0606 0 ...
##  $ nonactive_67: num  0.1351 0.1538 0.0169 0.2619 0.0513 ...
##  $ active_8    : num  0.0333 0.0968 0 0.2 0 ...
##  $ nonactive_68: num  0.1071 0.0968 0.1136 0.1351 0.1538 ...
##  $ active_9    : num  0.1944 0.0698 0.0339 0.1739 0.1053 ...
##  $ nonactive_69: num  0.0816 0.0566 0.4792 0.1228 0.1304 ...
##  $ nonactive_70: num  0.0952 0.04 0 0.0303 0 ...
##  $ nonactive_71: num  0.1154 0.0323 0 0.1765 0 ...
##  $ nonactive_72: num  0.04 0.0741 0.0233 0.0882 0.0417 ...
##  $ nonactive_73: num  0.1 0.0909 0.2143 0.1282 0.0667 ...
##  $ active_10   : num  0.1034 0.129 0.0638 0.3438 0.1481 ...
##  $ nonactive_74: num  0.0303 0.0571 0.0612 0.0714 0.0312 ...
##  $ nonactive_75: num  0.214 0.156 0.156 0.125 0.1 ...
##  $ nonactive_76: num  0.2903 0.303 0.0357 0.2143 0.1471 ...
##  $ nonactive_77: num  0.0357 0.1034 0 0.1111 0 ...
##  $ nonactive_78: num  0.0345 0.0645 0.0213 0.1081 0.0357 ...
##  $ nonactive_79: num  0.0286 0.0541 0.0385 0.093 0.0294 ...
##  $ nonactive_80: num  0.1852 0.1667 0.0638 0.2647 0.2917 ...
##  $ nonactive_81: num  0 0 0.2308 0.0513 0.0357 ...
##  $ nonactive_82: num  0.1667 0.1515 0.0192 0.2105 0.0303 ...
##  $ active_11   : num  0.1111 0.375 0.0435 0.0789 0.0741 ...
##  $ nonactive_83: num  0.2 0.375 0.0213 0.3667 0.1154 ...
##  $ nonactive_84: num  0.1935 0.2121 0.0185 0.3333 0.0909 ...
##  $ active_12   : num  0.3462 0.2258 0.0392 0.3143 0.1724 ...
##  $ nonactive_85: num  0.146 0.087 0.182 0.137 0.438 ...
##  $ nonactive_86: num  0.1852 0.0938 0.087 0.2647 0.069 ...
##  $ nonactive_87: num  0.1111 0.0312 0.2 0.1081 0.0741 ...
##   [list output truncated]
max(data_tanimoto_1$nonactive_1,na.rm = T)
## [1] 0.4
#View(data_tanimoto_1[1:10,1:10])
write.csv(data_tanimoto_1,paste0(dir_path,Sys.Date(),"-data_tanimoto_all.csv"),row.names = T)
##########1.1_active - active
active_2 <- data_tanimoto_1[grep("^active",rownames(data_tanimoto_1)),grep("^active",colnames(data_tanimoto_1))]
dim(active_2) #[1] 13 13
## [1] 13 13
max(active_2$active_1,na.rm = TRUE) #[1] 0.6551724
## [1] 0.34375
active_2[nrow(active_2)+1,] <- apply(active_2,2,function(x) max(x, na.rm = TRUE))
active_2_1 <- data.frame(t(active_2[nrow(active_2),]))
dim(active_2_1) #[1] 446   1
## [1] 13  1
head(active_2_1) 
##                X14
## active_1 0.3437500
## active_2 0.3548387
## active_3 0.3870968
## active_4 0.3548387
## active_5 0.3437500
## active_6 0.3684211
active_2_1$source <- "active"
##########1.2_nonactive - nonactive
nonactive_2 <- data_tanimoto_1[grep("^nonactive",rownames(data_tanimoto_1)),grep("^nonactive",colnames(data_tanimoto_1))]
dim(nonactive_2) #[1] 6539 6539
## [1] 87 87
#View(nonactive_2[1:10,1:10])
nonactive_2[nrow(nonactive_2)+1,] <- apply(nonactive_2,2,function(x) max(x, na.rm = TRUE))
nonactive_2_1 <-  data.frame(t(nonactive_2[nrow(nonactive_2),]))
nonactive_2_1$source <- "nonactive"
dim(nonactive_2_1) #[1] 6539    2
## [1] 87  2
head(nonactive_2_1)
##                   X88    source
## nonactive_1 0.4000000 nonactive
## nonactive_2 0.5000000 nonactive
## nonactive_3 0.6857143 nonactive
## nonactive_4 0.4375000 nonactive
## nonactive_5 0.3548387 nonactive
## nonactive_6 0.3200000 nonactive
#####1.3 active vs. nonactive
active_vs_nonactive <- data_tanimoto_1[grep("^nonactive",rownames(data_tanimoto_1)),grep("^active",colnames(data_tanimoto_1))]
dim(active_vs_nonactive) #[1] 6539  675
## [1] 87 13
active_vs_nonactive[nrow(active_vs_nonactive)+1,] <- apply(active_vs_nonactive,2,function(x) max(x, na.rm = TRUE))
active_vs_nonactive_1 <-  data.frame(t(active_vs_nonactive[nrow(active_vs_nonactive),]))
active_vs_nonactive_1$source <- "active_vs_nonactive"
dim(active_vs_nonactive_1) #[1] 675   2
## [1] 13  2
head(active_vs_nonactive_1)
##                X88              source
## active_1 0.3666667 active_vs_nonactive
## active_2 0.3870968 active_vs_nonactive
## active_3 0.5000000 active_vs_nonactive
## active_4 0.4444444 active_vs_nonactive
## active_5 0.4062500 active_vs_nonactive
## active_6 0.3076923 active_vs_nonactive
#####1.4  nonactive vs.active 
nonactive_vs_active <- data_tanimoto_1[grep("^active",rownames(data_tanimoto_1)),grep("^nonactive",colnames(data_tanimoto_1))]
dim(nonactive_vs_active) #[1]  675 6539
## [1] 13 87
nonactive_vs_active[nrow(nonactive_vs_active)+1,] <- apply(nonactive_vs_active,2,function(x) max(x, na.rm = TRUE))
nonactive_vs_active_1 <-  data.frame(t(nonactive_vs_active[nrow(nonactive_vs_active),]))
nonactive_vs_active_1$source <- "nonactive_vs_active"
dim(nonactive_vs_active_1) #[1] 6539    2
## [1] 87  2
head(nonactive_vs_active_1)
##                    X14              source
## nonactive_1 0.36000000 nonactive_vs_active
## nonactive_2 0.37500000 nonactive_vs_active
## nonactive_3 0.09090909 nonactive_vs_active
## nonactive_4 0.28125000 nonactive_vs_active
## nonactive_5 0.31250000 nonactive_vs_active
## nonactive_6 0.28571429 nonactive_vs_active
###2.1 merge
head(active_2_1)
##                X14 source
## active_1 0.3437500 active
## active_2 0.3548387 active
## active_3 0.3870968 active
## active_4 0.3548387 active
## active_5 0.3437500 active
## active_6 0.3684211 active
colnames(active_2_1) <- colnames(nonactive_2_1) <- 
  colnames(active_vs_nonactive_1) <- colnames(nonactive_vs_active_1) <- 
  c("tanimoto","source")
dim(active_2_1);dim(nonactive_2_1);dim(active_vs_nonactive_1);dim(nonactive_vs_active_1)
## [1] 13  2
## [1] 87  2
## [1] 13  2
## [1] 87  2
#[1] 675   2 [1] 6539    2 [1] 675   2 [1] 6539    2
data_tanimo_all <- rbind(active_2_1, nonactive_2_1,active_vs_nonactive_1, nonactive_vs_active_1)
#############################plot
dim(data_tanimo_all) #[1] 13986     2
## [1] 200   2
unique(data_tanimo_all$source)
## [1] "active"              "nonactive"           "active_vs_nonactive"
## [4] "nonactive_vs_active"
data_tanimo_all$source <- gsub("active_vs_nonactive","nonactive_vs_active",data_tanimo_all$source)
data_stat <- aggregate(data_tanimo_all[1],by = list(source = data_tanimo_all$source), FUN = summary)
write.csv(data_stat,paste0(dir_path,Sys.Date(),"-data_for_stat.csv"),row.names = T)
write.csv(data_tanimo_all,paste0(dir_path,Sys.Date(),"-data_for_plot.csv"),row.names = T)
#############################plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.0
max(data_tanimo_all$tanimoto) #[1] 1
## [1] 0.9047619
a_1 <- unique(data_tanimo_all$source)
a_2 <- gsub("^active$","Activity vs. Activity",a_1)
a_3 <- gsub("^nonactive$","Non-activity vs.Non-activity",a_2)
a_4 <- gsub("^nonactive_vs_active$","Activity vs. Non-activity",a_3)
##
data_tanimo_all$source <- gsub("^active$","Activity vs Activity",data_tanimo_all$source)
data_tanimo_all$source <- gsub("^nonactive$","Non-activity vs Non-activity",data_tanimo_all$source)
data_tanimo_all$source <- gsub("^nonactive_vs_active$","Activity vs Non-activity",data_tanimo_all$source)
unique(data_tanimo_all$source)
## [1] "Activity vs Activity"         "Non-activity vs Non-activity"
## [3] "Activity vs Non-activity"
head(data_tanimo_all)
##           tanimoto               source
## active_1 0.3437500 Activity vs Activity
## active_2 0.3548387 Activity vs Activity
## active_3 0.3870968 Activity vs Activity
## active_4 0.3548387 Activity vs Activity
## active_5 0.3437500 Activity vs Activity
## active_6 0.3684211 Activity vs Activity
str(data_tanimo_all)
## 'data.frame':    200 obs. of  2 variables:
##  $ tanimoto: num  0.344 0.355 0.387 0.355 0.344 ...
##  $ source  : chr  "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" ...
data_tanimo_all$source <- factor(data_tanimo_all$source,levels = unique(data_tanimo_all$source))
######################plot
########################################adjust =1
ggplot(data_tanimo_all, aes(x = tanimoto, color = source)) +
  geom_density(alpha = .3,adjust =1,fill="white",size = 1) +
  labs(x = "Tanimoto coefficient", y = "Density") +
  scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8")) +
  theme(panel.background = element_rect(fill = "white",
                                        colour = "white",
                                        size = 3, linetype = "solid"),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        legend.position = c(0.27, 0.9),
        legend.box = "vertical",
        legend.title = element_blank(),
        axis.line =element_line(colour = "black", 
                                size = 0.5, linetype = "solid"),
        legend.background = element_rect(fill="white",
                                         size=0.5, linetype="solid", 
                                         colour ="white"),
        legend.text=element_text(size=13,family = "sans"),
        #axis.text.x = element_blank(),
        axis.text.x = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.text.y = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.title.x = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.title.y = element_text(colour="black",size=16,face="plain",family = "sans")) +
  scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 4.5 ,by = 1),limits = c(0,4.5)) +
  scale_x_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1)) 

#  annotate(geom="text", x=0.86, y=0.98, label="AChE-ToxPrint",
#           color="black",size=5,face="plain",family = "sans")
ggsave(filename = paste0(Sys.Date(),"-bche_activity_adjust_1.tif"), plot = last_plot(),
       device = "tiff", path = dir_path,
       width = 15, height = 15, units = "cm",
       dpi = 300, limitsize = TRUE)
#############adjust =3
ggplot(data_tanimo_all, aes(x = tanimoto, color = source)) +
  geom_density(alpha = .3,adjust = 3,fill="white",size = 1) +
  labs(x = "Tanimoto coefficient", y = "Density") +
  scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8")) +
  theme(panel.background = element_rect(fill = "white",
                                        colour = "white",
                                        size = 3, linetype = "solid"),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        legend.position = c(0.27, 0.9),
        legend.box = "vertical",
        legend.title = element_blank(),
        axis.line =element_line(colour = "black", 
                                size = 0.5, linetype = "solid"),
        legend.background = element_rect(fill="white",
                                         size=0.5, linetype="solid", 
                                         colour ="white"),
        legend.text=element_text(size=13,family = "sans"),
        #axis.text.x = element_blank(),
        axis.text.x = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.text.y = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.title.x = element_text(colour="black",size=16,face="plain",family = "sans"),
        axis.title.y = element_text(colour="black",size=16,face="plain",family = "sans")) +
  scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 4,by = 1),limits = c(0,4)) +
  scale_x_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1)) 

#  annotate(geom="text", x=0.86, y=0.98, label="AChE-ToxPrint",
#           color="black",size=5,face="plain",family = "sans")
ggsave(filename = paste0(Sys.Date(),"-bche_activity-adjust_3.tif"), plot = last_plot(),
       device = "tiff", path = dir_path,
       width = 15, height = 15, units = "cm",
       dpi = 300, limitsize = TRUE)
####violin
unique(data_tanimo_all$source)
## [1] Activity vs Activity         Non-activity vs Non-activity
## [3] Activity vs Non-activity    
## 3 Levels: Activity vs Activity ... Activity vs Non-activity
data_violin <- data_tanimo_all
str(data_tanimo_all)
## 'data.frame':    200 obs. of  2 variables:
##  $ tanimoto: num  0.344 0.355 0.387 0.355 0.344 ...
##  $ source  : Factor w/ 3 levels "Activity vs Activity",..: 1 1 1 1 1 1 1 1 1 1 ...
data_violin$source  <- as.character(data_violin $source)
str(data_violin)
## 'data.frame':    200 obs. of  2 variables:
##  $ tanimoto: num  0.344 0.355 0.387 0.355 0.344 ...
##  $ source  : chr  "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" ...
unique(data_violin$source)
## [1] "Activity vs Activity"         "Non-activity vs Non-activity"
## [3] "Activity vs Non-activity"
data_violin$source <- gsub("vs","\n vs \n",data_violin $source)
?geom_violin
## starting httpd help server ... done
data_violin$source <-  factor(data_violin$source,levels = unique(data_violin$source)[c(3,1,2)])
ggplot(data_violin, aes(x = source,y = tanimoto,fill = source)) + 
  geom_violin(trim=T)+
  labs(x = "", y = "Tanimoto Coefficient") +
  geom_boxplot(width=0.2, outlier.alpha = 1,outlier.size =0.8,outlier.colour = "black") +
  theme_classic()+
  scale_fill_brewer(palette="Accent") +
  theme(plot.margin=unit(c(0,0,1,0),"cm"),
        legend.position = "none",
        panel.background =  element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        strip.text.x = element_text(colour = "black", face = "plain",family = "sans"),
        axis.text.x = element_text(angle = 0,vjust = 0.8,size=12,hjust = 0.5,colour = "black",family = "sans"),
        strip.text.y = element_blank(),
        strip.placement = "outside",
        panel.spacing = unit(0.2, "lines"),
        axis.title = element_text(color="black", size=12, face="plain",family = "sans"),
        axis.text.y  = element_text(color="black", size=12, face="plain",family = "sans"),
        plot.title = element_text(hjust = 0.5,color="black", size= 12, face="bold")) +
  scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1)) 

# scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8")) 
#geom_hline(yintercept=20, linetype="dashed", color = "red")
#min(data_4$value)
#max(data_4$value)
ggsave(filename = paste0(Sys.Date(),"-bche_activity-VIOLIN_2.tif"), 
       plot = last_plot(), device = "tiff", path = dir_path,
       scale = 1, width = 10, height = 12, units = "cm",compression = "lzw",
       dpi = 300, limitsize = TRUE)