rm(list = ls())
###############################input data
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\"
dir_path_name <- dir(dir_path,pattern = ".*.",full.names = T)
#dir_path_name
data_1 <- read.delim(grep("data.txt",dir_path_name,value = T),header = T,stringsAsFactors = F)
#View(head(data_1))
data_1 <- data_1[1:100,]
#data_1[] <- lapply(data_1, as.numeric)
dim(data_1) #[1] 100 731
## [1] 100 731
table(data_1$Outcome) # 0 1 87 13
##
## 0 1
## 87 13
#View(data_1)
colnames(data_1)[1:10]
## [1] "Mapping.ID"
## [2] "atom.element_main_group"
## [3] "atom.element_metal_group_I_II"
## [4] "atom.element_metal_group_III"
## [5] "atom.element_metal_metalloid"
## [6] "atom.element_metal_poor_metal"
## [7] "atom.element_metal_transistion_metal"
## [8] "atom.element_noble_gas"
## [9] "bond.C.N_cyano_acylcyanide"
## [10] "bond.C.N_cyano_cyanamide"
data_1 <- data_1[,-1]
colnames(data_1)[730] #[1] "Outcome"
## [1] "Outcome"
colnames(data_1)[730] <- "M_NAME"
###########################################################split
data_active <- data_1[grep("1",data_1$M_NAME),]
data_nonactive <- data_1[grep("0",data_1$M_NAME),]
dim(data_active);dim(data_nonactive) #[1] 13 730 [1] 87 730
## [1] 13 730
## [1] 87 730
data_active$M_NAME
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1
##both
data_all <- data_1
data_all$M_NAME[grep("0",data_1$M_NAME)] <- paste0("nonactive_",1:nrow(data_nonactive))
data_all$M_NAME[grep("1",data_1$M_NAME)] <- paste0("active_",1:nrow(data_active))
length(grep("nonactive",data_all$M_NAME)) #[1] 87
## [1] 87
dim(data_all) #[1] 100 730
## [1] 100 730
rownames(data_all) <- data_all$M_NAME
#View(head(data_all))
data_all$M_NAME <- NULL
dim(data_all) #[1] 729 100
## [1] 100 729
data_all <- data.frame(t(data_all))
dim(data_all) #[1] 729 7214
## [1] 729 100
#View(head(data_all))
length(grep("^active",colnames(data_all))) #[1] 13
## [1] 13
#######################################################method_EXAMPLE
tanimoto <- function(x, similarity=F) {
res<-sapply(x, function(x1){
sapply(x, function(x2) {i=length(which(x1 & x2)) / length(which(x1 | x2)); ifelse(is.na(i), 0, i)})
})
if(similarity==T) return(res)
else return(1-res)
}
x <- data.frame(Samp1=c(0,0,0,1,0,0,1,1,1), Samp2=c(1,1,1,1,1,1,0,0,1))
tanimoto(x,similarity=T) #
## Samp1 Samp2
## Samp1 1.0000000 0.2222222
## Samp2 0.2222222 1.0000000
######################################################################TONIMOTO
str(data_all)
## 'data.frame': 729 obs. of 100 variables:
## $ nonactive_1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_10: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_11: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_12: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_13: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_14: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_15: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_16: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_17: int 0 1 0 0 0 0 0 0 0 0 ...
## $ nonactive_18: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_19: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_20: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_21: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_22: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_23: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_24: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_25: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_26: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_27: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_28: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_29: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_30: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_31: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_32: int 0 1 0 0 0 0 0 0 0 0 ...
## $ nonactive_33: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_34: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_35: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_36: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_37: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_38: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_39: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_40: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_41: int 0 1 0 0 0 0 0 0 0 0 ...
## $ nonactive_42: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_43: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_44: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_45: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_46: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_47: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_48: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_49: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_50: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_51: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_52: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_53: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_54: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_55: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_56: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_57: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_58: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_59: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_60: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_61: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_62: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_63: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_64: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_65: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_66: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_67: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_68: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_69: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_70: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_71: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_72: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_73: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_74: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_75: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_76: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_77: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_78: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_79: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_80: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_81: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_82: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_83: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_84: int 0 0 0 0 0 0 0 0 0 0 ...
## $ active_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_85: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_86: int 0 0 0 0 0 0 0 0 0 0 ...
## $ nonactive_87: int 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
data_tanimoto <- tanimoto(data_all,similarity=T) #
dim(data_tanimoto) #[1] 100 100
## [1] 100 100
#View(data_tanimoto)
#View(data_tanimoto)
#data_tanimoto[upper.tri(data_tanimoto,diag = T)] <- NA ##do not use
diag(data_tanimoto) = NA
str(data_tanimoto)
## num [1:100, 1:100] NA 0.25 0.0204 0.1944 0.1071 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:100] "nonactive_1" "nonactive_2" "nonactive_3" "active_1" ...
## ..$ : chr [1:100] "nonactive_1" "nonactive_2" "nonactive_3" "active_1" ...
class(data_tanimoto)
## [1] "matrix" "array"
data_tanimoto_1 <- data.frame(data_tanimoto)
str(data_tanimoto_1)
## 'data.frame': 100 obs. of 100 variables:
## $ nonactive_1 : num NA 0.25 0.0204 0.1944 0.1071 ...
## $ nonactive_2 : num 0.25 NA 0.0192 0.2432 0.0968 ...
## $ nonactive_3 : num 0.0204 0.0192 NA 0.0517 0.1395 ...
## $ active_1 : num 0.1944 0.2432 0.0517 NA 0.2 ...
## $ nonactive_4 : num 0.1071 0.0968 0.1395 0.2 NA ...
## $ nonactive_5 : num 0.1818 0.2727 0.0364 0.1905 0.1515 ...
## $ nonactive_6 : num 0.1034 0.0606 0.1111 0.1026 0.1071 ...
## $ nonactive_7 : num 0.2941 0.175 0.0877 0.3095 0.1316 ...
## $ nonactive_8 : num 0 0 0.25 0.132 0.292 ...
## $ nonactive_9 : num 0.1379 0.125 0.0408 0.1 0.0667 ...
## $ nonactive_10: num 0.0606 0.027 0.325 0.0952 0.2593 ...
## $ nonactive_11: num 0.2 0.258 0.149 0.306 0.129 ...
## $ nonactive_12: num 0.25 0.5 0.0213 0.1714 0.16 ...
## $ nonactive_13: num 0.0851 0.08 0.3529 0.2157 0.0638 ...
## $ nonactive_14: num 0.2424 0.2222 0.0536 0.3 0.1429 ...
## $ active_2 : num 0.0645 0.1613 0.02 0.2222 0.0323 ...
## $ nonactive_15: num 0.025 0.0476 0.4048 0.0833 0.1111 ...
## $ nonactive_16: num 0.1562 0.2121 0.0577 0.2308 0.3846 ...
## $ nonactive_17: num 0.0976 0.0909 0.125 0.098 0.0233 ...
## $ nonactive_18: num 0.3043 0.2692 0.0435 0.2812 0.16 ...
## $ nonactive_19: num 0.0909 0.1143 0.125 0.0682 0.1667 ...
## $ nonactive_20: num 0.1 0.0909 0.0851 0.2222 0.0323 ...
## $ nonactive_21: num 0.105 0.184 0.111 0.325 0.414 ...
## $ nonactive_22: num 0.0625 0.0385 0.4375 0.1273 0.0638 ...
## $ nonactive_23: num 0.025 0.0233 0.6857 0.0612 0.1765 ...
## $ nonactive_24: num 0.0465 0.0667 0.4318 0.1429 0.0732 ...
## $ nonactive_25: num 0 0 0.0513 0 0.1579 ...
## $ nonactive_26: num 0.219 0.135 0.188 0.136 0.118 ...
## $ nonactive_27: num 0.4 0.24 0.0222 0.2581 0.125 ...
## $ nonactive_28: num 0.167 0.118 0.152 0.122 0.259 ...
## $ nonactive_29: num 0.075 0.0698 0.1091 0.3171 0.2727 ...
## $ active_3 : num 0.36 0.1562 0.0196 0.2857 0.1379 ...
## $ nonactive_30: num 0.0714 0.0645 0.0667 0.1714 0.16 ...
## $ nonactive_31: num 0.1739 0.1111 0.0227 0.1176 0.1304 ...
## $ nonactive_32: num 0.1304 0.0784 0.1864 0.1455 0.2439 ...
## $ nonactive_33: num 0.0455 0.0833 0 0.1724 0 ...
## $ nonactive_34: num 0 0.0333 0.0222 0.1818 0.1739 ...
## $ nonactive_35: num 0.194 0.122 0.173 0.2 0.273 ...
## $ nonactive_36: num 0.0357 0.1429 0 0.1765 0 ...
## $ nonactive_37: num 0.1351 0.0465 0.3333 0.1277 0.2059 ...
## $ nonactive_38: num 0.16 0.1034 0.0444 0.0526 0.037 ...
## $ nonactive_39: num 0.0645 0.1613 0.0851 0.2571 0.0667 ...
## $ nonactive_40: num 0.0435 0.0385 0.1667 0.0606 0.2105 ...
## $ nonactive_41: num 0 0.0357 0 0.0278 0 ...
## $ nonactive_42: num 0.025 0.0233 0.4048 0.1556 0.2903 ...
## $ nonactive_43: num 0.05 0.0435 0.0263 0.0323 0.0526 ...
## $ nonactive_44: num 0.1034 0.1667 0.0417 0.1316 0.069 ...
## $ nonactive_45: num 0.346 0.188 0.06 0.15 0.308 ...
## $ nonactive_46: num 0.1176 0.1389 0.0182 0.225 0.1212 ...
## $ nonactive_47: num 0.2759 0.25 0.0784 0.2632 0.0909 ...
## $ nonactive_48: num 0.037 0.0333 0.2105 0.0833 0.08 ...
## $ nonactive_49: num 0.167 0.226 0.06 0.211 0.259 ...
## $ nonactive_50: num 0.0465 0.0213 0.2857 0.098 0.1282 ...
## $ active_4 : num 0.1053 0.125 0.0909 0.325 0.2812 ...
## $ nonactive_51: num 0.0769 0.1071 0.122 0.1818 0.1739 ...
## $ nonactive_52: num 0 0 0.2821 0.0488 0.1071 ...
## $ active_5 : num 0.0968 0.0571 0.0196 0.2162 0.1379 ...
## $ nonactive_53: num 0.04 0.0357 0.0233 0.0571 0.0417 ...
## $ nonactive_54: num 0.125 0.114 0.08 0.205 0.346 ...
## $ nonactive_55: num 0.29 0.194 0.16 0.186 0.219 ...
## $ nonactive_56: num 0.0588 0.0364 0.4694 0.1207 0.1042 ...
## $ active_6 : num 0.1714 0.0732 0.0351 0.3 0.1429 ...
## $ active_7 : num 0.2059 0.1579 0.0536 0.3 0.2121 ...
## $ nonactive_57: num 0.0213 0.02 0.3469 0.0727 0.093 ...
## $ nonactive_58: num 0.0714 0.0213 0.3404 0.0769 0.1 ...
## $ nonactive_59: num 0.1304 0.1154 0.0233 0.1562 0.087 ...
## $ nonactive_60: num 0.0323 0.0294 0.2821 0.0488 0.1481 ...
## $ nonactive_61: num 0 0.0385 0.05 0.0606 0.15 ...
## $ nonactive_62: num 0.0345 0.0645 0.0667 0.1714 0 ...
## $ nonactive_63: num 0.0698 0.0426 0.4545 0.14 0.125 ...
## $ nonactive_64: num 0.1053 0.0465 0.4286 0.1042 0.0789 ...
## $ nonactive_65: num 0.2069 0.2258 0.0192 0.3529 0.0968 ...
## $ nonactive_66: num 0 0.08 0 0.0606 0 ...
## $ nonactive_67: num 0.1351 0.1538 0.0169 0.2619 0.0513 ...
## $ active_8 : num 0.0333 0.0968 0 0.2 0 ...
## $ nonactive_68: num 0.1071 0.0968 0.1136 0.1351 0.1538 ...
## $ active_9 : num 0.1944 0.0698 0.0339 0.1739 0.1053 ...
## $ nonactive_69: num 0.0816 0.0566 0.4792 0.1228 0.1304 ...
## $ nonactive_70: num 0.0952 0.04 0 0.0303 0 ...
## $ nonactive_71: num 0.1154 0.0323 0 0.1765 0 ...
## $ nonactive_72: num 0.04 0.0741 0.0233 0.0882 0.0417 ...
## $ nonactive_73: num 0.1 0.0909 0.2143 0.1282 0.0667 ...
## $ active_10 : num 0.1034 0.129 0.0638 0.3438 0.1481 ...
## $ nonactive_74: num 0.0303 0.0571 0.0612 0.0714 0.0312 ...
## $ nonactive_75: num 0.214 0.156 0.156 0.125 0.1 ...
## $ nonactive_76: num 0.2903 0.303 0.0357 0.2143 0.1471 ...
## $ nonactive_77: num 0.0357 0.1034 0 0.1111 0 ...
## $ nonactive_78: num 0.0345 0.0645 0.0213 0.1081 0.0357 ...
## $ nonactive_79: num 0.0286 0.0541 0.0385 0.093 0.0294 ...
## $ nonactive_80: num 0.1852 0.1667 0.0638 0.2647 0.2917 ...
## $ nonactive_81: num 0 0 0.2308 0.0513 0.0357 ...
## $ nonactive_82: num 0.1667 0.1515 0.0192 0.2105 0.0303 ...
## $ active_11 : num 0.1111 0.375 0.0435 0.0789 0.0741 ...
## $ nonactive_83: num 0.2 0.375 0.0213 0.3667 0.1154 ...
## $ nonactive_84: num 0.1935 0.2121 0.0185 0.3333 0.0909 ...
## $ active_12 : num 0.3462 0.2258 0.0392 0.3143 0.1724 ...
## $ nonactive_85: num 0.146 0.087 0.182 0.137 0.438 ...
## $ nonactive_86: num 0.1852 0.0938 0.087 0.2647 0.069 ...
## $ nonactive_87: num 0.1111 0.0312 0.2 0.1081 0.0741 ...
## [list output truncated]
max(data_tanimoto_1$nonactive_1,na.rm = T)
## [1] 0.4
#View(data_tanimoto_1[1:10,1:10])
write.csv(data_tanimoto_1,paste0(dir_path,Sys.Date(),"-data_tanimoto_all.csv"),row.names = T)
##########1.1_active - active
active_2 <- data_tanimoto_1[grep("^active",rownames(data_tanimoto_1)),grep("^active",colnames(data_tanimoto_1))]
dim(active_2) #[1] 13 13
## [1] 13 13
max(active_2$active_1,na.rm = TRUE) #[1] 0.6551724
## [1] 0.34375
active_2[nrow(active_2)+1,] <- apply(active_2,2,function(x) max(x, na.rm = TRUE))
active_2_1 <- data.frame(t(active_2[nrow(active_2),]))
dim(active_2_1) #[1] 446 1
## [1] 13 1
head(active_2_1)
## X14
## active_1 0.3437500
## active_2 0.3548387
## active_3 0.3870968
## active_4 0.3548387
## active_5 0.3437500
## active_6 0.3684211
active_2_1$source <- "active"
##########1.2_nonactive - nonactive
nonactive_2 <- data_tanimoto_1[grep("^nonactive",rownames(data_tanimoto_1)),grep("^nonactive",colnames(data_tanimoto_1))]
dim(nonactive_2) #[1] 6539 6539
## [1] 87 87
#View(nonactive_2[1:10,1:10])
nonactive_2[nrow(nonactive_2)+1,] <- apply(nonactive_2,2,function(x) max(x, na.rm = TRUE))
nonactive_2_1 <- data.frame(t(nonactive_2[nrow(nonactive_2),]))
nonactive_2_1$source <- "nonactive"
dim(nonactive_2_1) #[1] 6539 2
## [1] 87 2
head(nonactive_2_1)
## X88 source
## nonactive_1 0.4000000 nonactive
## nonactive_2 0.5000000 nonactive
## nonactive_3 0.6857143 nonactive
## nonactive_4 0.4375000 nonactive
## nonactive_5 0.3548387 nonactive
## nonactive_6 0.3200000 nonactive
#####1.3 active vs. nonactive
active_vs_nonactive <- data_tanimoto_1[grep("^nonactive",rownames(data_tanimoto_1)),grep("^active",colnames(data_tanimoto_1))]
dim(active_vs_nonactive) #[1] 6539 675
## [1] 87 13
active_vs_nonactive[nrow(active_vs_nonactive)+1,] <- apply(active_vs_nonactive,2,function(x) max(x, na.rm = TRUE))
active_vs_nonactive_1 <- data.frame(t(active_vs_nonactive[nrow(active_vs_nonactive),]))
active_vs_nonactive_1$source <- "active_vs_nonactive"
dim(active_vs_nonactive_1) #[1] 675 2
## [1] 13 2
head(active_vs_nonactive_1)
## X88 source
## active_1 0.3666667 active_vs_nonactive
## active_2 0.3870968 active_vs_nonactive
## active_3 0.5000000 active_vs_nonactive
## active_4 0.4444444 active_vs_nonactive
## active_5 0.4062500 active_vs_nonactive
## active_6 0.3076923 active_vs_nonactive
#####1.4 nonactive vs.active
nonactive_vs_active <- data_tanimoto_1[grep("^active",rownames(data_tanimoto_1)),grep("^nonactive",colnames(data_tanimoto_1))]
dim(nonactive_vs_active) #[1] 675 6539
## [1] 13 87
nonactive_vs_active[nrow(nonactive_vs_active)+1,] <- apply(nonactive_vs_active,2,function(x) max(x, na.rm = TRUE))
nonactive_vs_active_1 <- data.frame(t(nonactive_vs_active[nrow(nonactive_vs_active),]))
nonactive_vs_active_1$source <- "nonactive_vs_active"
dim(nonactive_vs_active_1) #[1] 6539 2
## [1] 87 2
head(nonactive_vs_active_1)
## X14 source
## nonactive_1 0.36000000 nonactive_vs_active
## nonactive_2 0.37500000 nonactive_vs_active
## nonactive_3 0.09090909 nonactive_vs_active
## nonactive_4 0.28125000 nonactive_vs_active
## nonactive_5 0.31250000 nonactive_vs_active
## nonactive_6 0.28571429 nonactive_vs_active
###2.1 merge
head(active_2_1)
## X14 source
## active_1 0.3437500 active
## active_2 0.3548387 active
## active_3 0.3870968 active
## active_4 0.3548387 active
## active_5 0.3437500 active
## active_6 0.3684211 active
colnames(active_2_1) <- colnames(nonactive_2_1) <-
colnames(active_vs_nonactive_1) <- colnames(nonactive_vs_active_1) <-
c("tanimoto","source")
dim(active_2_1);dim(nonactive_2_1);dim(active_vs_nonactive_1);dim(nonactive_vs_active_1)
## [1] 13 2
## [1] 87 2
## [1] 13 2
## [1] 87 2
#[1] 675 2 [1] 6539 2 [1] 675 2 [1] 6539 2
data_tanimo_all <- rbind(active_2_1, nonactive_2_1,active_vs_nonactive_1, nonactive_vs_active_1)
#############################plot
dim(data_tanimo_all) #[1] 13986 2
## [1] 200 2
unique(data_tanimo_all$source)
## [1] "active" "nonactive" "active_vs_nonactive"
## [4] "nonactive_vs_active"
data_tanimo_all$source <- gsub("active_vs_nonactive","nonactive_vs_active",data_tanimo_all$source)
data_stat <- aggregate(data_tanimo_all[1],by = list(source = data_tanimo_all$source), FUN = summary)
write.csv(data_stat,paste0(dir_path,Sys.Date(),"-data_for_stat.csv"),row.names = T)
write.csv(data_tanimo_all,paste0(dir_path,Sys.Date(),"-data_for_plot.csv"),row.names = T)
#############################plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.0
max(data_tanimo_all$tanimoto) #[1] 1
## [1] 0.9047619
a_1 <- unique(data_tanimo_all$source)
a_2 <- gsub("^active$","Activity vs. Activity",a_1)
a_3 <- gsub("^nonactive$","Non-activity vs.Non-activity",a_2)
a_4 <- gsub("^nonactive_vs_active$","Activity vs. Non-activity",a_3)
##
data_tanimo_all$source <- gsub("^active$","Activity vs Activity",data_tanimo_all$source)
data_tanimo_all$source <- gsub("^nonactive$","Non-activity vs Non-activity",data_tanimo_all$source)
data_tanimo_all$source <- gsub("^nonactive_vs_active$","Activity vs Non-activity",data_tanimo_all$source)
unique(data_tanimo_all$source)
## [1] "Activity vs Activity" "Non-activity vs Non-activity"
## [3] "Activity vs Non-activity"
head(data_tanimo_all)
## tanimoto source
## active_1 0.3437500 Activity vs Activity
## active_2 0.3548387 Activity vs Activity
## active_3 0.3870968 Activity vs Activity
## active_4 0.3548387 Activity vs Activity
## active_5 0.3437500 Activity vs Activity
## active_6 0.3684211 Activity vs Activity
str(data_tanimo_all)
## 'data.frame': 200 obs. of 2 variables:
## $ tanimoto: num 0.344 0.355 0.387 0.355 0.344 ...
## $ source : chr "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" ...
data_tanimo_all$source <- factor(data_tanimo_all$source,levels = unique(data_tanimo_all$source))
######################plot
########################################adjust =1
ggplot(data_tanimo_all, aes(x = tanimoto, color = source)) +
geom_density(alpha = .3,adjust =1,fill="white",size = 1) +
labs(x = "Tanimoto coefficient", y = "Density") +
scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8")) +
theme(panel.background = element_rect(fill = "white",
colour = "white",
size = 3, linetype = "solid"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = c(0.27, 0.9),
legend.box = "vertical",
legend.title = element_blank(),
axis.line =element_line(colour = "black",
size = 0.5, linetype = "solid"),
legend.background = element_rect(fill="white",
size=0.5, linetype="solid",
colour ="white"),
legend.text=element_text(size=13,family = "sans"),
#axis.text.x = element_blank(),
axis.text.x = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.text.y = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.title.x = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.title.y = element_text(colour="black",size=16,face="plain",family = "sans")) +
scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 4.5 ,by = 1),limits = c(0,4.5)) +
scale_x_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1))

# annotate(geom="text", x=0.86, y=0.98, label="AChE-ToxPrint",
# color="black",size=5,face="plain",family = "sans")
ggsave(filename = paste0(Sys.Date(),"-bche_activity_adjust_1.tif"), plot = last_plot(),
device = "tiff", path = dir_path,
width = 15, height = 15, units = "cm",
dpi = 300, limitsize = TRUE)
#############adjust =3
ggplot(data_tanimo_all, aes(x = tanimoto, color = source)) +
geom_density(alpha = .3,adjust = 3,fill="white",size = 1) +
labs(x = "Tanimoto coefficient", y = "Density") +
scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8")) +
theme(panel.background = element_rect(fill = "white",
colour = "white",
size = 3, linetype = "solid"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = c(0.27, 0.9),
legend.box = "vertical",
legend.title = element_blank(),
axis.line =element_line(colour = "black",
size = 0.5, linetype = "solid"),
legend.background = element_rect(fill="white",
size=0.5, linetype="solid",
colour ="white"),
legend.text=element_text(size=13,family = "sans"),
#axis.text.x = element_blank(),
axis.text.x = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.text.y = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.title.x = element_text(colour="black",size=16,face="plain",family = "sans"),
axis.title.y = element_text(colour="black",size=16,face="plain",family = "sans")) +
scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 4,by = 1),limits = c(0,4)) +
scale_x_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1))

# annotate(geom="text", x=0.86, y=0.98, label="AChE-ToxPrint",
# color="black",size=5,face="plain",family = "sans")
ggsave(filename = paste0(Sys.Date(),"-bche_activity-adjust_3.tif"), plot = last_plot(),
device = "tiff", path = dir_path,
width = 15, height = 15, units = "cm",
dpi = 300, limitsize = TRUE)
####violin
unique(data_tanimo_all$source)
## [1] Activity vs Activity Non-activity vs Non-activity
## [3] Activity vs Non-activity
## 3 Levels: Activity vs Activity ... Activity vs Non-activity
data_violin <- data_tanimo_all
str(data_tanimo_all)
## 'data.frame': 200 obs. of 2 variables:
## $ tanimoto: num 0.344 0.355 0.387 0.355 0.344 ...
## $ source : Factor w/ 3 levels "Activity vs Activity",..: 1 1 1 1 1 1 1 1 1 1 ...
data_violin$source <- as.character(data_violin $source)
str(data_violin)
## 'data.frame': 200 obs. of 2 variables:
## $ tanimoto: num 0.344 0.355 0.387 0.355 0.344 ...
## $ source : chr "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" "Activity vs Activity" ...
unique(data_violin$source)
## [1] "Activity vs Activity" "Non-activity vs Non-activity"
## [3] "Activity vs Non-activity"
data_violin$source <- gsub("vs","\n vs \n",data_violin $source)
?geom_violin
## starting httpd help server ... done
data_violin$source <- factor(data_violin$source,levels = unique(data_violin$source)[c(3,1,2)])
ggplot(data_violin, aes(x = source,y = tanimoto,fill = source)) +
geom_violin(trim=T)+
labs(x = "", y = "Tanimoto Coefficient") +
geom_boxplot(width=0.2, outlier.alpha = 1,outlier.size =0.8,outlier.colour = "black") +
theme_classic()+
scale_fill_brewer(palette="Accent") +
theme(plot.margin=unit(c(0,0,1,0),"cm"),
legend.position = "none",
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
strip.text.x = element_text(colour = "black", face = "plain",family = "sans"),
axis.text.x = element_text(angle = 0,vjust = 0.8,size=12,hjust = 0.5,colour = "black",family = "sans"),
strip.text.y = element_blank(),
strip.placement = "outside",
panel.spacing = unit(0.2, "lines"),
axis.title = element_text(color="black", size=12, face="plain",family = "sans"),
axis.text.y = element_text(color="black", size=12, face="plain",family = "sans"),
plot.title = element_text(hjust = 0.5,color="black", size= 12, face="bold")) +
scale_y_continuous(expand = c(0.01, 0.01),breaks= seq(from = 0 , to = 1 ,by = 0.2),limits = c(0,1))

# scale_color_manual(values=c("#68c4e4", "#e64d49","#d6d9d8"))
#geom_hline(yintercept=20, linetype="dashed", color = "red")
#min(data_4$value)
#max(data_4$value)
ggsave(filename = paste0(Sys.Date(),"-bche_activity-VIOLIN_2.tif"),
plot = last_plot(), device = "tiff", path = dir_path,
scale = 1, width = 10, height = 12, units = "cm",compression = "lzw",
dpi = 300, limitsize = TRUE)