rm(list = ls())
for (i in 1:1) { # single-pass loop: wraps the whole workflow so it runs as one block when sourced
###############################input data_1
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\2021\\2021_CYP3A\\1.1_validate_fda\\inhibi\\"
dir_path_name <- dir(dir_path,pattern = ".*.",full.names = T) # pattern ".*." matches every file; the two inputs are picked out by grep() below
dir_path_name
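# Optional early check (a sketch, not in the original workflow): grep() below returns
# character(0) when a file is missing and read.csv() then fails with an opaque error,
# so verifying that both expected inputs are present up front is easier to debug.
stopifnot(any(grepl("_inhibi\\.csv$", dir_path_name)),
          any(grepl("data_for_model\\.csv$", dir_path_name)))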
###############################merge data
data_train <- read.csv(grep("2019-11-15-CYP3A4.outcome-0.05_inhibi.csv",dir_path_name,value = T),header = T,stringsAsFactors = F)
data_test <- read.csv(grep("2021-01-07-data_for_model.csv",dir_path_name,value = T),header = T,stringsAsFactors = F)
dim(data_train) #[1] 3913 416
dim(data_test) #[1] 87 1026
unique(data_test$Col2)
unique(data_test$Col1)
data_test <- data_test[data_test$Col1 == "CYP3A Inhibitors", ]
dim(data_test) #[1] 46 1026
data_test$Col1 <- NULL
#View(head(data_test))
colnames(data_test)[1025]
data_test <- data_test[,c(colnames(data_train)[-ncol(data_train)],"Col2")]
setdiff(colnames(data_test),colnames(data_train)) #[1] "Mapping.ID"
setdiff(colnames(data_train),colnames(data_test)) #[1] "X2020.02.24.ecfp4.txt"
dim(data_test) #[1] 46 416
dim(data_train) #[1] 3913 416
colnames(data_train)
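# Consistency check (sketch): after the reordering above, the descriptor columns of
# data_test should line up exactly with those of data_train; only the last (label)
# column differs between the two frames.
stopifnot(identical(colnames(data_test)[-ncol(data_test)],
                    colnames(data_train)[-ncol(data_train)]))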
###################################model
library(randomForest)
library(ROCR)
library(pROC)
library(ROSE)
library(DMwR)
colnames(data_train)[ncol(data_train)] #"X2020.02.24.ecfp4.txt"
colnames(data_train)[ncol(data_train)] <- "num"
data_train <- data.frame(sapply(data_train, as.numeric))
data_train$num <- as.factor(data_train$num)
str(data_train$num)
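# Coercion check (sketch): sapply(..., as.numeric) silently converts anything
# non-numeric to NA, and randomForest() stops on missing values, so it is worth
# confirming that nothing was lost in the conversion above.
sum(is.na(data_train)) # expected to be 0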
############################################################################internal validation: single 70/30 hold-out split (not k-fold cross-validation)
dim(data_train) #[1] 4981 143
#View(head(data_train))
colnames(data_train)
table(data_train$num) #4515 466
table(data_train[ncol(data_train)]) ##4515 466
testdata_ori <- data_train[sample(nrow(data_train),size = floor(nrow(data_train)*0.3),replace = F),]
traindata_ori <- data_train[-match(row.names(testdata_ori),row.names(data_train)),]
intersect(row.names(testdata_ori),row.names(traindata_ori)) #character(0)
table(testdata_ori$num) #1348 146
table(traindata_ori$num) #3167 320
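# Balance check (sketch): sample() gives a plain random 70/30 split, not a stratified
# one, so the minority-class share can drift between the subsets; comparing the class
# proportions is a quick sanity check. The split and the forest are both stochastic,
# so a set.seed() call before this point (not in the original script) would make the
# reported numbers reproducible.
prop.table(table(traindata_ori$num))
prop.table(table(testdata_ori$num))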
fit_rf_ori <- randomForest(num ~ .,data = traindata_ori)
#### auc_rf
n_col_newdata <- ncol(data_train) # column index of the class label "num"
pre_auc_rf_ori <- predict(fit_rf_ori, testdata_ori[,-n_col_newdata], type = "prob")
pred_auc_rf_ori <- prediction(pre_auc_rf_ori[,2],testdata_ori[,n_col_newdata])
auc_rf_ori <- performance(pred_auc_rf_ori,"auc")@y.values[[1]]
auc_rf_ori #[1] 0.7592692
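# Cross-check (sketch): pROC on the same scores should reproduce the ROCR value above;
# levels and direction are given explicitly so pROC does not have to auto-detect them.
pROC::auc(pROC::roc(testdata_ori[, n_col_newdata], pre_auc_rf_ori[, 2],
                    levels = c("0", "1"), direction = "<"))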
pre_probability_ori <- data.frame(pre_auc_rf_ori)
pre_classification_ori <- predict(fit_rf_ori, testdata_ori[,-n_col_newdata], type="class") # drop the label column of the training frame (not ncol(data_test))
class(pre_classification_ori)
pre_probability_ori$num_1 <- as.character(pre_classification_ori)
pre_probability_ori$num <- testdata_ori$num
#head(pre_probability_ori)
table(pre_probability_ori$num,pre_probability_ori$num_1)
modelroc_rf_ori <- pROC::roc(testdata_ori[,n_col_newdata],as.numeric(pre_auc_rf_ori[,2]))
cutoff_rf_ori <- coords(modelroc_rf_ori, "best", ret = "threshold") #[1] 0.133; a one-column data.frame since pROC 1.16 (transpose = FALSE)
predict.results_rf_ori <- ifelse(pre_auc_rf_ori[,2] > as.numeric(cutoff_rf_ori[1]),"1","0") # classify at the Youden-optimal threshold
freq_default_rf_ori <- table(predict.results_rf_ori,testdata_ori[,n_col_newdata]) # rows = predicted, columns = observed
Sensitivity_rf_ori <- freq_default_rf_ori[1,1]/sum(freq_default_rf_ori[,1]) # recall of class "0" (the first level, caret's default positive class)
Specificity_rf_ori <- freq_default_rf_ori[2,2]/sum(freq_default_rf_ori[,2]) # recall of class "1"
accuracy_rf_ori <- mean(testdata_ori[,n_col_newdata] == predict.results_rf_ori)
Balanced_accuracy_rf_ori <- (Sensitivity_rf_ori+Specificity_rf_ori)/2
Balanced_accuracy_rf_ori #[1] 0.7139034
library(caret)
confusionMatrix(factor(predict.results_rf_ori),factor(testdata_ori[,n_col_newdata]))
confusionMatrix(factor(pre_probability_ori$num_1),factor(testdata_ori[,n_col_newdata]))
#####0 1
#0 1350 15
#1 116 13
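# Sketch: the same summary via caret; with "0" as the positive class (caret's default,
# the first factor level), the "Balanced Accuracy" entry of byClass should match the
# manual Balanced_accuracy_rf_ori above. cm_cutoff_rf_ori is a new helper object
# introduced only for this check.
cm_cutoff_rf_ori <- confusionMatrix(factor(predict.results_rf_ori, levels = levels(testdata_ori$num)),
                                    testdata_ori[,n_col_newdata])
cm_cutoff_rf_ori$byClass["Balanced Accuracy"]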
#################################################################################################
##final model on the full training set (svm and ROSE over-sampling experiments kept commented out below; randomForest is used)
#data_train <- ovun.sample(num ~ ., data = data_train, method = "over")$data
#################################################probability
#fit_probability <- svm(num ~ .,data = data_train, scale = F, probability=TRUE )
fit_probability <- randomForest(num ~ .,data = data_train)
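# Sketch: quick diagnostics on the final model before scoring the external compounds;
# print() reports the out-of-bag error and confusion matrix, and the default
# MeanDecreaseGini importance flags the most informative fingerprint bits.
# imp_final is a new helper object introduced only for this check.
print(fit_probability)
imp_final <- importance(fit_probability)
head(imp_final[order(imp_final[, 1], decreasing = TRUE), , drop = FALSE])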
#### AUC
names(data_test)[ncol(data_test)]
pre_auc_probability <- predict(fit_probability, data_test[,-ncol(data_test)], type="prob")
pre_probability <- data.frame(pre_auc_probability)
#pre_classification <- predict(fit_probability, data_test[,-ncol(data_test)], type="class")
#str(pre_classification)
#pre_probability$num <- as.character(pre_classification)
colnames(pre_probability) <- c("probability_0","probability_1")
dim(pre_probability) #[1] 46 2
colnames(data_test)
pre_probability$name <- data_test$Col2
head(pre_probability)
str(pre_probability)
print(head(pre_probability))
class(pre_probability)
print(table(cut(pre_probability$probability_1,breaks = c(-0.1,.5,1))))
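# Sketch (an assumption, not part of the original output): the 0.5 split above is
# arbitrary; the Youden-optimal cutoff estimated on the internal hold-out set
# (cutoff_rf_ori) could be reused to label the external compounds instead.
print(table(ifelse(pre_probability$probability_1 > as.numeric(cutoff_rf_ori[1]),
                   "predicted inhibitor", "predicted non-inhibitor")))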
write.csv(pre_probability,paste0(dir_path,Sys.Date(),"-3a4_inhibitor.csv"),row.names = F)
}
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Loaded ROSE 0.0-3
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Warning in coords.roc(modelroc_rf_ori, "best", ret = "threshold"): The
## 'transpose' argument to FALSE by default since pROC 1.16. Set transpose = TRUE
## explicitly to revert to the previous behavior, or transpose = TRUE to silence
## this warning. Type help(coords_transpose) for additional information.
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## 'data.frame': 46 obs. of 3 variables:
## $ probability_0: num 0.046 0.164 0.066 0.03 0.078 0.154 0.204 0.376 0.102 0.382 ...
## $ probability_1: num 0.954 0.836 0.934 0.97 0.922 0.846 0.796 0.624 0.898 0.618 ...
## $ name : chr "itraconazole" "erythromycin" "fluconazole" "verapamil" ...
## probability_0 probability_1 name
## 42 0.046 0.954 itraconazole
## 43 0.164 0.836 erythromycin
## 44 0.066 0.934 fluconazole
## 45 0.030 0.970 verapamil
## 46 0.078 0.922 Ketoconazole
## 47 0.154 0.846 Clarithromycin
##
## (-0.1,0.5] (0.5,1]
## 4 42