rm(list = ls())
#install.packages(c("e1071","randomForest","ROCR","pROC"))
#install.packages("mltools")
############################################library packages
library(e1071) #naivebayes & SVM
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(nnet)
library(neuralnet)
library(hydroGOF)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##########################################prepare data path
dir_path <- "/Users/xut2/Desktop/untitled folder"
dir_path_target <- list.dirs(dir_path,recursive = F)
dir_path_target
## [1] "/Users/xut2/Desktop/untitled folder/data"
k <- 3 # 5 fold cross validation #check point
loop <- 1
for (nn in 1:length(dir_path_target)) {
tryCatch({
#nn= 1
###creat folders for save results
unlink(paste0(dir_path_target[nn],"/","ann"), recursive=TRUE) #delete file and all involved
unlink(paste0(dir_path_target[nn],"/","glm"), recursive=TRUE) #delete file and all involved
unlink(paste0(dir_path_target[nn],"/","svm"), recursive=TRUE) #delete file and all involved
unlink(paste0(dir_path_target[nn],"/","rf"), recursive=TRUE) #delete file and all involved
unlink(paste0(dir_path_target[nn],"/","nnet"), recursive=TRUE) #delete file and all involved
unlink(paste0(dir_path_target[nn],"/","lm"), recursive=TRUE) #delete file and all involved
dir_path_name <- dir(dir_path_target[nn],pattern = "*.csv") ##input data
dir.create(paste0(dir_path_target[nn],"/","lm"))
dir.create(paste0(dir_path_target[nn],"/","glm"))
dir.create(paste0(dir_path_target[nn],"/","svm"))
dir.create(paste0(dir_path_target[nn],"/","rf"))
dir.create(paste0(dir_path_target[nn],"/","nnet"))
dir.create(paste0(dir_path_target[nn],"/","ann"))
##########################################bath input data
#aa=1
object_file_list <- list()
for (aa in 1:length(dir_path_name)) {
dir_path_file <- paste0(dir_path_target[nn],"/",dir_path_name[aa])
object_file_list[[aa]] <- read.csv(dir_path_file,header = T,stringsAsFactors = F)
object_file_list[[aa]] <- object_file_list[[aa]][, colSums(is.na(object_file_list[[aa]])) ==0]
object_file_list[[aa]] <- na.omit(object_file_list[[aa]])
names(object_file_list)[aa] <- dir_path_name[aa]
}
names(object_file_list)
colnames(object_file_list[[1]])
#############################################################model
for (mm in 1:length(object_file_list)){
#mm=1
#################################################data for model
data <- object_file_list[[mm]] #check point- 1
#colnames(data)
colnames(data) <- c(colnames(data[1:(ncol(data)-1)]),"num")
###############################################parameter setting
n_col_newdata <- ncol(data)
#####################svm_parameter
#newdata_tune_svm <- tune(svm, num ~ .,data = data,tunecontrol = tune.control(cross = k),
# ranges = list(epsilon = seq(0,1,0.1), cost = 10^(-3:3),
# gamma = c(0.01,0.05,0.1,0.5,1)))
#newdata_tune_svm$best.parameters$gamma <- 0.005
#newdata_tune_svm$best.parameters$cost <- 10
#newdata_tune_svm$best.parameters$epsilon
#####################rf_parameter
#newdata_tune_rf <- tune.randomForest(num ~ .,data = data,
# ntree = c(20,50,80,100,150,200,300,500),
# mtry = c(1:10),tunecontrol = tune.control(cross = k))
#################net_parameter
#newdata_tune_rf$best.parameters$ntree <- 80
#newdata_tune_rf$best.parameters$mtry <- 10
#####################nnet_parameter
#newdata_tune_nnet <- tune.nnet(num ~ .,data = data, size = 1:3,
## decay = c(0.0001,0.001,0.01,0.1),
# maxit = c(10,100,200,500,800,1000),
# tunecontrol = tune.control(nrepeat = k))
#################parameters
#newdata_tune_nnet$best.parameters$size <- 80
#newdata_tune_nnet$best.parameters$decay <- 10
#####lm
mae_lm_mean <- numeric(loop)
rmse_lm_mean <- numeric(loop)
mse_lm_mean <- numeric(loop)
r2_lm_mean <- numeric(loop)
######glm
mae_glm_mean <- numeric(loop)
rmse_glm_mean <- numeric(loop)
mse_glm_mean <- numeric(loop)
r2_glm_mean <- numeric(loop)
#####svm
mae_svm_mean <- numeric(loop)
rmse_svm_mean <- numeric(loop)
mse_svm_mean <- numeric(loop)
r2_svm_mean <- numeric(loop)
####rf
mae_rf_mean <- numeric(loop)
rmse_rf_mean <- numeric(loop)
mse_rf_mean <- numeric(loop)
r2_rf_mean <- numeric(loop)
####nnet
mae_nnet_mean <- numeric(loop)
rmse_nnet_mean <- numeric(loop)
mse_nnet_mean <- numeric(loop)
r2_nnet_mean <- numeric(loop)
###ann
mae_ann_mean <- numeric(loop)
rmse_ann_mean <- numeric(loop)
mse_ann_mean <- numeric(loop)
r2_ann_mean <- numeric(loop)
###
###############################################divide data
#loop = 2
for (j in 1:loop) {
#j = 1
#set.seed(j*2018)
data_random <- data[sample(nrow(data)),] # Randomly shuffle the data
index = cut(1:nrow(data_random),breaks = k,labels = FALSE) # Create n equally sized folds
#################################################
###lm
rmse_lm <- numeric(k)
mae_lm <- numeric(k)
mse_lm <- numeric(k)
r2_lm <- numeric(k)
###lm
rmse_glm <- numeric(k)
mae_glm <- numeric(k)
mse_glm <- numeric(k)
r2_glm <- numeric(k)
###svm
rmse_svm <- numeric(k)
mae_svm <- numeric(k)
mse_svm <- numeric(k)
r2_svm <- numeric(k)
###rf
rmse_rf <- numeric(k)
mae_rf <- numeric(k)
mse_rf <- numeric(k)
r2_rf <- numeric(k)
###nnet
rmse_nnet <- numeric(k)
mae_nnet <- numeric(k)
mse_nnet <- numeric(k)
r2_nnet <- numeric(k)
###ann
rmse_ann <- numeric(k)
mae_ann <- numeric(k)
mse_ann <- numeric(k)
r2_ann <- numeric(k)
#####################################
for(i in 1:k){
#i = 1
tryCatch({
#Segement your data by fold using the which() function
test_index <- which(index==i,arr.ind=TRUE) #在index里查找值为i,返回坐标
testdata <- data_random[test_index, ]
traindata <- data_random[-test_index, ]
#dim(testdata); dim(traindata)
#############################################lm_1
fit_lm <- lm(num ~ .,data = traindata)
#### lm_metrics
pre_rmse_lm <- predict(fit_lm, testdata[,-n_col_newdata])
rmse_lm[i] <- hydroGOF::rmse(pre_rmse_lm,testdata[,n_col_newdata]) #[1] 0.1409023
mse_lm[i] <- mean((testdata[,n_col_newdata] - pre_rmse_lm)^2) #[1] 0.01985346
mae_lm[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_lm)) #[1] 0.1134223
r2_lm[i] <- cor(testdata[,n_col_newdata], pre_rmse_lm)^2 #[1] 0.5098263
#################################################glm
fit_glm <- glm(num ~ .,data = traindata, family=binomial())
pre_rmse_glm<- predict(fit_glm, testdata[,-n_col_newdata])
rmse_glm[i] <- hydroGOF::rmse(pre_rmse_glm,testdata[,n_col_newdata]) #[1] 0.1409023
mse_glm[i] <- mean((testdata[,n_col_newdata] - pre_rmse_glm)^2) #[1] 0.01985346
mae_glm[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_glm)) #[1] 0.1134223
r2_glm[i] <- cor(testdata[,n_col_newdata], pre_rmse_lm)^2 #[1] 0.5098263
##############################################svm_2
fit_svm <- svm(num ~ .,data = traindata,
type = "eps-regression", kernel = "radial",
#gamma = newdata_tune_svm$best.parameters$gamma,
#cost = newdata_tune_svm$best.parameters$cost,
#epsilon = newdata_tune_svm$best.parameters$epsilon
)
#### svm_metrics
pre_rmse_svm <- predict(fit_svm, testdata[,-n_col_newdata])
rmse_svm[i] <- hydroGOF::rmse(pre_rmse_svm,testdata[,n_col_newdata]) #[1] 0.07090698
mse_svm[i] <- mean((testdata[,n_col_newdata] - pre_rmse_svm)^2) #[1] 0.0050278
mae_svm[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_svm)) #[1] 0.05490023
r2_svm[i] <- cor(testdata[,n_col_newdata], pre_rmse_svm)^2 #[1] 0.8769199
############################################## rf_3
fit_rf <- randomForest(num ~ .,data = traindata,type = "regression",
#ntree = newdata_tune_rf$best.parameters$ntree,
#mtry = newdata_tune_rf$best.parameters$mtry,
#proximity=TRUE, oob.prox=FALSE
)
####rf_metrics
pre_rmse_rf <- predict(fit_rf, testdata[,-n_col_newdata])
rmse_rf[i] <- hydroGOF::rmse(pre_rmse_rf,testdata[,n_col_newdata]) #[1] 0.06448672
mse_rf[i] <- mean((testdata[,n_col_newdata] - pre_rmse_rf)^2) #[1] 0.004158537
mae_rf[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_rf)) #[1] 0.04599916
r2_rf[i] <- cor(testdata[,n_col_newdata], pre_rmse_rf)^2 #[1] 0.9029919
#class(pre_rmse_rf)
##############################################nnet_4
set.seed(2019)
fit_nnet <- nnet(num ~ .,data = traindata,
size= 10,
#decay = newdata_tune_nnet$best.parameters$decay,
#linout=T,skip=T,
maxit=10000)
#class(pre_rmse_nnet)
#### nnet_metrics
pre_rmse_nnet <- predict(fit_nnet, testdata[,-n_col_newdata])
pre_rmse_nnet <- as.numeric(pre_rmse_nnet)
rmse_nnet[i] <- hydroGOF::rmse(pre_rmse_nnet,testdata[,n_col_newdata]) #[1] 0.09568838
mse_nnet[i] <- mean((testdata[,n_col_newdata] - pre_rmse_nnet)^2) #[1] 0.009156266
mae_nnet[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_nnet)) #[1] 0.07463675
r2_nnet[i] <- cor(testdata[,n_col_newdata], pre_rmse_nnet)^2 #[1] 0.7712949
##############################################ann_5
set.seed(2019)
fit_ann <- neuralnet(num ~ .,data = traindata,hidden = 5)
#class(pre_rmse_ann)
#### nnet_metrics
pre_rmse_ann <- predict(fit_ann, testdata[,-n_col_newdata])
pre_rmse_ann <- as.numeric(pre_rmse_ann)
rmse_ann[i] <- hydroGOF::rmse(pre_rmse_ann,testdata[,n_col_newdata]) #[1] 0.08046632
mse_ann[i] <- mean((testdata[,n_col_newdata] - pre_rmse_ann)^2) #[1] 0.006474829
mae_ann[i] <- mean(abs(testdata[,n_col_newdata] - pre_rmse_ann)) #[1] 0.06062628
r2_ann[i] <- cor(testdata[,n_col_newdata], pre_rmse_ann)^2 #[1] 0.8395542
},error=function(e){cat("ERROR :",conditionMessage(e),"\n")})
}
#####lm_1
rmse_lm_mean[j] <- mean(rmse_lm[rmse_lm != 0]) # mean of 10 fold cross validation
mae_lm_mean[j] <- mean(mae_lm[mae_lm != 0]) # mean of 10 fold cross validation
mse_lm_mean[j] <- mean(mse_lm[mse_lm != 0])
r2_lm_mean[j] <- mean(r2_lm[r2_lm != 0])
#####lm_1
rmse_glm_mean[j] <- mean(rmse_glm[rmse_glm != 0]) # mean of 10 fold cross validation
mae_glm_mean[j] <- mean(mae_glm[mae_glm != 0]) # mean of 10 fold cross validation
mse_glm_mean[j] <- mean(mse_glm[mse_glm != 0])
r2_glm_mean[j] <- mean(r2_glm[r2_glm != 0])
#####svm_2
#####svm_2
rmse_svm_mean[j] <- mean(rmse_svm[rmse_svm != 0]) # mean of 10 fold cross validation
mae_svm_mean[j] <- mean(mae_svm[mae_svm != 0]) # mean of 10 fold cross validation
mse_svm_mean[j] <- mean(mse_svm[mse_svm != 0])
r2_svm_mean[j] <- mean(r2_svm[r2_svm != 0])
#####rf_3
rmse_rf_mean[j] <- mean(rmse_rf[rmse_rf != 0]) # mean of 10 fold cross validation
mae_rf_mean[j] <- mean(mae_rf[mae_rf != 0]) # mean of 10 fold cross validation
mse_rf_mean[j] <- mean(mse_rf[mse_rf != 0])
r2_rf_mean[j] <- mean(r2_rf[r2_rf != 0])
#####nnet_4
rmse_nnet_mean[j] <- mean(rmse_nnet[rmse_nnet != 0]) # mean of 10 fold cross validation
mae_nnet_mean[j] <- mean(mae_nnet[mae_nnet != 0]) # mean of 10 fold cross validation
mse_nnet_mean[j] <- mean(mse_nnet[mse_nnet != 0])
r2_nnet_mean[j] <- mean(r2_nnet[r2_nnet != 0])
#####ann_5
rmse_ann_mean[j] <- mean(rmse_ann[rmse_ann != 0]) # mean of 10 fold cross validation
mae_ann_mean[j] <- mean(mae_ann[mae_ann != 0]) # mean of 10 fold cross validation
mse_ann_mean[j] <- mean(mse_ann[mse_ann != 0])
r2_ann_mean[j] <- mean(r2_ann[r2_ann != 0])
}
####################################output data
#####lm_1
output_lm <- data.frame(mean(rmse_lm_mean[rmse_lm_mean != 0]),sd(rmse_lm_mean[rmse_lm_mean != 0]),
mean(mae_lm_mean[mae_lm_mean != 0]),sd(mae_lm_mean[mae_lm_mean != 0]),
mean(mse_lm_mean[mse_lm_mean != 0]),sd(mse_lm_mean[mse_lm_mean != 0]),
mean(r2_lm_mean[r2_lm_mean != 0]),sd(r2_lm_mean[r2_lm_mean != 0]))
file_name_data_lm <- paste0(dir_path_target[nn],"/","lm","/","lm_",names(object_file_list)[[mm]])
write.csv(output_lm,file_name_data_lm)
#####glm_1
output_glm <- data.frame(mean(rmse_glm_mean[rmse_glm_mean != 0]),sd(rmse_glm_mean[rmse_glm_mean != 0]),
mean(mae_glm_mean[mae_glm_mean != 0]),sd(mae_glm_mean[mae_glm_mean != 0]),
mean(mse_glm_mean[mse_glm_mean != 0]),sd(mse_glm_mean[mse_glm_mean != 0]),
mean(r2_glm_mean[r2_glm_mean != 0]),sd(r2_glm_mean[r2_glm_mean != 0]))
file_name_data_glm <- paste0(dir_path_target[nn],"/","glm","/","glm_",names(object_file_list)[[mm]])
write.csv(output_glm,file_name_data_glm)
#####svm_2
output_svm <- data.frame(mean(rmse_svm_mean[rmse_svm_mean != 0]),sd(rmse_svm_mean[rmse_svm_mean != 0]),
mean(mae_svm_mean[mae_svm_mean != 0]),sd(mae_svm_mean[mae_svm_mean != 0]),
mean(mse_svm_mean[mse_svm_mean != 0]),sd(mse_svm_mean[mse_svm_mean != 0]),
mean(r2_svm_mean[r2_svm_mean != 0]),sd(r2_svm_mean[r2_svm_mean != 0]))
file_name_data_svm <- paste0(dir_path_target[nn],"/","svm","/","svm_",names(object_file_list)[[mm]])
write.csv(output_svm,file_name_data_svm)
#####rf_3
output_rf <- data.frame(mean(rmse_rf_mean[rmse_rf_mean != 0]),sd(rmse_rf_mean[rmse_rf_mean != 0]),
mean(mae_rf_mean[mae_rf_mean != 0]),sd(mae_rf_mean[mae_rf_mean != 0]),
mean(mse_rf_mean[mse_rf_mean != 0]),sd(mse_rf_mean[mse_rf_mean != 0]),
mean(r2_rf_mean[r2_rf_mean != 0]),sd(r2_rf_mean[r2_rf_mean != 0]))
file_name_data_rf <- paste0(dir_path_target[nn],"/","rf","/","rf_",names(object_file_list)[[mm]])
write.csv(output_rf,file_name_data_rf)
#####nnet_4
output_nnet <- data.frame(mean(rmse_nnet_mean[rmse_nnet_mean != 0]),sd(rmse_nnet_mean[rmse_nnet_mean != 0]),
mean(mae_nnet_mean[mae_nnet_mean != 0]),sd(mae_nnet_mean[mae_nnet_mean != 0]),
mean(mse_nnet_mean[mse_nnet_mean != 0]),sd(mse_nnet_mean[mse_nnet_mean != 0]),
mean(r2_nnet_mean[r2_nnet_mean != 0]),sd(r2_nnet_mean[r2_nnet_mean != 0]))
file_name_data_nnet <- paste0(dir_path_target[nn],"/","nnet","/","nnet_",names(object_file_list)[[mm]])
write.csv(output_nnet,file_name_data_nnet)
#####nnet_5
output_ann <- data.frame(mean(rmse_ann_mean[rmse_ann_mean != 0]),sd(rmse_ann_mean[rmse_ann_mean != 0]),
mean(mae_ann_mean[mae_ann_mean != 0]),sd(mae_ann_mean[mae_ann_mean != 0]),
mean(mse_ann_mean[mse_ann_mean != 0]),sd(mse_ann_mean[mse_ann_mean != 0]),
mean(r2_ann_mean[r2_ann_mean != 0]),sd(r2_ann_mean[r2_ann_mean != 0]))
file_name_data_ann <- paste0(dir_path_target[nn],"/","ann","/","ann_",names(object_file_list)[[mm]])
write.csv(output_ann,file_name_data_ann)
}
},error=function(e){cat("ERROR :",conditionMessage(e),"\n")})
}
## Warning in predict.lm(fit_lm, testdata[, -n_col_newdata]): prediction from a
## rank-deficient fit may be misleading
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # weights: 301
## initial value 18.326955
## final value 14.480285
## converged
## Warning in cor(testdata[, n_col_newdata], pre_rmse_nnet): the standard deviation
## is zero
## Warning in predict.lm(fit_lm, testdata[, -n_col_newdata]): prediction from a
## rank-deficient fit may be misleading
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # weights: 301
## initial value 19.032200
## final value 14.124524
## converged
## Warning in cor(testdata[, n_col_newdata], pre_rmse_nnet): the standard deviation
## is zero
## Warning in predict.lm(fit_lm, testdata[, -n_col_newdata]): prediction from a
## rank-deficient fit may be misleading
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## # weights: 301
## initial value 18.359147
## final value 14.987082
## converged
## Warning in cor(testdata[, n_col_newdata], pre_rmse_nnet): the standard deviation
## is zero
#sink()