In the following block, required R libraries are checked for existance and then installed if it’s needed.
rm(list = ls()) # clears global environment (functions and variables)
graphics.off() # clears graphics
if(!("pacman" %in% rownames(installed.packages()))){install.packages("pacman")}# pacman would be installed if it's not existed
library(pacman)
p_load(caret, rio, dtwclust, graph, RBGL, arules, stringr, pROC, DT,papeR, pdc, mlr)
Object oriented programming is employed for this project. In this approach of programming, an empty object is created then in each step some attributes and data would be added to the object. Each step is basically a function that perform some processes over the object.Th following snippet is designed to define the functions.
defining functions and loading the required libraries
pred_three<-function(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds){
#next line is just for finding threshold of prediction
if(method_pred=="regular"){
data$TARGET01<-as.numeric(data$TARGET)
data$TARGET<-as.factor(data$TARGET)
levels(data$TARGET)[1] <- "One"
levels(data$TARGET)[2] <- "Two"
levels(data$TARGET)[3] <- "Three"
data$TARGET<-as.factor(data$TARGET)
n_rows<-nrow(data)
#____________________making the n folds_____________________________________
{
One_No<-length(which(data$TARGET=="One"))
Two_No<-length(which(data$TARGET=="Two"))
Three_No<-length(which(data$TARGET=="Three"))
# Create the training and testing data sets
kept_rows_One<-floor(One_No/n_folds)*n_folds
kept_rows_Two<-floor(Two_No/n_folds)*n_folds
kept_rows_Three<-floor(Three_No/n_folds)*n_folds
data_One <- data[ which(data$TARGET=="One"), ]
data_Two <- data[ which(data$TARGET=="Two"), ]
data_Three <- data[ which(data$TARGET=="Three"), ]
#shuffling
data_One<-data_One[sample(nrow(data_One),kept_rows_One),]
data_Two<-data_Two[sample(nrow(data_Two),kept_rows_Two),]
data_Three<-data_Three[sample(nrow(data_Three),kept_rows_Three),]
#naming the folds ID
folds_One <- cut(seq(1,nrow(data_One)),breaks=n_folds,labels=FALSE)
folds_Two <- cut(seq(1,nrow(data_Two)),breaks=n_folds,labels=FALSE)
folds_Three <- cut(seq(1,nrow(data_Three)),breaks=n_folds,labels=FALSE)
#
data_One$folds<-folds_One
data_Two$folds<-folds_Two
data_Three$folds<-folds_Three
#
data_pred<-rbind(data_One,data_Two,data_Three)
}
}
#next line is just for finding threshold of prediction
if(method_pred=="ensemble"){
x <- vector(mode="character", length=0)
for(i in 1:length(vars)){
x<-c(x,paste(vars[i],c("_1","_2"),sep=""))
}
vars<-x
data_pred<-data}
####
if(cntr_type=="simple"){
control_<- trainControl(method = "none", savePredictions = TRUE,
verboseIter = FALSE,returnResamp = "all",classProbs = TRUE)}
if(cntr_type=="simplegbm"){
control_<- trainControl(method = "cv", number = 1, savePredictions = TRUE,
verboseIter = FALSE,classProbs = TRUE, summaryFunction = twoClassSummary)
}
if(cntr_type=="simple_roc"){
control_<- trainControl(method = "none", savePredictions = TRUE,
verboseIter = FALSE,returnResamp = "all",classProbs = TRUE, summaryFunction = twoClassSummary)}
if(cntr_type=="repeatedcv_roc"){
control_<- trainControl(method="repeatedcv", number=2, repeats=5, savePredictions = TRUE,
verboseIter = FALSE,returnResamp = "all",classProbs = TRUE, summaryFunction = twoClassSummary)}
if(cntr_type=="cv_roc"){
control_<- trainControl(method = "cv", number = 2, savePredictions = TRUE,
verboseIter = FALSE,classProbs = TRUE, summaryFunction = twoClassSummary)}
rows_no<-floor(nrow(data_pred)/n_folds)
resul_svm_raw<-matrix(0, ncol = n_folds, nrow = rows_no)
resul_svm_raw<-as.data.frame(resul_svm_raw)
names(resul_svm_raw) <- paste("fold_",1:n_folds, sep = "")
resul_svm_prob1<-resul_svm_raw
resul_svm_prob2<-resul_svm_raw
resul_svm_prob3<-resul_svm_raw
resul_crt_raw<-resul_svm_raw
resul_crt_prob1<-resul_svm_raw
resul_crt_prob2<-resul_svm_raw
resul_crt_prob3<-resul_svm_raw
resul_nnet_raw<-resul_svm_raw
resul_nnet_prob1<-resul_svm_raw
resul_nnet_prob2<-resul_svm_raw
resul_nnet_prob3<-resul_svm_raw
resul_tan_raw<-resul_svm_raw
resul_tan_prob1<-resul_svm_raw
resul_tan_prob2<-resul_svm_raw
resul_tan_prob3<-resul_svm_raw
resul_rf_raw<-resul_svm_raw
resul_rf_prob1<-resul_svm_raw
resul_rf_prob2<-resul_svm_raw
resul_rf_prob3<-resul_svm_raw
resul_cart_raw<-resul_svm_raw
resul_cart_prob1<-resul_svm_raw
resul_cart_prob2<-resul_svm_raw
resul_cart_prob3<-resul_svm_raw
resul_svm_imp<-matrix(0, ncol = n_folds, nrow = length(vars))
resul_svm_imp<-as.data.frame(resul_svm_imp)
names(resul_svm_imp) <- paste("fold_",1:n_folds, sep = "")
resul_svm_imp$variables<-vars
resul_rf_imp<-resul_svm_imp
resul_nnet_imp<-resul_svm_imp
resul_tan_imp<-resul_svm_imp
resul_cart_imp<-resul_svm_imp
tan_vars<-vars
formula_format<-paste("TARGET ~ ",paste(vars, collapse="+"),sep = "")
formula<-as.formula(formula_format)
VarsData_Phase<-as.data.frame(vars)
names(VarsData_Phase)<-"variables"
tracker<-list()
prob_results<-list()
#i<-1
all_vars<-c(vars,"TARGET")
for ( i in 1: n_folds){
test_data<-data_pred[data_pred$folds == i,]
train_data<-data_pred[data_pred$folds != i,]
tracker[[paste("fold_",i,sep="")]]<-test_data$id
train_data<-train_data[,all_vars]
test_data_pri<-test_data
test_data<-test_data[,all_vars]
tt<-as.data.frame(table(train_data$TARGET))
#table(train_data$TARGET)
common<-tt$Freq[1]
rare<-tt$Freq[2]
if(B_alg=="RUS"){train_data<-RUS_func(train_data,"TARGET")}
if(B_alg=="SMOTE"){train_data<-SMOTE_func(train_data,formula,"TARGET")}
if(ensembler=="rf" | method_pred=="regular"){
mod_rf<-caret::train(formula, data=train_data, method="rf",
trControl = control_, tuneGrid=expand.grid(mtry = length(vars)),
tuneLength = 5)
{
if(class(try(varImp(mod_rf),silent = TRUE))!="try-error"){
resul_rf_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_rf_imp,model_imp_exc(mod_rf,VarsData_Phase),"variables")}
resul_rf_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_rf, newdata=test_data, type="raw"))
resul_rf_prob1[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[1]
resul_rf_prob2[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[2]
resul_rf_prob3[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[3]
}
}
if(ensembler=="cart" | method_pred=="regular"){
mod_cart<-caret::train(formula, data=train_data, method="rpart1SE",
trControl = control_, tuneLength = 5)
{
if(class(try(varImp(mod_cart),silent = TRUE))!="try-error"){
resul_cart_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_cart_imp,model_imp_exc(mod_cart,VarsData_Phase),"variables")}
resul_cart_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_cart, newdata=test_data, type="raw"))
resul_cart_prob1[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[1]
resul_cart_prob2[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[2]
resul_cart_prob3[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[3]
}
}
if(ensembler=="svm" | method_pred=="regular"){
svm_cost<-.5
svm_sigma<-.0001
# if(class(try(
#
# mod_svm<-caret::train(formula, data=train_data, method="svmRadial", family="binomial",
# trControl = control_, tuneGrid=expand.grid(C=svm_cost, sigma=svm_sigma),
# tuneLength = 5)
# ))=="try-error"){}
mod_svm<-caret::train(formula, data=train_data, method="svmLinear2", family="binomial",
trControl = control_, tuneGrid=expand.grid(cost=svm_cost),
tuneLength = 5)
{
if(class(try(varImp(mod_svm),silent = TRUE))!="try-error"){
resul_svm_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_svm_imp,model_imp_exc(mod_svm,VarsData_Phase),"variables")}
resul_svm_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_svm, newdata=test_data, type="raw"))
resul_svm_prob1[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[1]
resul_svm_prob2[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[2]
resul_svm_prob3[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[3]
}
}
if(ensembler=="nnet" | method_pred=="regular"){
mod_nnet<-caret::train(formula, data=train_data, method="nnet", family="binomial",
trControl = control_, tuneGrid=expand.grid(size=5, decay=0.1), MaxNWts=20000,
tuneLength = 5)
{
if(class(try(varImp(mod_nnet),silent = TRUE))!="try-error"){
resul_nnet_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_nnet_imp,model_imp_exc(mod_nnet,VarsData_Phase),"variables")}
resul_nnet_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_nnet, newdata=test_data, type="raw"))
resul_nnet_prob1[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[1]
resul_nnet_prob2[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[2]
resul_nnet_prob3[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[3]
}
}
##===making data ready for TAN
# disceretizing train and test
if(ensembler=="tan" | method_pred=="regular"){
data_tan_train<-train_data
y_tr<-data_tan_train$TARGET
data_tan_train$TARGET<-NULL
data_tan_train$folds<-NULL
data_tan_train$type<-"tr"
data_tan_test<-test_data
y_ts<-data_tan_test$TARGET
data_tan_test$TARGET<-NULL
data_tan_test$folds<-NULL
data_tan_test$type<-"ts"
data_tan<-rbind(data_tan_train,data_tan_test)
for(j in 1:(ncol(data_tan)-1)){
if(is.numeric(data_tan_train[,j])){if(is.numeric(data_tan[,j])){
tester<-as.data.frame(data_tan[,j])
if(tester[1,1]!=tester[nrow(data_tan),1]){
data_tan[,j]<-discretize(data_tan[,j],method="interval",categories = floor(log(nrow(data_tan))))}else{
data_tan[,j]<-discretize(data_tan[,j],method="interval",categories = 1)
}
}
}
}
data_tan_train<-data_tan[which(data_tan$type=="tr"),]
data_tan_test<-data_tan[which(data_tan$type=="ts"),]
data_tan_train$type<-NULL
data_tan_test$type<-NULL
data_tan_train<-data_tan_train[, which(names(data_tan_train) %in% tan_vars)]
mod_tan <- caret::train(data_tan_train, y_tr, method="tan",
trControl = control_, tuneGrid = expand.grid(score= "aic", smooth=0.5),
tuneLength = 5)
{
if(class(try(varImp(mod_tan),silent = TRUE))!="try-error"){
resul_tan_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_tan_imp,model_imp_exc(mod_tan,VarsData_Phase),"variables")}
resul_tan_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_tan, newdata=data_tan_test, type="raw"))
resul_tan_prob1[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[1]
resul_tan_prob2[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[2]
resul_tan_prob3[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[3]
}
}
probs_1<-as.data.frame(cbind(resul_svm_prob1[,paste("fold_",i, sep = "")],
resul_nnet_prob1[,paste("fold_",i, sep = "")],
resul_tan_prob1[,paste("fold_",i, sep = "")],
resul_rf_prob1[,paste("fold_",i, sep = "")],
resul_cart_prob1[,paste("fold_",i, sep = "")]))
names(probs_1)<-c("svm_1","nnet_1","tan_1","rf_1","cart_1")
probs_2<-as.data.frame(cbind(resul_svm_prob2[,paste("fold_",i, sep = "")],
resul_nnet_prob2[,paste("fold_",i, sep = "")],
resul_tan_prob2[,paste("fold_",i, sep = "")],
resul_rf_prob2[,paste("fold_",i, sep = "")],
resul_cart_prob2[,paste("fold_",i, sep = "")]))
names(probs_2)<-c("svm_2","nnet_2","tan_2","rf_2","cart_2")
probs_3<-as.data.frame(cbind(resul_svm_prob3[,paste("fold_",i, sep = "")],
resul_nnet_prob3[,paste("fold_",i, sep = "")],
resul_tan_prob3[,paste("fold_",i, sep = "")],
resul_rf_prob3[,paste("fold_",i, sep = "")],
resul_cart_prob3[,paste("fold_",i, sep = "")]))
names(probs_3)<-c("svm_3","nnet_3","tan_3","rf_3","cart_3")
probs<-cbind(probs_1,probs_2,test_data_pri[c("TARGET","id","folds")])
prob_results[[paste("fold_",i,sep="")]]<-probs
probs_all<-prob_results[["fold_1"]]
for(i in 2:n_folds){
probs_all<-rbind(probs_all,prob_results[[paste("fold_",i,sep="")]])
}
}
#end of predicting part
performance_svm<-matrix(0, ncol = n_folds, nrow = 4)
performance_svm<-as.data.frame(performance_svm)
row.names(performance_svm) <- c("level-1","level-2","level-3","Overall")
names(performance_svm)[1:n_folds] <- paste("Fold", 1:n_folds, sep="")
performance_cart<-performance_svm
performance_nnet<-performance_svm
performance_tan<-performance_svm
performance_rf<-performance_svm
for(i in 1:n_folds){
data_One<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="One"),]
data_Two<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="Two"),]
data_Three<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="Three"),]
if(ensembler=="rf" | method_pred=="regular"){
performance_rf[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_rf_raw[,i])
performance_rf[1,i]<-accuracy_cal(data_One$TARGET,resul_rf_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_rf[2,i]<-accuracy_cal(data_Two$TARGET,resul_rf_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_rf[3,i]<-accuracy_cal(data_Three$TARGET,resul_rf_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
}
if(ensembler=="cart" | method_pred=="regular"){
performance_cart[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_cart_raw[,i])
performance_cart[1,i]<-accuracy_cal(data_One$TARGET,resul_cart_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_cart[2,i]<-accuracy_cal(data_Two$TARGET,resul_cart_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_cart[3,i]<-accuracy_cal(data_Three$TARGET,resul_cart_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
}
if(ensembler=="svm" | method_pred=="regular"){
performance_svm[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_svm_raw[,i])
performance_svm[1,i]<-accuracy_cal(data_One$TARGET,resul_svm_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_svm[2,i]<-accuracy_cal(data_Two$TARGET,resul_svm_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_svm[3,i]<-accuracy_cal(data_Three$TARGET,resul_svm_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
}
if(ensembler=="nnet" | method_pred=="regular"){
performance_nnet[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_nnet_raw[,i])
performance_nnet[1,i]<-accuracy_cal(data_One$TARGET,resul_nnet_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_nnet[2,i]<-accuracy_cal(data_Two$TARGET,resul_nnet_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_nnet[3,i]<-accuracy_cal(data_Three$TARGET,resul_nnet_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
}
if(ensembler=="tan" | method_pred=="regular"){
performance_tan[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_tan_raw[,i])
performance_tan[1,i]<-accuracy_cal(data_One$TARGET,resul_tan_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_tan[2,i]<-accuracy_cal(data_Two$TARGET,resul_tan_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
performance_tan[3,i]<-accuracy_cal(data_Three$TARGET,resul_tan_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])}
}
results<-list()
performance<-list()
nnet<-list()
svm<-list()
tan<-list()
rf<-list()
cart<-list()
if(ensembler=="nnet" | method_pred=="regular"){
nnet$performance_nnet<-performance_nnet
nnet$resul_nnet_raw<-resul_nnet_raw
nnet$resul_nnet_prob1<-resul_nnet_prob1
nnet$resul_nnet_prob2<-resul_nnet_prob2
nnet$resul_nnet_prob3<-resul_nnet_prob3
nnet$resul_nnet_imp<-resul_nnet_imp
results$nnet<-nnet}
if(ensembler=="svm" | method_pred=="regular"){
svm$performance_svm<-performance_svm
svm$resul_svm_raw<-resul_svm_raw
svm$resul_svm_prob1<-resul_svm_prob1
svm$resul_svm_prob2<-resul_svm_prob2
svm$resul_svm_prob3<-resul_svm_prob3
svm$resul_svm_imp<-resul_svm_imp
results$svm<-svm}
if(ensembler=="tan" | method_pred=="regular"){
tan$performance_tan<-performance_tan
tan$resul_tan_raw<-resul_tan_raw
tan$resul_tan_prob1<-resul_tan_prob1
tan$resul_tan_prob2<-resul_tan_prob2
tan$resul_tan_prob3<-resul_tan_prob3
tan$resul_tan_imp<-resul_tan_imp
results$tan<-tan}
if(ensembler=="rf" | method_pred=="regular"){
rf$performance_rf<-performance_rf
rf$resul_rf_raw<-resul_rf_raw
rf$resul_rf_prob1<-resul_rf_prob1
rf$resul_rf_prob2<-resul_rf_prob2
rf$resul_rf_prob3<-resul_rf_prob3
rf$resul_rf_imp<-resul_rf_imp
results$rf<-rf}
if(ensembler=="cart" | method_pred=="regular"){
cart$performance_cart<-performance_cart
cart$resul_cart_raw<-resul_cart_raw
cart$resul_cart_prob1<-resul_cart_prob1
cart$resul_cart_prob2<-resul_cart_prob2
cart$resul_cart_prob3<-resul_cart_prob3
cart$resul_cart_imp<-resul_cart_imp
results$cart<-cart}
results$ensemble_data<-list()
results$ensemble_data$prob_results<-prob_results
results$ensemble_data$probs_all<-probs_all
return(results)
}
##################
model_imp_exc <- function(input_object,vars_org){
imp_model<-varImp(input_object)
imp_model<-as.data.frame(imp_model$importance)
imp_model$importance<-rowMeans(imp_model)
imp_model$importance<-round(imp_model$importance,2)
imp_model$variables<-rownames(imp_model)
model_imp<-matrix(0, ncol = 1, nrow = nrow(imp_model))
model_imp<-as.data.frame(model_imp)
model_imp$variables<-imp_model$variables
model_imp$importance<-imp_model$importance
model_imp$V1<-NULL
seeker<-model_imp
seeker$real_name<-NA
seeked<-as.data.frame(vars_org[,1])
names(seeked)<-"variables"
for (i in 1:nrow(seeker)){
for (j in 1:nrow(seeked)) {
if(grepl(seeked$variables[j],seeker$variables[i])){
if(str_count(seeker$variables[i],"_")==str_count(seeked$variables[j],"_")){
seeker$real_name[i]<-as.character(seeked$variables[j])}
}
}
}
# It's for the variables that created by us and are not in the original dataset
for (i in 1:nrow(seeker)){
if(is.na(seeker$real_name[i])){seeker$real_name[i]<-seeker$variables[i]}
}
str(seeker$importance)
res.by<- by(seeker$importance, seeker$real_name, mean)
res.by
seeker<-as.data.frame(as.matrix(res.by))
seeker$variables<-rownames(seeker)
seeker$importance<-seeker$V1
seeker$V1<-NULL
return(seeker)
}
###########
matrix_filler<-function(col_no,filling_matrix,filler_matrix,common_col){
common_col<-as.character(common_col)
filler_col<-match(common_col,names(filler_matrix))
filling_col<-match(common_col,names(filling_matrix))
if(filler_col-2==0){
values<-1
}else{values<-2}
for (k in 1:nrow(filling_matrix)){
for (j in 1:nrow(filler_matrix)) {
if(grepl(filler_matrix[j,filler_col],filling_matrix[k,filling_col])){
if(str_count(filling_matrix[k,filling_col],"_")==str_count(filler_matrix[j,filler_col],"_")){
filling_matrix[k,col_no]<-filler_matrix[j,values]}
}
}
}
return(filling_matrix)
}
###########
accuracy_cal<-function(actual,predicted){
counter<-0
for(i in 1:length(actual)){
if(actual[i]==predicted[i]){counter<-counter+1}
}
percent_accuracy<-round(100*(counter/length(actual)),2)
return(percent_accuracy)
}
###########
RUS_func<-function(train_data,TARGET){
leveler<-as.data.frame(table(train_data[TARGET]))
leveler<-leveler[order(leveler[,2]),]
rare<-min(leveler$Freq)
common<-max(leveler$Freq)
leveler<-as.data.frame(table(train_data[TARGET]))
leveler<-leveler[order(leveler[,2]),]
rare<-min(leveler$Freq)
common<-max(leveler$Freq)
temp<-train_data[which(train_data[,TARGET]==leveler$Var1[1]),]
for(j in 2:length(levels(train_data[,TARGET]))){
tag<-leveler$Var1[j]
kkk<-train_data[which(train_data[,TARGET]==tag),]
kkk<-kkk[sample(rare),]
temp<-rbind(temp,kkk)
}
train_data<-temp
return(train_data)
}
###################################
SMOTE_func<-function(train_data,formula,TARGET){
leveler<-as.data.frame(table(train_data[TARGET]))
leveler<-leveler[order(leveler[,2]),]
rare<-min(leveler$Freq)
common<-max(leveler$Freq)
for(j in 1:(length(levels(train_data[,TARGET]))-1)){
leveler2<-as.data.frame(table(train_data[TARGET]))
leveler2<-leveler2[order(leveler2[,2]),]
rare<-min(leveler2$Freq)
common<-max(leveler2$Freq)
tag<-leveler$Var1[j]
k<-1
H_por_low<-1
H_por_high<-1
H_por_low<-(1/H_por_low)
H_por_high<-H_por_high*100
perc_over<-100*(common-rare)/(rare*H_por_low)*k
perc_under<-100*(1/perc_over)*common
train_data2<-DMwR::SMOTE(formula, train_data, perc.over = perc_over,perc.under = (((perc_over/100)+1)/(perc_over/100))*H_por_high)
kkk<-train_data2[which(train_data2[,TARGET]==tag),]
ggg<-train_data[which(train_data[,TARGET]!=tag),]
train_data<-rbind(kkk,ggg)
}
return(train_data)
}
#===================
data_plot<-function(data,start_col,end_col,partition_col,remov_char,Y_label,X_label,coef_dis){
start<-which(colnames(data)== start_col)
end<-which(colnames(data)== end_col)
clusters_data_clus1<-as.data.frame(colMeans(data[which(data[[partition_col]]==1),start:end]))
clusters_data_clus2<-as.data.frame(colMeans(data[which(data[[partition_col]]==2),start:end]))
clusters_data<-cbind(clusters_data_clus1,clusters_data_clus2)
# if(toupper(flip_clusters)=="YES"){
# clusters_data[1]<-(1/clusters_data[,1])
# clusters_data[2]<-(1/clusters_data[,2])
# }
# if(mean(clusters_data[,1])>mean(clusters_data[,2])){
# clusters_data$mid<-clusters_data[,1]
# clusters_data[1]<-clusters_data[,2]
# clusters_data[2]<-clusters_data$mid
# clusters_data$mid<-NULL
# }
names(clusters_data)<-c("clus_1","clus_2")
quar1<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[2]
quar2<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[3]
quar3<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[4]
IQR<- quar3 - quar1
maxy<-quar3+coef_dis*IQR
miny<-quar1-coef_dis*IQR
x<-gsub(remov_char,"",rownames(clusters_data))
counter_years<-length(x)
start_date<-as.numeric(substr(gsub(remov_char,"",start_col),1,4))
end_date<-as.numeric(substr(gsub(remov_char,"",end_col),1,4))
X_label<-paste(X_label," (",start_date," - ",end_date,")",sep="")
par(mar = c(5,5,2,5),cex.lab=1.6)
plot(clusters_data$clus_1,xaxt="n", type="p", pch=0,col="red3",ylim=c(miny,maxy),
ylab= Y_label,ann = FALSE,
xlab=X_label,cex=1.2,cex.axis=1.1)
line_dist<-2
#next line is for tilted labels but for 2002-2016 I may deactive it
{
if((end_date-start_date)<5){
axis(1,at=1:counter_years,labels=x,cex.axis=.9,las=2)
line_dist<-2
}
}
mtext(side = 1, text = X_label, line = line_dist,cex=1.2)
mtext(side = 2, text = Y_label, line = 3,cex=1.2)
par(new = T)
plot(clusters_data$clus_2, type="p", pch=2,axes=F, xlab=NA, ylab=NA,ylim=c(miny,maxy),cex=1.2,lty=2)
#axis(side = 4)
#mtext(side = 4, line = 3, "Cluster 2")
legend("topleft",
legend=c("High Performance","Low Performance"),
lty=c(0,0), bty = "p",pch=c(2, 0), col=c("black","red3"),cex=1.2)
}
####################
flipper<-function(data_series,data_clust,cluss_one){
min_1<-0
min_2<-0
work_1<-as.numeric(sum(which(data_clust==1)))
work_2<-as.numeric(sum(which(data_clust==2)))
if(work_1*work_2>0){
for(i in which(data_clust==1)){
min_1<- min_1+mean(data_series[[i]])}
for(j in which(data_clust==2)){
min_2<- min_2+mean(data_series[[j]])}
min_1<-min_1/length(which(data_clust==1))
min_2<-min_2/length(which(data_clust==2))
}
if(cluss_one=="low" && min_1>min_2){
temp<-data_clust
temp[which(data_clust==1)]<-2
temp[which(data_clust==2)]<-1
data_clust<-temp
}
if(cluss_one=="high" && min_1<min_2){
temp<-data_clust
temp[which(data_clust==1)]<-2
temp[which(data_clust==2)]<-1
data_clust<-temp
}
return(data_clust)
}
############################################
# centr_par<-c("mean", "median", "shape", "dba", "pam")
# dis_par<-c("dtw","dtw2","dtw_basic","dtw_lb","lbk","lbi","sbd","gak","sdtw")
# for(i in centr_par){}
# for(j in dis_par){}
clusterer_proportion<-function(times_series,centr_par,dis_par,cluss_one){
acf_fun <- function(dat, ...) {lapply(dat, function(x) as.numeric(acf(x, lag.max = 100L, plot = FALSE)$acf))}
clus_results<-dtwclust::tsclust(times_series, k = 2L,
distance = dis_par, centroid = centr_par,
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
results<-list()
results$clus_results<-clus_results@clusinfo
results$labels<-flipper(times_series,clus_results@cluster,"low")
return(results)
}
##################
#this function goes inside of a list that has vectors, then cuts each vector and put the trimmed one inside the list
vector_in_list_cutter<-function(my_list,starter,ender){
for(i in 1:length(my_list)){
if(ender>length(my_list[[i]])){ender<-length(my_list[[i]])}
my_list[[i]]<-my_list[[i]][starter:ender]
}
return(my_list)
}
####################################
clustering<-function(objects,tool){
#clustering for 2012-2016
{
#partitional clustering
{
pc_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, k = 2L,
distance = "dtwz._lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
objects$data_table_2012_2016$partitional_one<-pc_one_train_2012_2016@cluster
#Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
{
if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
if(tool=="three"){dcv<-1000}else{dcv<-1.5}
tadpole_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, type = "tadpole", k = 2L,
trace = TRUE,
control = tadpole_control(dc = dcv, window.size = 2L))
objects$data_table_2012_2016$tadple_one<-tadpole_one_train_2012_2016@cluster
}
#fuzzy clustering
{
if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
for(i in 1:nrow(objects$data_table_2012_2016)){
for(j in 1:last_date){
if(objects$ts_2012_2016[[i]][j]==0){
objects$ts_2012_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
}
}
}
# i<-1
# j<-1
acf_fun <- function(dat, ...) {
lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
}
# Autocorrelation-based fuzzy c-means
fcm_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, type = "fuzzy", k = 2L,
preproc = acf_fun, distance = "L2",
seed = 123)
fc_one_train_2012_2016<-as.data.frame(fcm_one_train_2012_2016@fcluster)
fc_one_train_2012_2016$company_name<-rownames(fc_one_train_2012_2016)
fc_one_train_2012_2016$cluster<-NA
for(i in 1:nrow(fc_one_train_2012_2016)){
if(as.numeric(fc_one_train_2012_2016$cluster_1[i])>as.numeric(fc_one_train_2012_2016$cluster_2[i])){fc_one_train_2012_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2012_2016$cluster[i]<-"Cluster_2"}
}
objects$data_table_2012_2016$fc_one<-fc_one_train_2012_2016$cluster
#write.csv(objects$data_table_2012_2016,paste(base_folder,"Data/clustering_one_train_2012_2016.csv",sep = ""))
}
}
#clustering for 2007-2016
{
#partitional clustering
{
pc_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
objects$data_table_2007_2016$partitional_one<-pc_one_train_2007_2016@cluster
#Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
{
if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
if(tool=="three"){dcv<-1000}else{dcv<-1.5}
tadpole_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, type = "tadpole", k = 2L,
trace = TRUE,
control = tadpole_control(dc = dcv, window.size = 2L))
objects$data_table_2007_2016$tadple_one<-tadpole_one_train_2007_2016@cluster
}
#fuzzy clustering
{
if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
for(i in 1:nrow(objects$data_table_2007_2016)){
for(j in 1:last_date){
if(objects$ts_2007_2016[[i]][j]==0){
objects$ts_2007_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
}
}
}
acf_fun <- function(dat, ...) {
lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
}
# Autocorrelation-based fuzzy c-means
fcm_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, type = "fuzzy", k = 2L,
preproc = acf_fun, distance = "L2",
seed = 123)
fc_one_train_2007_2016<-as.data.frame(fcm_one_train_2007_2016@fcluster)
fc_one_train_2007_2016$company_name<-rownames(fc_one_train_2007_2016)
fc_one_train_2007_2016$cluster<-NA
for(i in 1:nrow(fc_one_train_2007_2016)){
if(as.numeric(fc_one_train_2007_2016$cluster_1[i])>as.numeric(fc_one_train_2007_2016$cluster_2[i])){fc_one_train_2007_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2007_2016$cluster[i]<-"Cluster_2"}
}
objects$data_table_2007_2016$fc_one<-fc_one_train_2007_2016$cluster
#write.csv(objects$data_table_2007_2016,paste(base_folder,"Data/clustering_one_train_2007_2016.csv",sep = ""))
}
}
#clustering for 2002-2016
{
#partitional clustering
{
pc_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
objects$data_table_2002_2016$partitional_one<-pc_one_train_2002_2016@cluster
#Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
{
if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
if(tool=="three"){dcv<-1000}else{dcv<-1.5}
tadpole_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, type = "tadpole", k = 2L,
trace = TRUE,
control = tadpole_control(dc = dcv, window.size = 2L))
objects$data_table_2002_2016$tadple_one<-tadpole_one_train_2002_2016@cluster
}
#fuzzy clustering
{
if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
for(i in 1:nrow(objects$data_table_2002_2016)){
for(j in 1:last_date){
if(objects$ts_2002_2016[[i]][j]==0){
objects$ts_2002_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
}
}
}
acf_fun <- function(dat, ...) {
lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
}
# Autocorrelation-based fuzzy c-means
fcm_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, type = "fuzzy", k = 2L,
preproc = acf_fun, distance = "L2",
seed = 123)
fc_one_train_2002_2016<-as.data.frame(fcm_one_train_2002_2016@fcluster)
fc_one_train_2002_2016$company_name<-rownames(fc_one_train_2002_2016)
fc_one_train_2002_2016$cluster<-NA
for(i in 1:nrow(fc_one_train_2002_2016)){
if(as.numeric(fc_one_train_2002_2016$cluster_1[i])>as.numeric(fc_one_train_2002_2016$cluster_2[i])){fc_one_train_2002_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2002_2016$cluster[i]<-"Cluster_2"}
}
objects$data_table_2002_2016$fc_one<-fc_one_train_2002_2016$cluster
#write.csv(objects$data_table_2002_2016,paste(base_folder,"Data/clustering_one_train_2002_2016.csv",sep = ""))
}
}
#clustering for 2007-2011
{
#partitional clustering
{
pc_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
objects$data_table_2007_2011$partitional_one<-pc_one_train_2007_2011@cluster
#Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
{
if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
if(tool=="three"){dcv<-1000}else{dcv<-1.5}
tadpole_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, type = "tadpole", k = 2L,
trace = TRUE,
control = tadpole_control(dc = dcv, window.size = 2L))
objects$data_table_2007_2011$tadple_one<-tadpole_one_train_2007_2011@cluster
}
#fuzzy clustering
{
if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
for(i in 1:nrow(objects$data_table_2007_2011)){
for(j in 1:last_date){
if(objects$ts_2007_2011[[i]][j]==0){
objects$ts_2007_2011[[i]][j]<-abs(runif(1, 0.1, 0.2))
}
}
}
acf_fun <- function(dat, ...) {
lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
}
# Autocorrelation-based fuzzy c-means
fcm_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, type = "fuzzy", k = 2L,
preproc = acf_fun, distance = "L2",
seed = 123)
fc_one_train_2007_2011<-as.data.frame(fcm_one_train_2007_2011@fcluster)
fc_one_train_2007_2011$company_name<-rownames(fc_one_train_2007_2011)
fc_one_train_2007_2011$cluster<-NA
for(i in 1:nrow(fc_one_train_2007_2011)){
if(as.numeric(fc_one_train_2007_2011$cluster_1[i])>as.numeric(fc_one_train_2007_2011$cluster_2[i])){fc_one_train_2007_2011$cluster[i]<-"Cluster_1"}else{fc_one_train_2007_2011$cluster[i]<-"Cluster_2"}
}
objects$data_table_2007_2011$fc_one<-fc_one_train_2007_2011$cluster
#write.csv(objects$data_table_2007_2011,paste(base_folder,"Data/clustering_one_train_2007_2011.csv",sep = ""))
}
}
#clustering for 2002-2006
{
#partitional clustering
{
pc_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
objects$data_table_2002_2006$partitional_one<-pc_one_train_2002_2006@cluster
#Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
{
if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
if(tool=="three"){dcv<-1000}else{dcv<-1.5}
tadpole_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, type = "tadpole", k = 2L,
trace = TRUE,
control = tadpole_control(dc = dcv, window.size = 2L))
objects$data_table_2002_2006$tadple_one<-tadpole_one_train_2002_2006@cluster
}
#fuzzy clustering
{
if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
for(i in 1:nrow(objects$data_table_2002_2006)){
for(j in 1:last_date){
if(objects$ts_2002_2006[[i]][j]==0){
objects$ts_2002_2006[[i]][j]<-abs(runif(1, 0.1, 0.2))
}
}
}
acf_fun <- function(dat, ...) {
lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
}
# Autocorrelation-based fuzzy c-means
fcm_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, type = "fuzzy", k = 2L,
preproc = acf_fun, distance = "L2",
seed = 123)
fc_one_train_2002_2006<-as.data.frame(fcm_one_train_2002_2006@fcluster)
fc_one_train_2002_2006$company_name<-rownames(fc_one_train_2002_2006)
fc_one_train_2002_2006$cluster<-NA
for(i in 1:nrow(fc_one_train_2002_2006)){
if(as.numeric(fc_one_train_2002_2006$cluster_1[i])>as.numeric(fc_one_train_2002_2006$cluster_2[i])){fc_one_train_2002_2006$cluster[i]<-"Cluster_1"}else{fc_one_train_2002_2006$cluster[i]<-"Cluster_2"}
}
objects$data_table_2002_2006$fc_one<-fc_one_train_2002_2006$cluster
#write.csv(objects$data_table_2002_2006,paste(base_folder,"Data/clustering_one_train_2002_2006.csv",sep = ""))
}
}
return(objects)
}
##################
clustering_lite<-function(objects,tool){
#clustering for 2012-2016
{
#partitional clustering
{
pc_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
gg<-as.data.frame(pc_one_train_2012_2016@cluster)
names(gg)<-paste("partitional_",tool,sep="")
objects$data_table_2012_2016<-cbind(objects$data_table_2012_2016,gg)
}
#clustering for 2007-2016
{
#partitional clustering
{
pc_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
gg<-as.data.frame(pc_one_train_2007_2016@cluster)
names(gg)<-paste("partitional_",tool,sep="")
objects$data_table_2007_2016<-cbind(objects$data_table_2007_2016,gg)
}
#clustering for 2002-2016
{
#partitional clustering
{
pc_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
gg<-as.data.frame(pc_one_train_2002_2016@cluster)
names(gg)<-paste("partitional_",tool,sep="")
objects$data_table_2002_2016<-cbind(objects$data_table_2002_2016,gg)
}
#clustering for 2007-2011
{
#partitional clustering
{
pc_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
gg<-as.data.frame(pc_one_train_2007_2011@cluster)
names(gg)<-paste("partitional_",tool,sep="")
objects$data_table_2007_2011<-cbind(objects$data_table_2007_2011,gg)
}
#clustering for 2002-2006
{
#partitional clustering
{
pc_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, k = 2L,
distance = "dtw_lb", centroid = "pam",
seed = 3247, trace = TRUE,
control = partitional_control(pam.precompute = FALSE),
args = tsclust_args(dist = list(window.size = 2L)))
}
#assigning cluster to each company
gg<-as.data.frame(pc_one_train_2002_2006@cluster)
names(gg)<-paste("partitional_",tool,sep="")
objects$data_table_2002_2006<-cbind(objects$data_table_2002_2006,gg)
}
return(objects)
}
##################
In this new analysis I mainly try t investigate BoardEx data and see what is its makeup. Specially what does qualification and netqork size mean.
It the following section, I perform Binomial clustering using dtw package in R
load("C:/Users/hza0020/Box/OHI/new/data_obj.Rdata")
set.seed(3247)
data_obj$CompuStat$object_one$binomial_clus<-list()
data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_one$culster_all_one$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016$partitional<-
data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_two$binomial_clus<-list()
data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_two$culster_all_two$ts_2007_2016[1:729],5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016$partitional<-0
data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016$partitional[1:729]<-
data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_three$binomial_clus<-list()
data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_three$culster_all_three$ts_2007_2016[1:760],4,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016$partitional<-0
data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016$partitional[1:760]<-
data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_four$binomial_clus<-list()
data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_four$culster_all_four$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_four$culster_all_four$data_table_2007_2016$partitional<-
data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_five$binomial_clus<-list()
data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_five$culster_all_five$ts_2007_2016,4,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016$partitional<-
data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_RandD$binomial_clus<-list()
data_obj$CompuStat$object_RandD$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_RandD$culster_all_RandD$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_RandD$culster_all_RandD$data_table_2007_2016$partitional<-
data_obj$CompuStat$object_RandD$binomial_clus$proportional_2007_2016$labels
data_obj$CompuStat$object_RDREV$binomial_clus<-list()
data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016<-
clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_RDREV$culster_all_RDREV$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_RDREV$culster_all_RDREV$data_table_2007_2016$partitional<-
data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016$labels
binomial_output<-matrix(0, ncol = 4, nrow = 6)
binomial_output<-as.data.frame(binomial_output)
row.names(binomial_output)<-c("Index_1","Index_2","Index_3","Index_4","Index_5","Index_6")
names(binomial_output)<-c("Mean intra-cluster Distance (long term)","Size","Mean intra-cluster Distance (short term)","Size")
options(scipen=999)
binomial_output[1,]<-round(unlist(data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[2,]<-round(unlist(data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[3,]<-round(unlist(data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[4,]<-round(unlist(data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[5,]<-round(unlist(data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[6,]<-round(unlist(data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
data_obj$CompuStat$summary<-list()
data_obj$CompuStat$summary$summary2007_2016<-binomial_output
cat("Binomial Clustering")
## Binomial Clustering
DT::datatable(data_obj$CompuStat$summary$summary2007_2016)
data_plot(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016,"R2008Q1","R2016Q4",
"partitional","R","Capital Expenditures / Depreciation","Date",1.5)
data_plot(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016,"R2008Q1","R2016Q4",
"partitional","R","Capital Expenditures / Depreciation","Date",1.5)
data_plot(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016[1:729,],"R2008Q1","R2016Q4",
"partitional","R","Revenue / Receivables","Date",1.5)
data_plot(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016[1:760,],"d2008Q1","d2016Q4",
"partitional","d","1 /|Revenue Growth-Earning Growth|","Date",9)
data_plot(data_obj$CompuStat$object_four$culster_all_four$data_table_2007_2016,"d2008Q1","d2016Q4",
"partitional","d","|Actual EPS - Predicted EPS|","Date",6)
data_plot(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016,"d2008Q1","d2016Q4",
"partitional","d","|Income Growth-Earning Per Share Growth|","Date",8)
# data_plot(data_obj$CompuStat$object_RandD$culster_all_RandD$data_table_2007_2016,"R2007Q1","R2016Q4",
# "partitional_RandD","R","Research and Development Expense","Date","NO",1.5)
data_plot(data_obj$CompuStat$object_RDREV$culster_all_RDREV$data_table_2007_2016,"R2008Q1","R2016Q4",
"partitional","R","R&D Cost / Revenue","Date",1.5)
In the next part, I try to adopt index-1,index-2,index-3, and index-5 for multiple clustering
# first I wan to find companies that have all of these indexes: index_1, index_2, index_3, index_5
{
cluster_check<-matrix(0, ncol = 7, nrow = nrow(data_obj$identifiers$tickers))
cluster_check<-as.data.frame(cluster_check)
names(cluster_check)<-c("tickers","index_1","index_2","index_3","index_4","index_5","index_6")
cluster_check$tickers<-data_obj$identifiers$tickers$TIC
cluster_check$index_1[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_one$culster_all_one$ts_2007_2016)))]<-1
cluster_check$index_2[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_two$culster_all_two$ts_2007_2016[1:729])))]<-1
cluster_check$index_3[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_three$culster_all_three$ts_2007_2016[1:760])))]<-1
cluster_check$index_4[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_four$culster_all_four$ts_2007_2016)))]<-1
cluster_check$index_5[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_five$culster_all_five$ts_2007_2016)))]<-1
cluster_check$index_6[which(as.character(data_obj$identifiers$tickers$TIC) %in%
intersect(cluster_check$tickers,names(data_obj$CompuStat$
object_RDREV$culster_all_RDREV$ts_2007_2016)))]<-1
cluster_check$index1235<-cluster_check$index_1*cluster_check$index_2*cluster_check$index_3*cluster_check$index_5
cluster_check<-cluster_check[which(cluster_check$index1235==1),]
cluster_check_mean<-cluster_check
}
# Then I make a multi dimensional data for multiple clustering
s_tart1<-which(colnames(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016)=="R2008Q1")
e_nd1<-which(colnames(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016)=="R2016Q4")
s_tart2<-which(colnames(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016)=="R2008Q1")
e_nd2<-which(colnames(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016)=="R2016Q4")
s_tart3<-which(colnames(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016)=="d2008Q1")
e_nd3<-which(colnames(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016)=="d2016Q4")
s_tart5<-which(colnames(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016)=="d2008Q1")
e_nd5<-which(colnames(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016)=="d2016Q4")
num.ts <- nrow(cluster_check) # number of time series
num.dim <- 4 # number of dimensions
len.ts <- (e_nd1-s_tart1+1) # length of time series
# generate Gaussian white noise
multi_dim_data <- array(dim = c(len.ts, num.ts, num.dim),data = NA)
for(i in 1:nrow(cluster_check)){
t1<-as.data.frame(t(data_obj$CompuStat$object_one$culster_all_one$
data_table_2007_2016[which(data_obj$CompuStat$object_one$
culster_all_one$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
colnames(t1)<-"one"
t2<-as.data.frame(t(data_obj$CompuStat$object_two$culster_all_two$
data_table_2007_2016[which(data_obj$CompuStat$object_two$
culster_all_two$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
colnames(t2)<-"two"
t3<-as.data.frame(t(data_obj$CompuStat$object_three$culster_all_three$
data_table_2007_2016[which(data_obj$CompuStat$object_three$
culster_all_three$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
colnames(t3)<-"three"
t5<-as.data.frame(t(data_obj$CompuStat$object_five$culster_all_five$
data_table_2007_2016[which(data_obj$CompuStat$object_five$
culster_all_five$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
colnames(t5)<-"five"
cluster_check_mean$index_1[i]<-mean(t1[,])
cluster_check_mean$index_2[i]<-mean(t2[,])
cluster_check_mean$index_3[i]<-mean(t3[,])
cluster_check_mean$index_5[i]<-mean(t5[,])
ts<-cbind(t1,t2,t3,t5)
for(j in 1:ncol(ts)){
multi_dim_data[ ,i,j]<-as.numeric(ts[,j])
}
}
#
multi_clustering<- pdclust(X = multi_dim_data, m=3,t=4)
cluster_check$clus3<-cutree(multi_clustering, k = 3)
Distribution of multivariate clustering is demonstrated below:
multi3_clus<-as.data.frame(table(cluster_check$clus3))
names(multi3_clus)<-c("Cluster_Label","Frequency")
results_comp2008_2016<-list()
results_comp2008_2016$summary<-multi3_clus
results_comp2008_2016$cluster_results<-cluster_check
data_obj$CompuStat$mult_cluster$results_comp2008_2016<-results_comp2008_2016
DT::datatable(multi3_clus)
seed_no<-123
org_sum<-read.csv("C:/Users/hza0020/Box/OHI/new/Organization_Summary_Analytics.csv")
# company_ID<-as.data.frame(table(org_sum$BoardID))[1]
# names(company_ID)<-"ID"
# write.csv(company_ID,"C:/Users/hza0020/Box/OHI/new/company_ID.csv", row.names=FALSE)
indiv_Edu<-read.csv("C:/Users/hza0020/Box/OHI/new/Profile_Edu.csv")
It seems Qualifications indicates number of certificates and/or professional trainings that the board member received
DT::datatable(as.data.frame(summary(indiv_Edu$Qualification)))
summary(org_sum$GenderRatio)[1]
## Min.
## 0.375
In the next step I want to perform a descriptive investigation over the Board member data:
data<-data_obj$pred$data_2007_2016
summary(data$GR_mean)
hist(data$GR_mean,
main="Histogram for Gender Ratio in the Board Members",
xlab="mean Gender Ratio",
border="black",
col="white",
las=1,
breaks=10)
summary(data$NS_mean)
hist(data$NS_mean,
main="Histogram for total network size of the Board Members",
xlab="total Network Size",
border="black",
col="white",
las=1,
breaks=10)
summary(data$NQ_mean)
hist(data$NQ_mean,
main="Histogram for total qualifications of the Board Members",
xlab="total qualifications",
border="black",
col="white",
las=1,
breaks=10)
summary(data$TR_mean)
hist(data$TR_mean,
main="Histogram for total time to toretirement of the Board Members",
xlab="total time to toretirement ",
border="black",
col="white",
las=1,
breaks=10)
In the following section, I join CompuStat data and BoardEx data
table2008_2016<-data_obj$CompuStat$data$data_table_org_all
data_obj$BoardEx$data$data_table_GR_all_2007_2016$mean8_16<-rowMeans(data_obj$BoardEx$data$
data_table_GR_all_2007_2016[c("GR2008","GR2009","GR2010","GR2011", "GR2012","GR2013",
"GR2014", "GR2015","GR2016")])
data_obj$BoardEx$data$data_table_netsize_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
data_table_netsize_all_2007_2016[c("NS2008","NS2009","NS2010","NS2011","NS2012","NS2013",
"NS2014","NS2015","NS2016")])
data_obj$BoardEx$data$data_table_noquals_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
data_table_noquals_all_2007_2016[c("NQ2008","NQ2009","NQ2010","NQ2011","NQ2012","NQ2013",
"NQ2014","NQ2015","NQ2016")])
data_obj$BoardEx$data$data_table_timeboard_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
data_table_timeboard_all_2007_2016[c("TB2008","TB2009","TB2010","TB2011","TB2012","TB2013",
"TB2014","TB2015","TB2016")])
data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
data_table_timeretire_all_2007_2016[c("TR2008","TR2009","TR2010","TR2011","TR2012","TR2013",
"TR2014","TR2015","TR2016")])
table2008_2016$GR_mean<-NA
table2008_2016$NS_mean<-NA
table2008_2016$NQ_mean<-NA
table2008_2016$TR_mean<-NA
table2008_2016$TARGET<-NA
table2008_2016$GR_mean[which(as.character(table2008_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_GR_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC))]
table2008_2016$NS_mean[which(as.character(table2008_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_netsize_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC))]
table2008_2016$NQ_mean[which(as.character(table2008_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_noquals_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC))]
table2008_2016$TR_mean[which(as.character(table2008_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC) %in%
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC))]
table2008_2016$TARGET[which(as.character(table2008_2016$TIC) %in%
intersect(table2008_2016$TIC,cluster_check$tickers))]<-
cluster_check$clus3[which(as.character(cluster_check$tickers) %in%
intersect(cluster_check$tickers,table2008_2016$TIC))]
table2008_2016<-table2008_2016[complete.cases(table2008_2016),]
industry_dummies<-mlr::createDummyFeatures(as.factor(table2008_2016$NAICS2))
names(industry_dummies)<-paste("c",colnames(industry_dummies),sep="")
industry_dummies<-industry_dummies[1:(ncol(industry_dummies)-1)]
table2008_2016<-cbind(table2008_2016,industry_dummies)
data_obj$pred_data$data_2008_2016<-table2008_2016
data_obj$pred_data$NAICS2_2008_2016<-industry_dummies
Distribution of multivariate clustering after including BoardEx data (excluding NAs):
multi3_clus_BX<-as.data.frame(table(data_obj$pred_data$data_2008_2016$TARGET))
names(multi3_clus_BX)<-c("Cluster_Label","Frequency")
data_obj$pred_data$summary2008_2016<-multi3_clus_BX
DT::datatable(multi3_clus_BX)
Next, I load the large object for prediction and then increasing accuracy over stacking approach
############################
{
# rm(list = ls()) # clears global environment (functions and variables)
# graphics.off() # clears graphics
# source("C:/Users/hza0020/Box/OHI/new/functions.R")
# load("C:/Users/hza0020/Box/OHI/new/data_obj.Rdata")
data<-data_obj$pred_data$data_2008_2016
data$id<-data$TIC
data$GR_mean<-as.numeric(data$GR_mean)
data$NS_mean<-as.numeric(data$NS_mean)
data$NQ_mean<-as.numeric(data$NQ_mean)
data$TR_mean<-as.numeric(data$TR_mean)
for(i in names(data_obj$pred_data$NAICS2_2008_2016)){
data[i]<-as.factor(data[,i])
}
cntr_type<-"simple"
TARGET<-"TARGET"
n_folds<-10}
############################
B_alg<-"NONE"
method_pred<-"regular"
vars<-c( "GR_mean","NS_mean","NQ_mean","TR_mean",names(data_obj$pred_data$NAICS2_2008_2016))
ensembler<-"NONE"
results<-list()
results$regular<-list()
results$ensemble<-list()
set.seed(12)
B_alg<-"SMOTE"
results$regular$regular_3SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$regular$regular_3NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"RUS"
results$regular$regular_3RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
data<-results$regular$regular_3NONE$ensemble_data$probs_all
#data<-regular_3SMOTE$ensemble_data$probs_all
method_pred<-"ensemble"
ensembler<-"rf"
vars<-c("tan","svm","nnet","cart")
B_alg<-"RUS"
results$ensemble$ensemble_NONE_3rf_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3rf_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3rf_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
ensembler<-"nnet"
vars<-c("tan","svm","rf","cart")
B_alg<-"RUS"
results$ensemble$ensemble_NONE_3nnet_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3nnet_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3nnet_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
ensembler<-"svm"
vars<-c("nnet","tan","rf","cart")
B_alg<-"RUS"
results$ensemble$ensemble_NONE_3svm_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3svm_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3svm_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
ensembler<-"tan"
vars<-c("nnet","svm","rf","cart")
B_alg<-"RUS"
results$ensemble$ensemble_NONE_3tan_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3tan_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3tan_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
ensembler<-"cart"
vars<-c("nnet","svm","rf","tan")
B_alg<-"RUS"
results$ensemble$ensemble_NONE_3cart_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3cart_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3cart_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
data_obj$results<-results
save(data_obj,file=paste(getwd(),"/","data_obj",".RData",sep=""))
Next, I show performance of the predictive models
cat("Here I show performance of nnet when we use SMOTE sample balancing")
## Here I show performance of nnet when we use SMOTE sample balancing
DT::datatable(results$regular$regular_3SMOTE$nnet$performance_nnet)
cat("Here I show performance of nnet when we use no sample balancing")
## Here I show performance of nnet when we use no sample balancing
DT::datatable(results$regular$regular_3NONE$nnet$performance_nnet)
cat("Here I show performance of nnet when we use RUS sample balancing")
## Here I show performance of nnet when we use RUS sample balancing
DT::datatable(results$regular$regular_3RUS$nnet$performance_nnet)
cat("Here I show importance of variables in nnet with no sample balancing")
## Here I show importance of variables in nnet with no sample balancing
DT::datatable(results$regular$regular_3NONE$nnet$resul_nnet_imp)
cat("performance of svm when we use SMOTE sample balancing")
## performance of svm when we use SMOTE sample balancing
DT::datatable(results$regular$regular_3SMOTE$svm$performance_svm)
cat("performance of svm when we use no sample balancing")
## performance of svm when we use no sample balancing
DT::datatable(results$regular$regular_3NONE$svm$performance_svm)
cat("performance of svm when we use RUS sample balancing")
## performance of svm when we use RUS sample balancing
DT::datatable(results$regular$regular_3RUS$svm$performance_svm)
#
cat("importance of variables in svm with no sample balancing")
## importance of variables in svm with no sample balancing
DT::datatable(results$regular$regular_3NONE$svm$resul_svm_imp)
#
cat("performance of rf when we use SMOTE sample balancing")
## performance of rf when we use SMOTE sample balancing
DT::datatable(results$regular$regular_3SMOTE$rf$performance_rf)
cat("performance of rf when we use no sample balancing")
## performance of rf when we use no sample balancing
DT::datatable(results$regular$regular_3NONE$rf$performance_rf)
cat("performance of rf when we use RUS sample balancing")
## performance of rf when we use RUS sample balancing
DT::datatable(results$regular$regular_3RUS$rf$performance_rf)
#
cat("importance of variables in rf with no sample balancing")
## importance of variables in rf with no sample balancing
DT::datatable(results$regular$regular_3NONE$rf$resul_rf_imp)
#
cat("performance of cart when we use SMOTE sample balancing")
## performance of cart when we use SMOTE sample balancing
DT::datatable(results$regular$regular_3SMOTE$cart$performance_cart)
cat("performance of cart when we use no sample balancing")
## performance of cart when we use no sample balancing
DT::datatable(results$regular$regular_3NONE$cart$performance_cart)
cat("performance of cart when we use RUS sample balancing")
## performance of cart when we use RUS sample balancing
DT::datatable(results$regular$regular_3RUS$cart$performance_cart)
#
cat("importance of variables in cart with no sample balancing")
## importance of variables in cart with no sample balancing
DT::datatable(results$regular$regular_3NONE$cart$resul_cart_imp)
#
cat("performance of tan when we use SMOTE sample balancing")
## performance of tan when we use SMOTE sample balancing
DT::datatable(results$regular$regular_3SMOTE$tan$performance_tan)
cat("performance of tan when we use no sample balancing")
## performance of tan when we use no sample balancing
DT::datatable(results$regular$regular_3NONE$tan$performance_tan)
cat("performance of tan when we use RUS sample balancing")
## performance of tan when we use RUS sample balancing
DT::datatable(results$regular$regular_3RUS$tan$performance_tan)
#
cat("importance of variables in tan with no sample balancing")
## importance of variables in tan with no sample balancing
DT::datatable(results$regular$regular_3NONE$tan$resul_tan_imp)
#
#
#
cat("performance of stacking & Rus sample balancing by rf, without sample balancing in the previous step")
## performance of stacking & Rus sample balancing by rf, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3rf_RUS$rf$performance_rf)
cat("performance of stacking & SMOTE sample balancing by rf, without sample balancing in the previous step")
## performance of stacking & SMOTE sample balancing by rf, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3rf_SMOTE$rf$performance_rf)
cat("performance of stacking & no sample balancing by rf, without sample balancing in the previous step")
## performance of stacking & no sample balancing by rf, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3rf_NONE$rf$performance_rf)
#
cat("performance of stacking & Rus sample balancing by nnet, without sample balancing in the previous step")
## performance of stacking & Rus sample balancing by nnet, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3nnet_RUS$nnet$performance_nnet)
cat("performance of stacking & SMOTE sample balancing by nnet, without sample balancing in the previous step")
## performance of stacking & SMOTE sample balancing by nnet, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3nnet_SMOTE$nnet$performance_nnet)
cat("performance of stacking & no sample balancing by nnet, without sample balancing in the previous step")
## performance of stacking & no sample balancing by nnet, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3nnet_NONE$nnet$performance_nnet)
#
cat("performance of stacking & Rus sample balancing by svm, without sample balancing in the previous step")
## performance of stacking & Rus sample balancing by svm, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3svm_RUS$svm$performance_svm)
cat("performance of stacking & SMOTE sample balancing by svm, without sample balancing in the previous step")
## performance of stacking & SMOTE sample balancing by svm, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3svm_SMOTE$svm$performance_svm)
cat("performance of stacking & no sample balancing by svm, without sample balancing in the previous step")
## performance of stacking & no sample balancing by svm, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3svm_NONE$svm$performance_svm)
#
cat("performance of stacking & Rus sample balancing by tan, without sample balancing in the previous step")
## performance of stacking & Rus sample balancing by tan, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3tan_RUS$tan$performance_tan)
cat("performance of stacking & SMOTE sample balancing by tan, without sample balancing in the previous step")
## performance of stacking & SMOTE sample balancing by tan, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3tan_SMOTE$tan$performance_tan)
cat("performance of stacking & no sample balancing by tan, without sample balancing in the previous step")
## performance of stacking & no sample balancing by tan, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3tan_NONE$tan$performance_tan)
#
cat("performance of stacking & Rus sample balancing by cart, without sample balancing in the previous step")
## performance of stacking & Rus sample balancing by cart, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3cart_RUS$cart$performance_cart)
cat("performance of stacking & SMOTE sample balancing by cart, without sample balancing in the previous step")
## performance of stacking & SMOTE sample balancing by cart, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3cart_SMOTE$cart$performance_cart)
cat("performance of stacking & no sample balancing by cart, without sample balancing in the previous step")
## performance of stacking & no sample balancing by cart, without sample balancing in the previous step
DT::datatable(results$ensemble$ensemble_NONE_3cart_NONE$cart$performance_cart)
Aubrun University, azg0074@auburn.edu↩
Aubrun University, hamid@auburn.edu↩