Investigating BoardEx data

In the following block, required R libraries are checked for existance and then installed if it’s needed.

rm(list = ls()) # clears global environment (functions and variables)
graphics.off() # clears graphics
if(!("pacman" %in% rownames(installed.packages()))){install.packages("pacman")}# pacman would be installed if it's not existed
library(pacman) 
p_load(caret, rio, dtwclust, graph, RBGL, arules, stringr, pROC, DT,papeR, pdc, mlr)

Object oriented programming is employed for this project. In this approach of programming, an empty object is created then in each step some attributes and data would be added to the object. Each step is basically a function that perform some processes over the object.Th following snippet is designed to define the functions.

defining functions and loading the required libraries

pred_three<-function(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds){
  #next line is just for finding threshold of prediction
  if(method_pred=="regular"){
    data$TARGET01<-as.numeric(data$TARGET)
    
    data$TARGET<-as.factor(data$TARGET)
    levels(data$TARGET)[1] <- "One"
    levels(data$TARGET)[2] <- "Two"
    levels(data$TARGET)[3] <- "Three"
    
    data$TARGET<-as.factor(data$TARGET)
    
    n_rows<-nrow(data)
    
    #____________________making the n folds_____________________________________
    {
      One_No<-length(which(data$TARGET=="One"))
      Two_No<-length(which(data$TARGET=="Two"))
      Three_No<-length(which(data$TARGET=="Three"))
      
      # Create the training and testing data sets
      
      kept_rows_One<-floor(One_No/n_folds)*n_folds
      kept_rows_Two<-floor(Two_No/n_folds)*n_folds
      kept_rows_Three<-floor(Three_No/n_folds)*n_folds
      
      data_One <- data[ which(data$TARGET=="One"), ]
      data_Two <- data[ which(data$TARGET=="Two"), ]
      data_Three <- data[ which(data$TARGET=="Three"), ]
      
      #shuffling
      data_One<-data_One[sample(nrow(data_One),kept_rows_One),]
      data_Two<-data_Two[sample(nrow(data_Two),kept_rows_Two),]
      data_Three<-data_Three[sample(nrow(data_Three),kept_rows_Three),]
      
      #naming the folds ID
      folds_One <- cut(seq(1,nrow(data_One)),breaks=n_folds,labels=FALSE)
      folds_Two <- cut(seq(1,nrow(data_Two)),breaks=n_folds,labels=FALSE)
      folds_Three <- cut(seq(1,nrow(data_Three)),breaks=n_folds,labels=FALSE)
      
      #
      data_One$folds<-folds_One
      data_Two$folds<-folds_Two
      data_Three$folds<-folds_Three
      #
      data_pred<-rbind(data_One,data_Two,data_Three)
      
    }
  }
  
  #next line is just for finding threshold of prediction
  if(method_pred=="ensemble"){
    x <- vector(mode="character", length=0)
    
    for(i in 1:length(vars)){
      x<-c(x,paste(vars[i],c("_1","_2"),sep=""))
    }
    
    vars<-x
    data_pred<-data}
  ####
  
  
  if(cntr_type=="simple"){
    control_<- trainControl(method = "none",  savePredictions = TRUE,
                            verboseIter = FALSE,returnResamp = "all",classProbs = TRUE)}
  if(cntr_type=="simplegbm"){
    control_<- trainControl(method = "cv", number = 1, savePredictions = TRUE,
                            verboseIter = FALSE,classProbs = TRUE, summaryFunction = twoClassSummary)
  }
  
  if(cntr_type=="simple_roc"){
    control_<- trainControl(method = "none",  savePredictions = TRUE,
                            verboseIter = FALSE,returnResamp = "all",classProbs = TRUE, summaryFunction = twoClassSummary)}
  
  if(cntr_type=="repeatedcv_roc"){
    control_<- trainControl(method="repeatedcv", number=2, repeats=5,  savePredictions = TRUE,
                            verboseIter = FALSE,returnResamp = "all",classProbs = TRUE, summaryFunction = twoClassSummary)}
  
  if(cntr_type=="cv_roc"){
    control_<- trainControl(method = "cv", number = 2, savePredictions = TRUE,
                            verboseIter = FALSE,classProbs = TRUE, summaryFunction = twoClassSummary)}
  
  
  
  rows_no<-floor(nrow(data_pred)/n_folds)
  
  resul_svm_raw<-matrix(0, ncol = n_folds, nrow = rows_no)
  resul_svm_raw<-as.data.frame(resul_svm_raw)
  names(resul_svm_raw) <- paste("fold_",1:n_folds, sep = "")
  
  resul_svm_prob1<-resul_svm_raw
  resul_svm_prob2<-resul_svm_raw
  resul_svm_prob3<-resul_svm_raw
  
  resul_crt_raw<-resul_svm_raw
  resul_crt_prob1<-resul_svm_raw
  resul_crt_prob2<-resul_svm_raw
  resul_crt_prob3<-resul_svm_raw
  
  resul_nnet_raw<-resul_svm_raw
  resul_nnet_prob1<-resul_svm_raw
  resul_nnet_prob2<-resul_svm_raw
  resul_nnet_prob3<-resul_svm_raw
  
  resul_tan_raw<-resul_svm_raw
  resul_tan_prob1<-resul_svm_raw
  resul_tan_prob2<-resul_svm_raw
  resul_tan_prob3<-resul_svm_raw
  
  resul_rf_raw<-resul_svm_raw
  resul_rf_prob1<-resul_svm_raw
  resul_rf_prob2<-resul_svm_raw
  resul_rf_prob3<-resul_svm_raw
  
  resul_cart_raw<-resul_svm_raw
  resul_cart_prob1<-resul_svm_raw
  resul_cart_prob2<-resul_svm_raw
  resul_cart_prob3<-resul_svm_raw
  
  resul_svm_imp<-matrix(0, ncol = n_folds, nrow = length(vars))
  resul_svm_imp<-as.data.frame(resul_svm_imp)
  names(resul_svm_imp) <- paste("fold_",1:n_folds, sep = "")
  resul_svm_imp$variables<-vars
  
  resul_rf_imp<-resul_svm_imp
  resul_nnet_imp<-resul_svm_imp
  resul_tan_imp<-resul_svm_imp
  resul_cart_imp<-resul_svm_imp
  
  tan_vars<-vars
  
  
  formula_format<-paste("TARGET ~ ",paste(vars, collapse="+"),sep = "")
  formula<-as.formula(formula_format)
  
  VarsData_Phase<-as.data.frame(vars)
  names(VarsData_Phase)<-"variables"
  tracker<-list()
  prob_results<-list()
  
  #i<-1
  all_vars<-c(vars,"TARGET")
  for ( i in 1: n_folds){
    test_data<-data_pred[data_pred$folds == i,]
    train_data<-data_pred[data_pred$folds != i,]
    tracker[[paste("fold_",i,sep="")]]<-test_data$id
    train_data<-train_data[,all_vars]
    test_data_pri<-test_data
    test_data<-test_data[,all_vars]
    
    tt<-as.data.frame(table(train_data$TARGET))
    #table(train_data$TARGET)
    common<-tt$Freq[1]
    rare<-tt$Freq[2]
    
    if(B_alg=="RUS"){train_data<-RUS_func(train_data,"TARGET")}
    if(B_alg=="SMOTE"){train_data<-SMOTE_func(train_data,formula,"TARGET")}
    
    if(ensembler=="rf" | method_pred=="regular"){
      
      mod_rf<-caret::train(formula,  data=train_data, method="rf",
                           trControl = control_, tuneGrid=expand.grid(mtry = length(vars)),
                           tuneLength = 5)
      
      {
        if(class(try(varImp(mod_rf),silent = TRUE))!="try-error"){
          resul_rf_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_rf_imp,model_imp_exc(mod_rf,VarsData_Phase),"variables")}
        resul_rf_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_rf, newdata=test_data, type="raw"))
        resul_rf_prob1[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[1]   
        resul_rf_prob2[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[2] 
        resul_rf_prob3[,paste("fold_",i, sep = "")]<-predict(mod_rf, newdata=test_data, type="prob")[3]
      }
      
    }
    
    if(ensembler=="cart" | method_pred=="regular"){
      
      mod_cart<-caret::train(formula,  data=train_data, method="rpart1SE",
                             trControl = control_, tuneLength = 5)
      
      {
        if(class(try(varImp(mod_cart),silent = TRUE))!="try-error"){
          resul_cart_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_cart_imp,model_imp_exc(mod_cart,VarsData_Phase),"variables")}
        resul_cart_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_cart, newdata=test_data, type="raw"))
        resul_cart_prob1[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[1]   
        resul_cart_prob2[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[2] 
        resul_cart_prob3[,paste("fold_",i, sep = "")]<-predict(mod_cart, newdata=test_data, type="prob")[3]
      }
      
    }
    
    
    if(ensembler=="svm" | method_pred=="regular"){    
      svm_cost<-.5
      svm_sigma<-.0001
      
      # if(class(try(
      #   
      #   mod_svm<-caret::train(formula,  data=train_data, method="svmRadial", family="binomial",
      #                         trControl = control_,  tuneGrid=expand.grid(C=svm_cost, sigma=svm_sigma),
      #                         tuneLength = 5)
      # ))=="try-error"){}
      mod_svm<-caret::train(formula,  data=train_data, method="svmLinear2", family="binomial",
                            trControl = control_,  tuneGrid=expand.grid(cost=svm_cost),
                            tuneLength = 5)
      
      {
        if(class(try(varImp(mod_svm),silent = TRUE))!="try-error"){
          resul_svm_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_svm_imp,model_imp_exc(mod_svm,VarsData_Phase),"variables")}
        resul_svm_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_svm, newdata=test_data, type="raw"))
        resul_svm_prob1[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[1]   
        resul_svm_prob2[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[2] 
        resul_svm_prob3[,paste("fold_",i, sep = "")]<-predict(mod_svm, newdata=test_data, type="prob")[3]
      }
    }
    
    
    if(ensembler=="nnet" | method_pred=="regular"){  
      mod_nnet<-caret::train(formula,  data=train_data, method="nnet", family="binomial",
                             trControl = control_,  tuneGrid=expand.grid(size=5, decay=0.1), MaxNWts=20000,
                             tuneLength = 5)
      
      {
        if(class(try(varImp(mod_nnet),silent = TRUE))!="try-error"){
          resul_nnet_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_nnet_imp,model_imp_exc(mod_nnet,VarsData_Phase),"variables")}
        resul_nnet_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_nnet, newdata=test_data, type="raw"))
        resul_nnet_prob1[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[1]   
        resul_nnet_prob2[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[2] 
        resul_nnet_prob3[,paste("fold_",i, sep = "")]<-predict(mod_nnet, newdata=test_data, type="prob")[3]
      }
      
      
      
    }
    ##===making data ready for TAN
    # disceretizing train and test
    if(ensembler=="tan" | method_pred=="regular"){ 
      data_tan_train<-train_data
      y_tr<-data_tan_train$TARGET
      data_tan_train$TARGET<-NULL
      data_tan_train$folds<-NULL
      data_tan_train$type<-"tr"
      
      data_tan_test<-test_data
      y_ts<-data_tan_test$TARGET
      data_tan_test$TARGET<-NULL
      data_tan_test$folds<-NULL
      data_tan_test$type<-"ts"
      data_tan<-rbind(data_tan_train,data_tan_test)
      
      
      
      for(j in 1:(ncol(data_tan)-1)){
        if(is.numeric(data_tan_train[,j])){if(is.numeric(data_tan[,j])){
          
          tester<-as.data.frame(data_tan[,j])
          if(tester[1,1]!=tester[nrow(data_tan),1]){
            data_tan[,j]<-discretize(data_tan[,j],method="interval",categories = floor(log(nrow(data_tan))))}else{
              data_tan[,j]<-discretize(data_tan[,j],method="interval",categories = 1)
            }
        }
        }
      }
      
      data_tan_train<-data_tan[which(data_tan$type=="tr"),]
      data_tan_test<-data_tan[which(data_tan$type=="ts"),]
      data_tan_train$type<-NULL
      data_tan_test$type<-NULL
      
      data_tan_train<-data_tan_train[, which(names(data_tan_train) %in% tan_vars)]
      mod_tan <- caret::train(data_tan_train, y_tr, method="tan",
                              trControl = control_, tuneGrid = expand.grid(score= "aic", smooth=0.5),
                              tuneLength = 5)
      
      {
        if(class(try(varImp(mod_tan),silent = TRUE))!="try-error"){
          resul_tan_imp<-matrix_filler(paste("fold_",i, sep = ""),resul_tan_imp,model_imp_exc(mod_tan,VarsData_Phase),"variables")}
        resul_tan_raw[,paste("fold_",i, sep = "")]<-as.character(predict(mod_tan, newdata=data_tan_test, type="raw"))
        resul_tan_prob1[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[1]
        resul_tan_prob2[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[2]
        resul_tan_prob3[,paste("fold_",i, sep = "")]<-predict(mod_tan, newdata=data_tan_test, type="prob")[3]
      }
      
    }
    
    
    probs_1<-as.data.frame(cbind(resul_svm_prob1[,paste("fold_",i, sep = "")],
                                 resul_nnet_prob1[,paste("fold_",i, sep = "")],
                                 resul_tan_prob1[,paste("fold_",i, sep = "")],
                                 resul_rf_prob1[,paste("fold_",i, sep = "")],
                                 resul_cart_prob1[,paste("fold_",i, sep = "")]))
    names(probs_1)<-c("svm_1","nnet_1","tan_1","rf_1","cart_1")
    
    probs_2<-as.data.frame(cbind(resul_svm_prob2[,paste("fold_",i, sep = "")],
                                 resul_nnet_prob2[,paste("fold_",i, sep = "")],
                                 resul_tan_prob2[,paste("fold_",i, sep = "")],
                                 resul_rf_prob2[,paste("fold_",i, sep = "")],
                                 resul_cart_prob2[,paste("fold_",i, sep = "")]))
    names(probs_2)<-c("svm_2","nnet_2","tan_2","rf_2","cart_2")
    
    probs_3<-as.data.frame(cbind(resul_svm_prob3[,paste("fold_",i, sep = "")],
                                 resul_nnet_prob3[,paste("fold_",i, sep = "")],
                                 resul_tan_prob3[,paste("fold_",i, sep = "")],
                                 resul_rf_prob3[,paste("fold_",i, sep = "")],
                                 resul_cart_prob3[,paste("fold_",i, sep = "")]))
    names(probs_3)<-c("svm_3","nnet_3","tan_3","rf_3","cart_3")
    
    probs<-cbind(probs_1,probs_2,test_data_pri[c("TARGET","id","folds")])
    
    prob_results[[paste("fold_",i,sep="")]]<-probs
    
    probs_all<-prob_results[["fold_1"]]
    for(i in 2:n_folds){
      probs_all<-rbind(probs_all,prob_results[[paste("fold_",i,sep="")]])
    }
    
    
    
  }
  #end of predicting part
  
  
  
  performance_svm<-matrix(0, ncol = n_folds, nrow = 4)
  performance_svm<-as.data.frame(performance_svm)
  row.names(performance_svm) <- c("level-1","level-2","level-3","Overall")
  names(performance_svm)[1:n_folds] <- paste("Fold", 1:n_folds, sep="")
  
  performance_cart<-performance_svm
  performance_nnet<-performance_svm
  performance_tan<-performance_svm
  performance_rf<-performance_svm
  
  for(i in 1:n_folds){
    
    data_One<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="One"),]
    data_Two<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="Two"),]
    data_Three<-data_pred[which(data_pred$folds==i & data_pred$TARGET=="Three"),]
    
    if(ensembler=="rf" | method_pred=="regular"){
      performance_rf[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_rf_raw[,i])
      performance_rf[1,i]<-accuracy_cal(data_One$TARGET,resul_rf_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_rf[2,i]<-accuracy_cal(data_Two$TARGET,resul_rf_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_rf[3,i]<-accuracy_cal(data_Three$TARGET,resul_rf_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
    }
    
    if(ensembler=="cart" | method_pred=="regular"){
      performance_cart[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_cart_raw[,i])
      performance_cart[1,i]<-accuracy_cal(data_One$TARGET,resul_cart_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_cart[2,i]<-accuracy_cal(data_Two$TARGET,resul_cart_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_cart[3,i]<-accuracy_cal(data_Three$TARGET,resul_cart_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
    }
    
    if(ensembler=="svm" | method_pred=="regular"){
      performance_svm[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_svm_raw[,i])
      performance_svm[1,i]<-accuracy_cal(data_One$TARGET,resul_svm_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_svm[2,i]<-accuracy_cal(data_Two$TARGET,resul_svm_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_svm[3,i]<-accuracy_cal(data_Three$TARGET,resul_svm_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
    }
    
    if(ensembler=="nnet" | method_pred=="regular"){
      performance_nnet[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_nnet_raw[,i])
      performance_nnet[1,i]<-accuracy_cal(data_One$TARGET,resul_nnet_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_nnet[2,i]<-accuracy_cal(data_Two$TARGET,resul_nnet_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_nnet[3,i]<-accuracy_cal(data_Three$TARGET,resul_nnet_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])
    }
    
    if(ensembler=="tan" | method_pred=="regular"){
      performance_tan[4,i]<-accuracy_cal(data_pred$TARGET[which(data_pred$folds==i)],resul_tan_raw[,i])
      performance_tan[1,i]<-accuracy_cal(data_One$TARGET,resul_tan_raw[match(data_One$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_tan[2,i]<-accuracy_cal(data_Two$TARGET,resul_tan_raw[match(data_Two$id , tracker[[paste("fold_",i,sep="")]]),i])
      performance_tan[3,i]<-accuracy_cal(data_Three$TARGET,resul_tan_raw[match(data_Three$id , tracker[[paste("fold_",i,sep="")]]),i])}
    
  }
  
  
  results<-list()
  
  performance<-list()
  nnet<-list()
  svm<-list()
  tan<-list()
  rf<-list()
  cart<-list()
  
  if(ensembler=="nnet" | method_pred=="regular"){
    nnet$performance_nnet<-performance_nnet
    nnet$resul_nnet_raw<-resul_nnet_raw  
    nnet$resul_nnet_prob1<-resul_nnet_prob1
    nnet$resul_nnet_prob2<-resul_nnet_prob2
    nnet$resul_nnet_prob3<-resul_nnet_prob3
    nnet$resul_nnet_imp<-resul_nnet_imp
    results$nnet<-nnet}
  
  if(ensembler=="svm" | method_pred=="regular"){
    svm$performance_svm<-performance_svm
    svm$resul_svm_raw<-resul_svm_raw  
    svm$resul_svm_prob1<-resul_svm_prob1
    svm$resul_svm_prob2<-resul_svm_prob2
    svm$resul_svm_prob3<-resul_svm_prob3
    svm$resul_svm_imp<-resul_svm_imp
    results$svm<-svm}
  
  if(ensembler=="tan" | method_pred=="regular"){
    tan$performance_tan<-performance_tan
    tan$resul_tan_raw<-resul_tan_raw  
    tan$resul_tan_prob1<-resul_tan_prob1
    tan$resul_tan_prob2<-resul_tan_prob2
    tan$resul_tan_prob3<-resul_tan_prob3
    tan$resul_tan_imp<-resul_tan_imp
    results$tan<-tan}
  
  if(ensembler=="rf" | method_pred=="regular"){
    rf$performance_rf<-performance_rf
    rf$resul_rf_raw<-resul_rf_raw  
    rf$resul_rf_prob1<-resul_rf_prob1
    rf$resul_rf_prob2<-resul_rf_prob2
    rf$resul_rf_prob3<-resul_rf_prob3
    rf$resul_rf_imp<-resul_rf_imp
    results$rf<-rf}
  
  if(ensembler=="cart" | method_pred=="regular"){
    cart$performance_cart<-performance_cart
    cart$resul_cart_raw<-resul_cart_raw  
    cart$resul_cart_prob1<-resul_cart_prob1
    cart$resul_cart_prob2<-resul_cart_prob2
    cart$resul_cart_prob3<-resul_cart_prob3
    cart$resul_cart_imp<-resul_cart_imp
    results$cart<-cart}
  
  results$ensemble_data<-list()
  results$ensemble_data$prob_results<-prob_results
  results$ensemble_data$probs_all<-probs_all
  
  
  
  return(results)
}
##################

model_imp_exc <- function(input_object,vars_org){
  imp_model<-varImp(input_object)
  imp_model<-as.data.frame(imp_model$importance)
  imp_model$importance<-rowMeans(imp_model)
  imp_model$importance<-round(imp_model$importance,2)
  imp_model$variables<-rownames(imp_model)
  
  
  model_imp<-matrix(0, ncol = 1, nrow = nrow(imp_model))
  model_imp<-as.data.frame(model_imp)
  
  
  model_imp$variables<-imp_model$variables
  model_imp$importance<-imp_model$importance
  model_imp$V1<-NULL 
  
  seeker<-model_imp
  seeker$real_name<-NA
  
  seeked<-as.data.frame(vars_org[,1])
  names(seeked)<-"variables"
  
  for (i in 1:nrow(seeker)){
    for (j in 1:nrow(seeked)) {
      if(grepl(seeked$variables[j],seeker$variables[i])){
        if(str_count(seeker$variables[i],"_")==str_count(seeked$variables[j],"_")){
          seeker$real_name[i]<-as.character(seeked$variables[j])}
      }
    }
  }
  # It's for the variables that created by us and are not in the original dataset
  for (i in 1:nrow(seeker)){
    if(is.na(seeker$real_name[i])){seeker$real_name[i]<-seeker$variables[i]}
  }
  str(seeker$importance)
  res.by<- by(seeker$importance, seeker$real_name, mean)
  res.by
  
  seeker<-as.data.frame(as.matrix(res.by))
  seeker$variables<-rownames(seeker)
  seeker$importance<-seeker$V1
  seeker$V1<-NULL
  
  
  return(seeker) 
}

###########

matrix_filler<-function(col_no,filling_matrix,filler_matrix,common_col){
  common_col<-as.character(common_col)
  
  filler_col<-match(common_col,names(filler_matrix))
  filling_col<-match(common_col,names(filling_matrix))
  
  if(filler_col-2==0){
    values<-1
  }else{values<-2}
  
  for (k in 1:nrow(filling_matrix)){
    for (j in 1:nrow(filler_matrix)) {
      if(grepl(filler_matrix[j,filler_col],filling_matrix[k,filling_col])){
        if(str_count(filling_matrix[k,filling_col],"_")==str_count(filler_matrix[j,filler_col],"_")){
          filling_matrix[k,col_no]<-filler_matrix[j,values]}
      }
    }
  }        
  return(filling_matrix) 
}

###########

accuracy_cal<-function(actual,predicted){
  counter<-0
  for(i in 1:length(actual)){
    if(actual[i]==predicted[i]){counter<-counter+1}
  }
  percent_accuracy<-round(100*(counter/length(actual)),2)
  return(percent_accuracy)
}

###########
RUS_func<-function(train_data,TARGET){
  leveler<-as.data.frame(table(train_data[TARGET]))
  leveler<-leveler[order(leveler[,2]),]
  rare<-min(leveler$Freq)
  common<-max(leveler$Freq)
  
  
  leveler<-as.data.frame(table(train_data[TARGET]))
  leveler<-leveler[order(leveler[,2]),]
  rare<-min(leveler$Freq)
  common<-max(leveler$Freq)  
  
  temp<-train_data[which(train_data[,TARGET]==leveler$Var1[1]),]
  
  
  for(j in 2:length(levels(train_data[,TARGET]))){
    
    tag<-leveler$Var1[j]
    
    kkk<-train_data[which(train_data[,TARGET]==tag),]
    kkk<-kkk[sample(rare),]
    temp<-rbind(temp,kkk)
    
  }
  train_data<-temp
  return(train_data)
}
###################################

SMOTE_func<-function(train_data,formula,TARGET){
  leveler<-as.data.frame(table(train_data[TARGET]))
  leveler<-leveler[order(leveler[,2]),]
  rare<-min(leveler$Freq)
  common<-max(leveler$Freq)

   for(j in 1:(length(levels(train_data[,TARGET]))-1)){

    leveler2<-as.data.frame(table(train_data[TARGET]))
    leveler2<-leveler2[order(leveler2[,2]),]
    rare<-min(leveler2$Freq)
    common<-max(leveler2$Freq)

    tag<-leveler$Var1[j]

    k<-1
    H_por_low<-1
    H_por_high<-1
    H_por_low<-(1/H_por_low)
    H_por_high<-H_por_high*100
    perc_over<-100*(common-rare)/(rare*H_por_low)*k
    perc_under<-100*(1/perc_over)*common

    train_data2<-DMwR::SMOTE(formula, train_data, perc.over = perc_over,perc.under = (((perc_over/100)+1)/(perc_over/100))*H_por_high)

    kkk<-train_data2[which(train_data2[,TARGET]==tag),]
    ggg<-train_data[which(train_data[,TARGET]!=tag),]
    train_data<-rbind(kkk,ggg)

  }

  return(train_data)
}
#===================

data_plot<-function(data,start_col,end_col,partition_col,remov_char,Y_label,X_label,coef_dis){
  start<-which(colnames(data)== start_col)
  end<-which(colnames(data)== end_col)
  
  clusters_data_clus1<-as.data.frame(colMeans(data[which(data[[partition_col]]==1),start:end]))
  clusters_data_clus2<-as.data.frame(colMeans(data[which(data[[partition_col]]==2),start:end]))
  clusters_data<-cbind(clusters_data_clus1,clusters_data_clus2)
  # if(toupper(flip_clusters)=="YES"){
  #   clusters_data[1]<-(1/clusters_data[,1])
  #   clusters_data[2]<-(1/clusters_data[,2])
  # }
  
  # if(mean(clusters_data[,1])>mean(clusters_data[,2])){
  #   clusters_data$mid<-clusters_data[,1]
  #   clusters_data[1]<-clusters_data[,2]
  #   clusters_data[2]<-clusters_data$mid
  #   clusters_data$mid<-NULL
  # }
  names(clusters_data)<-c("clus_1","clus_2")
  quar1<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[2]
  
  quar2<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[3]
  
  quar3<-quantile(c(clusters_data$clus_1,clusters_data$clus_2))[4]
  
  IQR<- quar3 - quar1
  
  maxy<-quar3+coef_dis*IQR
  miny<-quar1-coef_dis*IQR
  
  x<-gsub(remov_char,"",rownames(clusters_data))
  counter_years<-length(x)
  
  start_date<-as.numeric(substr(gsub(remov_char,"",start_col),1,4))
  end_date<-as.numeric(substr(gsub(remov_char,"",end_col),1,4))
  
  X_label<-paste(X_label," (",start_date," - ",end_date,")",sep="")
  par(mar = c(5,5,2,5),cex.lab=1.6)
  plot(clusters_data$clus_1,xaxt="n", type="p", pch=0,col="red3",ylim=c(miny,maxy),
       ylab= Y_label,ann = FALSE, 
       xlab=X_label,cex=1.2,cex.axis=1.1)
  
  line_dist<-2
  #next line is for tilted labels but for 2002-2016 I may deactive it
  {
    if((end_date-start_date)<5){
      axis(1,at=1:counter_years,labels=x,cex.axis=.9,las=2)
      line_dist<-2
    }
  }
  
  mtext(side = 1, text = X_label, line = line_dist,cex=1.2)
  mtext(side = 2, text = Y_label, line = 3,cex=1.2)
  
  
  
  
  par(new = T)
  plot(clusters_data$clus_2,  type="p", pch=2,axes=F, xlab=NA, ylab=NA,ylim=c(miny,maxy),cex=1.2,lty=2)
  
  #axis(side = 4)
  #mtext(side = 4, line = 3, "Cluster 2")
  legend("topleft",
         legend=c("High Performance","Low Performance"),
         lty=c(0,0), bty = "p",pch=c(2, 0), col=c("black","red3"),cex=1.2)
  
}
####################

flipper<-function(data_series,data_clust,cluss_one){
  
  min_1<-0
  min_2<-0
  work_1<-as.numeric(sum(which(data_clust==1)))
  work_2<-as.numeric(sum(which(data_clust==2)))

  if(work_1*work_2>0){
    for(i in which(data_clust==1)){
      min_1<- min_1+mean(data_series[[i]])}
    for(j in which(data_clust==2)){
      min_2<- min_2+mean(data_series[[j]])}
    
    min_1<-min_1/length(which(data_clust==1))
    min_2<-min_2/length(which(data_clust==2))
  }
  
  
  
  if(cluss_one=="low" && min_1>min_2){
    temp<-data_clust
    temp[which(data_clust==1)]<-2
    temp[which(data_clust==2)]<-1
    data_clust<-temp
  }
  if(cluss_one=="high" && min_1<min_2){
    temp<-data_clust
    temp[which(data_clust==1)]<-2
    temp[which(data_clust==2)]<-1
    data_clust<-temp
  }
  return(data_clust)
  
}
############################################

# centr_par<-c("mean", "median", "shape", "dba", "pam")
# dis_par<-c("dtw","dtw2","dtw_basic","dtw_lb","lbk","lbi","sbd","gak","sdtw")
# for(i in centr_par){}
# for(j in dis_par){}
  
clusterer_proportion<-function(times_series,centr_par,dis_par,cluss_one){
  acf_fun <- function(dat, ...) {lapply(dat, function(x) as.numeric(acf(x, lag.max = 100L, plot = FALSE)$acf))}
  

    clus_results<-dtwclust::tsclust(times_series, k = 2L, 
                                                              distance = dis_par, centroid = centr_par, 
                                                              seed = 3247, trace = TRUE,
                                                              control = partitional_control(pam.precompute = FALSE),
                                                              args = tsclust_args(dist = list(window.size = 2L)))
    results<-list()
    results$clus_results<-clus_results@clusinfo
    
    results$labels<-flipper(times_series,clus_results@cluster,"low")
    

  return(results)
  
}
##################

#this function goes inside of a list that has vectors, then cuts each vector and put the trimmed one inside the list
vector_in_list_cutter<-function(my_list,starter,ender){
  
  for(i in 1:length(my_list)){
    if(ender>length(my_list[[i]])){ender<-length(my_list[[i]])}
    my_list[[i]]<-my_list[[i]][starter:ender]
  }
  
  return(my_list)
}

####################################
clustering<-function(objects,tool){
    #clustering for 2012-2016
    {
      #partitional clustering
      {
        pc_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, k = 2L, 
                                          distance = "dtwz._lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      objects$data_table_2012_2016$partitional_one<-pc_one_train_2012_2016@cluster
      
      
      #Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
      {
        if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
        if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
        if(tool=="three"){dcv<-1000}else{dcv<-1.5}
        tadpole_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, type = "tadpole", k = 2L,
                                               trace = TRUE,
                                               control = tadpole_control(dc = dcv, window.size = 2L))
        
        objects$data_table_2012_2016$tadple_one<-tadpole_one_train_2012_2016@cluster
        
      }
      
      #fuzzy clustering
      {
        if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
        
        for(i in 1:nrow(objects$data_table_2012_2016)){
          for(j in 1:last_date){
            if(objects$ts_2012_2016[[i]][j]==0){
              objects$ts_2012_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
            }
          }
        }
        # i<-1
        # j<-1
        acf_fun <- function(dat, ...) {
          lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
        }
        
        # Autocorrelation-based fuzzy c-means
        
        fcm_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, type = "fuzzy", k = 2L,
                                           preproc = acf_fun, distance = "L2",
                                           seed = 123)
        
        fc_one_train_2012_2016<-as.data.frame(fcm_one_train_2012_2016@fcluster) 
        fc_one_train_2012_2016$company_name<-rownames(fc_one_train_2012_2016)
        fc_one_train_2012_2016$cluster<-NA
        
        for(i in 1:nrow(fc_one_train_2012_2016)){
          if(as.numeric(fc_one_train_2012_2016$cluster_1[i])>as.numeric(fc_one_train_2012_2016$cluster_2[i])){fc_one_train_2012_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2012_2016$cluster[i]<-"Cluster_2"}
        }
        objects$data_table_2012_2016$fc_one<-fc_one_train_2012_2016$cluster
        
        #write.csv(objects$data_table_2012_2016,paste(base_folder,"Data/clustering_one_train_2012_2016.csv",sep = ""))
        
      }
      
    }
    
    #clustering for 2007-2016
    {
      #partitional clustering
      {
        pc_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      objects$data_table_2007_2016$partitional_one<-pc_one_train_2007_2016@cluster
      
      
      #Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
      {
        if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
        if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
        if(tool=="three"){dcv<-1000}else{dcv<-1.5}
        tadpole_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, type = "tadpole", k = 2L,
                                               trace = TRUE,
                                               control = tadpole_control(dc = dcv, window.size = 2L))
        
        objects$data_table_2007_2016$tadple_one<-tadpole_one_train_2007_2016@cluster
        
      }
      
      #fuzzy clustering
      {
        if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
        
        for(i in 1:nrow(objects$data_table_2007_2016)){
          for(j in 1:last_date){
            if(objects$ts_2007_2016[[i]][j]==0){
              objects$ts_2007_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
            }
          }
        }
        
        acf_fun <- function(dat, ...) {
          lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
        }
        
        # Autocorrelation-based fuzzy c-means
        
        fcm_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, type = "fuzzy", k = 2L,
                                           preproc = acf_fun, distance = "L2",
                                           seed = 123)
        
        fc_one_train_2007_2016<-as.data.frame(fcm_one_train_2007_2016@fcluster)
        fc_one_train_2007_2016$company_name<-rownames(fc_one_train_2007_2016)
        fc_one_train_2007_2016$cluster<-NA
        
        for(i in 1:nrow(fc_one_train_2007_2016)){
          if(as.numeric(fc_one_train_2007_2016$cluster_1[i])>as.numeric(fc_one_train_2007_2016$cluster_2[i])){fc_one_train_2007_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2007_2016$cluster[i]<-"Cluster_2"}
        }
        objects$data_table_2007_2016$fc_one<-fc_one_train_2007_2016$cluster
        
        #write.csv(objects$data_table_2007_2016,paste(base_folder,"Data/clustering_one_train_2007_2016.csv",sep = ""))
        
      }
    }
    
    #clustering for 2002-2016
    {
      #partitional clustering
      {
        pc_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      objects$data_table_2002_2016$partitional_one<-pc_one_train_2002_2016@cluster
      
      
      #Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
      {
        if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
        if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
        if(tool=="three"){dcv<-1000}else{dcv<-1.5}
        tadpole_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, type = "tadpole", k = 2L,
                                               trace = TRUE,
                                               control = tadpole_control(dc = dcv, window.size = 2L))
        
        objects$data_table_2002_2016$tadple_one<-tadpole_one_train_2002_2016@cluster
        
      }
      
      #fuzzy clustering
      {
        if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
        for(i in 1:nrow(objects$data_table_2002_2016)){
          for(j in 1:last_date){
            if(objects$ts_2002_2016[[i]][j]==0){
              objects$ts_2002_2016[[i]][j]<-abs(runif(1, 0.1, 0.2))
            }
          }
        }
        
        acf_fun <- function(dat, ...) {
          lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
        }
        
        # Autocorrelation-based fuzzy c-means
        
        fcm_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, type = "fuzzy", k = 2L,
                                           preproc = acf_fun, distance = "L2",
                                           seed = 123)
        
        fc_one_train_2002_2016<-as.data.frame(fcm_one_train_2002_2016@fcluster)
        fc_one_train_2002_2016$company_name<-rownames(fc_one_train_2002_2016)
        fc_one_train_2002_2016$cluster<-NA
        
        for(i in 1:nrow(fc_one_train_2002_2016)){
          if(as.numeric(fc_one_train_2002_2016$cluster_1[i])>as.numeric(fc_one_train_2002_2016$cluster_2[i])){fc_one_train_2002_2016$cluster[i]<-"Cluster_1"}else{fc_one_train_2002_2016$cluster[i]<-"Cluster_2"}
        }
        objects$data_table_2002_2016$fc_one<-fc_one_train_2002_2016$cluster
        
        #write.csv(objects$data_table_2002_2016,paste(base_folder,"Data/clustering_one_train_2002_2016.csv",sep = ""))
        
      }
    }
    
    #clustering for 2007-2011
    {
      #partitional clustering
      {
        pc_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      objects$data_table_2007_2011$partitional_one<-pc_one_train_2007_2011@cluster
      
      
      #Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
      {
        if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
        if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
        if(tool=="three"){dcv<-1000}else{dcv<-1.5}
        tadpole_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, type = "tadpole", k = 2L,
                                               trace = TRUE,
                                               control = tadpole_control(dc = dcv, window.size = 2L))
        
        objects$data_table_2007_2011$tadple_one<-tadpole_one_train_2007_2011@cluster
        
      }
      
      #fuzzy clustering
      {
        if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
        for(i in 1:nrow(objects$data_table_2007_2011)){
          for(j in 1:last_date){
            if(objects$ts_2007_2011[[i]][j]==0){
              objects$ts_2007_2011[[i]][j]<-abs(runif(1, 0.1, 0.2))
            }
          }
        }
        
        acf_fun <- function(dat, ...) {
          lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
        }
        
        # Autocorrelation-based fuzzy c-means
        
        fcm_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, type = "fuzzy", k = 2L,
                                           preproc = acf_fun, distance = "L2",
                                           seed = 123)
        
        fc_one_train_2007_2011<-as.data.frame(fcm_one_train_2007_2011@fcluster)
        fc_one_train_2007_2011$company_name<-rownames(fc_one_train_2007_2011)
        fc_one_train_2007_2011$cluster<-NA
        
        for(i in 1:nrow(fc_one_train_2007_2011)){
          if(as.numeric(fc_one_train_2007_2011$cluster_1[i])>as.numeric(fc_one_train_2007_2011$cluster_2[i])){fc_one_train_2007_2011$cluster[i]<-"Cluster_1"}else{fc_one_train_2007_2011$cluster[i]<-"Cluster_2"}
        }
        objects$data_table_2007_2011$fc_one<-fc_one_train_2007_2011$cluster
        
        #write.csv(objects$data_table_2007_2011,paste(base_folder,"Data/clustering_one_train_2007_2011.csv",sep = ""))
        
      }
    }
    
    #clustering for 2002-2006
    {
      #partitional clustering
      {
        pc_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      objects$data_table_2002_2006$partitional_one<-pc_one_train_2002_2006@cluster
      
      
      #Time-series Anytime Density Peaks Clustering as proposed by Begum et al. (2015).
      {
        if(tool=="three" | tool=="four" | tool=="five"){dcv<-100}else{dcv<-1.5}
        if(tool=="five"|tool=="three"){dcv<-200}else{dcv<-1.5}
        if(tool=="three"){dcv<-1000}else{dcv<-1.5}
        tadpole_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, type = "tadpole", k = 2L,
                                               trace = TRUE,
                                               control = tadpole_control(dc = dcv, window.size = 2L))
        
        objects$data_table_2002_2006$tadple_one<-tadpole_one_train_2002_2006@cluster
        
      }
      
      #fuzzy clustering
      {
        if(tool=="three" | tool=="five"){last_date<-19}else{last_date<-(4*(2016-2012+1))}
        for(i in 1:nrow(objects$data_table_2002_2006)){
          for(j in 1:last_date){
            if(objects$ts_2002_2006[[i]][j]==0){
              objects$ts_2002_2006[[i]][j]<-abs(runif(1, 0.1, 0.2))
            }
          }
        }
        
        acf_fun <- function(dat, ...) {
          lapply(dat, function(x) as.numeric(acf(x, lag.max = 50L, plot = FALSE)$acf))
        }
        
        # Autocorrelation-based fuzzy c-means
        
        fcm_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, type = "fuzzy", k = 2L,
                                           preproc = acf_fun, distance = "L2",
                                           seed = 123)
        
        fc_one_train_2002_2006<-as.data.frame(fcm_one_train_2002_2006@fcluster)
        fc_one_train_2002_2006$company_name<-rownames(fc_one_train_2002_2006)
        fc_one_train_2002_2006$cluster<-NA
        
        for(i in 1:nrow(fc_one_train_2002_2006)){
          if(as.numeric(fc_one_train_2002_2006$cluster_1[i])>as.numeric(fc_one_train_2002_2006$cluster_2[i])){fc_one_train_2002_2006$cluster[i]<-"Cluster_1"}else{fc_one_train_2002_2006$cluster[i]<-"Cluster_2"}
        }
        objects$data_table_2002_2006$fc_one<-fc_one_train_2002_2006$cluster
        
        #write.csv(objects$data_table_2002_2006,paste(base_folder,"Data/clustering_one_train_2002_2006.csv",sep = ""))
        
      }
    }
    return(objects)
  }
##################
  
clustering_lite<-function(objects,tool){
    #clustering for 2012-2016
    {
      #partitional clustering
      {
        pc_one_train_2012_2016 <- tsclust(objects$ts_2012_2016, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      gg<-as.data.frame(pc_one_train_2012_2016@cluster)
      names(gg)<-paste("partitional_",tool,sep="")
      objects$data_table_2012_2016<-cbind(objects$data_table_2012_2016,gg)
    } 
    
    #clustering for 2007-2016
    {
      #partitional clustering
      {
        pc_one_train_2007_2016 <- tsclust(objects$ts_2007_2016, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      gg<-as.data.frame(pc_one_train_2007_2016@cluster)
      names(gg)<-paste("partitional_",tool,sep="")
      objects$data_table_2007_2016<-cbind(objects$data_table_2007_2016,gg)
    } 
    
    #clustering for 2002-2016
    {
      #partitional clustering
      {
        pc_one_train_2002_2016 <- tsclust(objects$ts_2002_2016, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      gg<-as.data.frame(pc_one_train_2002_2016@cluster)
      names(gg)<-paste("partitional_",tool,sep="")
      objects$data_table_2002_2016<-cbind(objects$data_table_2002_2016,gg)
    }  
    #clustering for 2007-2011
    {
      #partitional clustering
      {
        pc_one_train_2007_2011 <- tsclust(objects$ts_2007_2011, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      gg<-as.data.frame(pc_one_train_2007_2011@cluster)
      names(gg)<-paste("partitional_",tool,sep="")
      objects$data_table_2007_2011<-cbind(objects$data_table_2007_2011,gg)
    } 
    
    #clustering for 2002-2006
    {
      #partitional clustering
      {
        pc_one_train_2002_2006 <- tsclust(objects$ts_2002_2006, k = 2L, 
                                          distance = "dtw_lb", centroid = "pam", 
                                          seed = 3247, trace = TRUE,
                                          control = partitional_control(pam.precompute = FALSE),
                                          args = tsclust_args(dist = list(window.size = 2L)))
        
      }
      
      #assigning cluster to each company
      gg<-as.data.frame(pc_one_train_2002_2006@cluster)
      names(gg)<-paste("partitional_",tool,sep="")
      objects$data_table_2002_2006<-cbind(objects$data_table_2002_2006,gg)
    }
    return(objects)
}
##################

In this new analysis I mainly try t investigate BoardEx data and see what is its makeup. Specially what does qualification and netqork size mean.

It the following section, I perform Binomial clustering using dtw package in R

load("C:/Users/hza0020/Box/OHI/new/data_obj.Rdata")
set.seed(3247)

data_obj$CompuStat$object_one$binomial_clus<-list()
data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_one$culster_all_one$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016$partitional<-
  data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_two$binomial_clus<-list()
data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_two$culster_all_two$ts_2007_2016[1:729],5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016$partitional<-0
data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016$partitional[1:729]<-
  data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_three$binomial_clus<-list()
data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_three$culster_all_three$ts_2007_2016[1:760],4,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016$partitional<-0
data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016$partitional[1:760]<-
  data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_four$binomial_clus<-list()
data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_four$culster_all_four$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_four$culster_all_four$data_table_2007_2016$partitional<-
  data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_five$binomial_clus<-list()
data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_five$culster_all_five$ts_2007_2016,4,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016$partitional<-
  data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_RandD$binomial_clus<-list()
data_obj$CompuStat$object_RandD$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_RandD$culster_all_RandD$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_RandD$culster_all_RandD$data_table_2007_2016$partitional<-
  data_obj$CompuStat$object_RandD$binomial_clus$proportional_2007_2016$labels

data_obj$CompuStat$object_RDREV$binomial_clus<-list()
data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016<-
  clusterer_proportion(vector_in_list_cutter(data_obj$CompuStat$object_RDREV$culster_all_RDREV$ts_2007_2016,5,100),"pam","dtw_lb","low")
data_obj$CompuStat$object_RDREV$culster_all_RDREV$data_table_2007_2016$partitional<-
  data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016$labels

binomial_output<-matrix(0, ncol = 4, nrow = 6)
binomial_output<-as.data.frame(binomial_output)
row.names(binomial_output)<-c("Index_1","Index_2","Index_3","Index_4","Index_5","Index_6")
names(binomial_output)<-c("Mean intra-cluster Distance (long term)","Size","Mean intra-cluster Distance (short term)","Size")
options(scipen=999)
binomial_output[1,]<-round(unlist(data_obj$CompuStat$object_one$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[2,]<-round(unlist(data_obj$CompuStat$object_two$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[3,]<-round(unlist(data_obj$CompuStat$object_three$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[4,]<-round(unlist(data_obj$CompuStat$object_four$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[5,]<-round(unlist(data_obj$CompuStat$object_five$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
binomial_output[6,]<-round(unlist(data_obj$CompuStat$object_RDREV$binomial_clus$proportional_2007_2016$clus_results)[c(4,2,3,1)],2)
data_obj$CompuStat$summary<-list()
data_obj$CompuStat$summary$summary2007_2016<-binomial_output

cat("Binomial Clustering")

## Binomial Clustering

DT::datatable(data_obj$CompuStat$summary$summary2007_2016)

In the following part, I visualized the binomial clustering. The following figure demonstrate average of time series within each class of companies.

data_plot(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016,"R2008Q1","R2016Q4",
          "partitional","R","Capital Expenditures / Depreciation","Date",1.5)
data_plot(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016,"R2008Q1","R2016Q4",
          "partitional","R","Capital Expenditures / Depreciation","Date",1.5)

data_plot(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016[1:729,],"R2008Q1","R2016Q4",
          "partitional","R","Revenue / Receivables","Date",1.5)

data_plot(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016[1:760,],"d2008Q1","d2016Q4",
          "partitional","d","1 /|Revenue Growth-Earning Growth|","Date",9)

data_plot(data_obj$CompuStat$object_four$culster_all_four$data_table_2007_2016,"d2008Q1","d2016Q4",
          "partitional","d","|Actual EPS - Predicted EPS|","Date",6)

data_plot(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016,"d2008Q1","d2016Q4",
          "partitional","d","|Income Growth-Earning Per Share Growth|","Date",8)

# data_plot(data_obj$CompuStat$object_RandD$culster_all_RandD$data_table_2007_2016,"R2007Q1","R2016Q4",
#           "partitional_RandD","R","Research and Development Expense","Date","NO",1.5)

data_plot(data_obj$CompuStat$object_RDREV$culster_all_RDREV$data_table_2007_2016,"R2008Q1","R2016Q4",
          "partitional","R","R&D Cost / Revenue","Date",1.5)

In the next part, I try to adopt index-1,index-2,index-3, and index-5 for multiple clustering

# first I wan to find companies that have all of these indexes: index_1, index_2, index_3, index_5
{
cluster_check<-matrix(0, ncol = 7, nrow = nrow(data_obj$identifiers$tickers))
cluster_check<-as.data.frame(cluster_check)
names(cluster_check)<-c("tickers","index_1","index_2","index_3","index_4","index_5","index_6")
cluster_check$tickers<-data_obj$identifiers$tickers$TIC



cluster_check$index_1[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_one$culster_all_one$ts_2007_2016)))]<-1

cluster_check$index_2[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_two$culster_all_two$ts_2007_2016[1:729])))]<-1

cluster_check$index_3[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_three$culster_all_three$ts_2007_2016[1:760])))]<-1

cluster_check$index_4[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_four$culster_all_four$ts_2007_2016)))]<-1

cluster_check$index_5[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_five$culster_all_five$ts_2007_2016)))]<-1

cluster_check$index_6[which(as.character(data_obj$identifiers$tickers$TIC) %in% 
                      intersect(cluster_check$tickers,names(data_obj$CompuStat$
                      object_RDREV$culster_all_RDREV$ts_2007_2016)))]<-1

cluster_check$index1235<-cluster_check$index_1*cluster_check$index_2*cluster_check$index_3*cluster_check$index_5
cluster_check<-cluster_check[which(cluster_check$index1235==1),]
cluster_check_mean<-cluster_check

}

# Then I make a multi dimensional data for multiple clustering

      s_tart1<-which(colnames(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016)=="R2008Q1")
      e_nd1<-which(colnames(data_obj$CompuStat$object_one$culster_all_one$data_table_2007_2016)=="R2016Q4")
      
      s_tart2<-which(colnames(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016)=="R2008Q1")
      e_nd2<-which(colnames(data_obj$CompuStat$object_two$culster_all_two$data_table_2007_2016)=="R2016Q4")

      s_tart3<-which(colnames(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016)=="d2008Q1")
      e_nd3<-which(colnames(data_obj$CompuStat$object_three$culster_all_three$data_table_2007_2016)=="d2016Q4")
      
      s_tart5<-which(colnames(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016)=="d2008Q1")
      e_nd5<-which(colnames(data_obj$CompuStat$object_five$culster_all_five$data_table_2007_2016)=="d2016Q4")
      
      num.ts <- nrow(cluster_check) # number of time series
      num.dim <- 4 # number of dimensions
      len.ts <- (e_nd1-s_tart1+1) # length of time series
      
      # generate Gaussian white noise
      multi_dim_data <- array(dim = c(len.ts, num.ts, num.dim),data = NA)
      
 for(i in 1:nrow(cluster_check)){
      t1<-as.data.frame(t(data_obj$CompuStat$object_one$culster_all_one$
          data_table_2007_2016[which(data_obj$CompuStat$object_one$
          culster_all_one$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
      colnames(t1)<-"one"
        
        
      t2<-as.data.frame(t(data_obj$CompuStat$object_two$culster_all_two$
          data_table_2007_2016[which(data_obj$CompuStat$object_two$
          culster_all_two$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
      colnames(t2)<-"two"
      
      t3<-as.data.frame(t(data_obj$CompuStat$object_three$culster_all_three$
          data_table_2007_2016[which(data_obj$CompuStat$object_three$
          culster_all_three$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
          colnames(t3)<-"three"
          
      t5<-as.data.frame(t(data_obj$CompuStat$object_five$culster_all_five$
          data_table_2007_2016[which(data_obj$CompuStat$object_five$
          culster_all_five$data_table_2007_2016$TIC==cluster_check$tickers[i]),s_tart1:e_nd1]))
          colnames(t5)<-"five"
        
       cluster_check_mean$index_1[i]<-mean(t1[,])
       cluster_check_mean$index_2[i]<-mean(t2[,])
       cluster_check_mean$index_3[i]<-mean(t3[,])
       cluster_check_mean$index_5[i]<-mean(t5[,])
        
        ts<-cbind(t1,t2,t3,t5)
        
        for(j in 1:ncol(ts)){
          multi_dim_data[ ,i,j]<-as.numeric(ts[,j])
        }
        
      }
      #
    
    multi_clustering<- pdclust(X = multi_dim_data, m=3,t=4)

    cluster_check$clus3<-cutree(multi_clustering, k = 3)

Distribution of multivariate clustering is demonstrated below:

  multi3_clus<-as.data.frame(table(cluster_check$clus3))
  names(multi3_clus)<-c("Cluster_Label","Frequency")
  results_comp2008_2016<-list()
  results_comp2008_2016$summary<-multi3_clus
  results_comp2008_2016$cluster_results<-cluster_check
  data_obj$CompuStat$mult_cluster$results_comp2008_2016<-results_comp2008_2016
  DT::datatable(multi3_clus)

Here, I want to see what are the company IDs of my companies. I do it by utilizing my ticker IDs from “Organization Summary - Analytics” database then I go to the “Individual Profile Education” database for observing qualification database then I extract qualification and network size variables.

seed_no<-123
org_sum<-read.csv("C:/Users/hza0020/Box/OHI/new/Organization_Summary_Analytics.csv")
# company_ID<-as.data.frame(table(org_sum$BoardID))[1]
# names(company_ID)<-"ID"
# write.csv(company_ID,"C:/Users/hza0020/Box/OHI/new/company_ID.csv", row.names=FALSE)
indiv_Edu<-read.csv("C:/Users/hza0020/Box/OHI/new/Profile_Edu.csv")

It seems Qualifications indicates number of certificates and/or professional trainings that the board member received

   DT::datatable(as.data.frame(summary(indiv_Edu$Qualification)))

It seems the Gender Ratio variable refers to proportion of the male directors

   summary(org_sum$GenderRatio)[1]

##  Min. 
## 0.375

In the next step I want to perform a descriptive investigation over the Board member data:

data<-data_obj$pred$data_2007_2016
summary(data$GR_mean)
hist(data$GR_mean, 
     main="Histogram for Gender Ratio in the Board Members", 
     xlab="mean Gender Ratio", 
     border="black", 
     col="white",
     las=1, 
     breaks=10)

summary(data$NS_mean)
hist(data$NS_mean, 
     main="Histogram for total network size of the Board Members", 
     xlab="total Network Size", 
     border="black", 
     col="white",
     las=1, 
     breaks=10)

summary(data$NQ_mean)
hist(data$NQ_mean, 
     main="Histogram for total qualifications of the Board Members", 
     xlab="total qualifications", 
     border="black", 
     col="white",
     las=1, 
     breaks=10)

summary(data$TR_mean)
hist(data$TR_mean, 
     main="Histogram for total time to toretirement of the Board Members", 
     xlab="total time to toretirement ", 
     border="black", 
     col="white",
     las=1, 
     breaks=10)

In the following section, I join CompuStat data and BoardEx data

table2008_2016<-data_obj$CompuStat$data$data_table_org_all

data_obj$BoardEx$data$data_table_GR_all_2007_2016$mean8_16<-rowMeans(data_obj$BoardEx$data$
      data_table_GR_all_2007_2016[c("GR2008","GR2009","GR2010","GR2011", "GR2012","GR2013",
                                    "GR2014", "GR2015","GR2016")])

data_obj$BoardEx$data$data_table_netsize_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
      data_table_netsize_all_2007_2016[c("NS2008","NS2009","NS2010","NS2011","NS2012","NS2013", 
                                         "NS2014","NS2015","NS2016")])

data_obj$BoardEx$data$data_table_noquals_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
      data_table_noquals_all_2007_2016[c("NQ2008","NQ2009","NQ2010","NQ2011","NQ2012","NQ2013", 
                                         "NQ2014","NQ2015","NQ2016")]) 

data_obj$BoardEx$data$data_table_timeboard_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
      data_table_timeboard_all_2007_2016[c("TB2008","TB2009","TB2010","TB2011","TB2012","TB2013", 
                                           "TB2014","TB2015","TB2016")])


data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$mean8_16 <-rowMeans(data_obj$BoardEx$data$
      data_table_timeretire_all_2007_2016[c("TR2008","TR2009","TR2010","TR2011","TR2012","TR2013", 
                                           "TR2014","TR2015","TR2016")])



table2008_2016$GR_mean<-NA
table2008_2016$NS_mean<-NA
table2008_2016$NQ_mean<-NA
table2008_2016$TR_mean<-NA
table2008_2016$TARGET<-NA

table2008_2016$GR_mean[which(as.character(table2008_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_GR_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_GR_all_2007_2016$TIC))]


table2008_2016$NS_mean[which(as.character(table2008_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_netsize_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_netsize_all_2007_2016$TIC))]


table2008_2016$NQ_mean[which(as.character(table2008_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_noquals_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_noquals_all_2007_2016$TIC))]


table2008_2016$TR_mean[which(as.character(table2008_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC))]<-
data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$mean8_16[which(
as.character(data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC) %in% 
intersect(table2008_2016$TIC,data_obj$BoardEx$data$data_table_timeretire_all_2007_2016$TIC))]


table2008_2016$TARGET[which(as.character(table2008_2016$TIC) %in% 
intersect(table2008_2016$TIC,cluster_check$tickers))]<-
cluster_check$clus3[which(as.character(cluster_check$tickers) %in% 
intersect(cluster_check$tickers,table2008_2016$TIC))]

table2008_2016<-table2008_2016[complete.cases(table2008_2016),]

industry_dummies<-mlr::createDummyFeatures(as.factor(table2008_2016$NAICS2))
names(industry_dummies)<-paste("c",colnames(industry_dummies),sep="")
industry_dummies<-industry_dummies[1:(ncol(industry_dummies)-1)]

table2008_2016<-cbind(table2008_2016,industry_dummies)

data_obj$pred_data$data_2008_2016<-table2008_2016
data_obj$pred_data$NAICS2_2008_2016<-industry_dummies

Distribution of multivariate clustering after including BoardEx data (excluding NAs):

  multi3_clus_BX<-as.data.frame(table(data_obj$pred_data$data_2008_2016$TARGET))
  names(multi3_clus_BX)<-c("Cluster_Label","Frequency")

  data_obj$pred_data$summary2008_2016<-multi3_clus_BX
  
  DT::datatable(multi3_clus_BX)

Next, I load the large object for prediction and then increasing accuracy over stacking approach

############################
{
# rm(list = ls()) # clears global environment (functions and variables)
# graphics.off() # clears graphics
# source("C:/Users/hza0020/Box/OHI/new/functions.R")
# load("C:/Users/hza0020/Box/OHI/new/data_obj.Rdata")

data<-data_obj$pred_data$data_2008_2016
data$id<-data$TIC
data$GR_mean<-as.numeric(data$GR_mean)
data$NS_mean<-as.numeric(data$NS_mean)
data$NQ_mean<-as.numeric(data$NQ_mean)
data$TR_mean<-as.numeric(data$TR_mean)


for(i in names(data_obj$pred_data$NAICS2_2008_2016)){
  data[i]<-as.factor(data[,i])
}


cntr_type<-"simple"
TARGET<-"TARGET"
n_folds<-10}
############################

B_alg<-"NONE"
method_pred<-"regular"

vars<-c( "GR_mean","NS_mean","NQ_mean","TR_mean",names(data_obj$pred_data$NAICS2_2008_2016))

ensembler<-"NONE"

results<-list()
results$regular<-list()
results$ensemble<-list()

set.seed(12)

B_alg<-"SMOTE"
results$regular$regular_3SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$regular$regular_3NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"RUS"
results$regular$regular_3RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)

data<-results$regular$regular_3NONE$ensemble_data$probs_all
#data<-regular_3SMOTE$ensemble_data$probs_all

method_pred<-"ensemble"

ensembler<-"rf"
vars<-c("tan","svm","nnet","cart")

B_alg<-"RUS"
results$ensemble$ensemble_NONE_3rf_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3rf_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3rf_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)

ensembler<-"nnet"
vars<-c("tan","svm","rf","cart")

B_alg<-"RUS"
results$ensemble$ensemble_NONE_3nnet_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3nnet_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3nnet_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)

ensembler<-"svm"
vars<-c("nnet","tan","rf","cart")

B_alg<-"RUS"
results$ensemble$ensemble_NONE_3svm_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3svm_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3svm_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)


ensembler<-"tan"
vars<-c("nnet","svm","rf","cart")

B_alg<-"RUS"
results$ensemble$ensemble_NONE_3tan_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3tan_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3tan_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)


ensembler<-"cart"
vars<-c("nnet","svm","rf","tan")

B_alg<-"RUS"
results$ensemble$ensemble_NONE_3cart_RUS<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"SMOTE"
results$ensemble$ensemble_NONE_3cart_SMOTE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)
B_alg<-"NONE"
results$ensemble$ensemble_NONE_3cart_NONE<-pred_three(data,TARGET,vars,ensembler,B_alg,cntr_type,method_pred,n_folds)


data_obj$results<-results
save(data_obj,file=paste(getwd(),"/","data_obj",".RData",sep=""))

Next, I show performance of the predictive models

cat("Here I show performance of nnet when we use SMOTE sample balancing")

## Here I show performance of nnet when we use SMOTE sample balancing

DT::datatable(results$regular$regular_3SMOTE$nnet$performance_nnet)

cat("Here I show performance of nnet when we use no sample balancing")

## Here I show performance of nnet when we use no sample balancing

DT::datatable(results$regular$regular_3NONE$nnet$performance_nnet)

cat("Here I show performance of nnet when we use RUS sample balancing")

## Here I show performance of nnet when we use RUS sample balancing

DT::datatable(results$regular$regular_3RUS$nnet$performance_nnet)

cat("Here I show importance of variables in nnet with no sample balancing")

## Here I show importance of variables in nnet with no sample balancing

DT::datatable(results$regular$regular_3NONE$nnet$resul_nnet_imp)

cat("performance of svm when we use SMOTE sample balancing")

## performance of svm when we use SMOTE sample balancing

DT::datatable(results$regular$regular_3SMOTE$svm$performance_svm)

cat("performance of svm when we use no sample balancing")

## performance of svm when we use no sample balancing

DT::datatable(results$regular$regular_3NONE$svm$performance_svm)

cat("performance of svm when we use RUS sample balancing")

## performance of svm when we use RUS sample balancing

DT::datatable(results$regular$regular_3RUS$svm$performance_svm)

# 
cat("importance of variables in svm with no sample balancing")

## importance of variables in svm with no sample balancing

DT::datatable(results$regular$regular_3NONE$svm$resul_svm_imp)

# 
cat("performance of rf when we use SMOTE sample balancing")

## performance of rf when we use SMOTE sample balancing

DT::datatable(results$regular$regular_3SMOTE$rf$performance_rf)

cat("performance of rf when we use no sample balancing")

## performance of rf when we use no sample balancing

DT::datatable(results$regular$regular_3NONE$rf$performance_rf)

cat("performance of rf when we use RUS sample balancing")

## performance of rf when we use RUS sample balancing

DT::datatable(results$regular$regular_3RUS$rf$performance_rf)

# 
cat("importance of variables in rf with no sample balancing")

## importance of variables in rf with no sample balancing

DT::datatable(results$regular$regular_3NONE$rf$resul_rf_imp)

# 
cat("performance of cart when we use SMOTE sample balancing")

## performance of cart when we use SMOTE sample balancing

DT::datatable(results$regular$regular_3SMOTE$cart$performance_cart)

cat("performance of cart when we use no sample balancing")

## performance of cart when we use no sample balancing

DT::datatable(results$regular$regular_3NONE$cart$performance_cart)

cat("performance of cart when we use RUS sample balancing")

## performance of cart when we use RUS sample balancing

DT::datatable(results$regular$regular_3RUS$cart$performance_cart)

# 
cat("importance of variables in cart with no sample balancing")

## importance of variables in cart with no sample balancing

DT::datatable(results$regular$regular_3NONE$cart$resul_cart_imp)

# 
cat("performance of tan when we use SMOTE sample balancing")

## performance of tan when we use SMOTE sample balancing

DT::datatable(results$regular$regular_3SMOTE$tan$performance_tan)

cat("performance of tan when we use no sample balancing")

## performance of tan when we use no sample balancing

DT::datatable(results$regular$regular_3NONE$tan$performance_tan)

cat("performance of tan when we use RUS sample balancing")

## performance of tan when we use RUS sample balancing

DT::datatable(results$regular$regular_3RUS$tan$performance_tan)

# 
cat("importance of variables in tan with no sample balancing")

## importance of variables in tan with no sample balancing

DT::datatable(results$regular$regular_3NONE$tan$resul_tan_imp)

# 
# 
# 
cat("performance of stacking & Rus sample balancing by rf, without sample balancing in the previous step")

## performance of stacking & Rus sample balancing by rf, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3rf_RUS$rf$performance_rf)

cat("performance of stacking & SMOTE sample balancing by rf, without sample balancing in the previous step")

## performance of stacking & SMOTE sample balancing by rf, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3rf_SMOTE$rf$performance_rf)

cat("performance of stacking & no sample balancing by rf, without sample balancing in the previous step")

## performance of stacking & no sample balancing by rf, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3rf_NONE$rf$performance_rf)

# 
cat("performance of stacking & Rus sample balancing by nnet, without sample balancing in the previous step")

## performance of stacking & Rus sample balancing by nnet, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3nnet_RUS$nnet$performance_nnet)

cat("performance of stacking & SMOTE sample balancing by nnet, without sample balancing in the previous step")

## performance of stacking & SMOTE sample balancing by nnet, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3nnet_SMOTE$nnet$performance_nnet)

cat("performance of stacking & no sample balancing by nnet, without sample balancing in the previous step")

## performance of stacking & no sample balancing by nnet, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3nnet_NONE$nnet$performance_nnet)

# 
cat("performance of stacking & Rus sample balancing by svm, without sample balancing in the previous step")

## performance of stacking & Rus sample balancing by svm, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3svm_RUS$svm$performance_svm)

cat("performance of stacking & SMOTE sample balancing by svm, without sample balancing in the previous step")

## performance of stacking & SMOTE sample balancing by svm, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3svm_SMOTE$svm$performance_svm)

cat("performance of stacking & no sample balancing by svm, without sample balancing in the previous step")

## performance of stacking & no sample balancing by svm, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3svm_NONE$svm$performance_svm)

# 
cat("performance of stacking & Rus sample balancing by tan, without sample balancing in the previous step")

## performance of stacking & Rus sample balancing by tan, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3tan_RUS$tan$performance_tan)

cat("performance of stacking & SMOTE sample balancing by tan, without sample balancing in the previous step")

## performance of stacking & SMOTE sample balancing by tan, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3tan_SMOTE$tan$performance_tan)

cat("performance of stacking & no sample balancing by tan, without sample balancing in the previous step")

## performance of stacking & no sample balancing by tan, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3tan_NONE$tan$performance_tan)

# 
cat("performance of stacking & Rus sample balancing by cart, without sample balancing in the previous step")

## performance of stacking & Rus sample balancing by cart, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3cart_RUS$cart$performance_cart)

cat("performance of stacking & SMOTE sample balancing by cart, without sample balancing in the previous step")

## performance of stacking & SMOTE sample balancing by cart, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3cart_SMOTE$cart$performance_cart)

cat("performance of stacking & no sample balancing by cart, without sample balancing in the previous step")

## performance of stacking & no sample balancing by cart, without sample balancing in the previous step

DT::datatable(results$ensemble$ensemble_NONE_3cart_NONE$cart$performance_cart)

Aubrun University, azg0074@auburn.edu ↩
Aubrun University, hamid@auburn.edu ↩

Investigating BoardEx data

Ashish Gupta¹

Hamidreza Ahady Dolatsara²

October 10, 2018

Investigating BoardEx data

Ashish Gupta1

Hamidreza Ahady Dolatsara2

October 10, 2018

Ashish Gupta¹

Hamidreza Ahady Dolatsara²