R Notebook

suppressMessages(library(readr))
suppressMessages(library("FRESA.CAD"))
suppressMessages(library(dplyr))
#install.packages("FRESA.CAD")
#setwd("~/GitHub/Exploring_Learning_Algorithms_4_CADx/synthetic experiments'")

 #Import the dataset and convert variables
# Load the table, skipping the `#` index column while parsing.
pokedex <- read_csv(
  "../datasets/pokedex.csv",
  col_types = cols(`#` = col_skip())
)
# Overwrite the class vector so the object is handled as a plain data.frame.
class(pokedex) <- "data.frame"
names(pokedex) <- c(
  "Name", "Type1", "Type2", "Total", "HP", "Attack", "Defense",
  "Sp_Atk", "Sp_Def", "Speed", "Generation", "Legendary"
)
# Recode the outcome flag as numeric 0/1.
pokedex$Legendary <- as.numeric(pokedex$Legendary == TRUE)
# Replace each type label with the numeric code of its factor level.
pokedex[c("Type1", "Type2")] <- lapply(
  pokedex[c("Type1", "Type2")],
  function(type_label) as.numeric(as.factor(type_label))
)
# A missing secondary type is still NA at this point; give it the code 99.
pokedex$Type2 <- replace(pokedex$Type2, is.na(pokedex$Type2), 99)
#save(pokedex,file = "tidy_pokedex.Rdata")

# Model creation and test performance ----

#load("tidy_pokedex.Rdata")
# The name column is removed so it is not part of `Legendary ~ .`.
pokedex[["Name"]] <- NULL
#pokedex$Sp_Atk <- NULL
#pokedex$Sp_Def <-NULL

# Baseline fit on the real features only.
# NOTE(review): the object name suggests HLCM_EM's defaults are BSWiMS + KNN;
# confirm against the FRESA.CAD documentation.
model_bswims.knn <- HLCM_EM(Legendary ~ ., pokedex)

## [+-][+-][+-]( 0 )< 748 , 757 , 55 , 0 >

# Performance statistics for the baseline model. The predictions are made on
# the same data frame that was used for fitting; the observed outcome is
# bound in front of them.
model_bswims.knn_bs <- predictionStats_binary(
  cbind(pokedex[["Legendary"]], predict(model_bswims.knn, pokedex)),
  "LC_BSWIMS.KNN",
  cex = 0.8
)

## LC_BSWIMS.KNN

## Warning in if (class(modelPredictions) == "data.frame") {: the condition has
## length > 1 and only the first element will be used

#' Create synthetic features by permuting real features
#'
#' Each synthetic feature is a randomly chosen feature column of `theData`
#' (chosen with replacement) whose values are shuffled across rows. It keeps
#' the values of a real feature but not their alignment with the rows.
#'
#' @param theData A data frame holding the real features and, optionally,
#'   the outcome column.
#' @param theOutcome Name of the outcome column. It is excluded from the pool
#'   of columns that may be permuted.
#' @param n_synt Number of synthetic features to create (non-negative;
#'   fractional values are truncated).
#' @return A data frame with `nrow(theData)` rows and `n_synt` columns named
#'   `synt1`, `synt2`, ...
Permutation_sampling <- function(theData, theOutcome, n_synt) {
  stopifnot(
    is.data.frame(theData),
    is.numeric(n_synt), length(n_synt) == 1, !is.na(n_synt), n_synt >= 0
  )
  n_synt <- as.integer(n_synt)

  # Drop the outcome by the name held in `theOutcome`. The `$` form
  # (`theData$theOutcome <- NULL`) looks for a column literally called
  # "theOutcome", which leaves the real outcome available as a source for
  # synthetic features.
  predictors <- theData[setdiff(names(theData), theOutcome)]
  n_obs <- nrow(predictors)
  n_var <- ncol(predictors)

  # Nothing requested: return an empty frame with the right number of rows.
  if (n_synt == 0L) {
    return(data.frame(matrix(ncol = 0, nrow = n_obs)))
  }
  if (n_var == 0L) {
    stop("theData has no feature columns left to permute", call. = FALSE)
  }

  # Pick the source column of every synthetic feature up front, then shuffle
  # each one. Indexing with sample.int() avoids the sample(x) surprise when a
  # column holds a single number.
  source_cols <- sample.int(n_var, n_synt, replace = TRUE)
  synt_cols <- lapply(
    source_cols,
    function(j) predictors[[j]][sample.int(n_obs)]
  )
  names(synt_cols) <- paste0("synt", seq_len(n_synt))

  as.data.frame(synt_cols, stringsAsFactors = FALSE)
}

10 synthetic features

# Seed the RNG before the random feature generation below.
set.seed(42)
# Generate 10 permuted features and append them to the real ones.
synt_feat <- Permutation_sampling(
  theData = pokedex,
  theOutcome = "Legendary",
  n_synt = 10
)

pokedex_10synt <- cbind(pokedex, synt_feat)

# Refit with the same formula on the widened data frame.
model_bswims.knn_10 <- HLCM_EM(Legendary ~ ., pokedex_10synt)

## [+-][+-][+-]( 0 )< 748 , 757 , 55 , 0 >

# Performance statistics for the model fitted with 10 synthetic features,
# predicted on the data frame it was fitted on.
model_bswims.knn_bs_10 <- predictionStats_binary(
  cbind(
    pokedex_10synt[["Legendary"]],
    predict(model_bswims.knn_10, pokedex_10synt)
  ),
  "LC_BSWIMS.KNN 10",
  cex = 0.89
)

## LC_BSWIMS.KNN 10

## Warning in if (class(modelPredictions) == "data.frame") {: the condition has
## length > 1 and only the first element will be used

100 synthetic features

# Generate 100 permuted features.
synt_feat100 <- Permutation_sampling(
  theData = pokedex,
  theOutcome = "Legendary",
  n_synt = 100
)
# Append them to the real features in a new data frame.
pokedex_100synt <- cbind(pokedex, synt_feat100)

# Refit with the same formula on the widened data frame.
model_bswims.knn_100 <- HLCM_EM(Legendary ~ ., pokedex_100synt)

## [+-][+-][+-]( 0 )< 748 , 757 , 55 , 0 >

# Performance statistics for the model fitted with 100 synthetic features,
# predicted on the data frame it was fitted on.
model_bswims.knn_bs_100 <- predictionStats_binary(
  cbind(
    pokedex_100synt[["Legendary"]],
    predict(model_bswims.knn_100, pokedex_100synt)
  ),
  "LC_BSWIMS.KNN 100",
  cex = 0.89
)

## LC_BSWIMS.KNN 100

## Warning in if (class(modelPredictions) == "data.frame") {: the condition has
## length > 1 and only the first element will be used

1000 synthetic features

# Generate 1000 permuted features.
synt_feat1k <- Permutation_sampling(
  theData = pokedex,
  theOutcome = "Legendary",
  n_synt = 1000
)
# Append them to the real features in a new data frame.
pokedex_1ksynt <- cbind(pokedex, synt_feat1k)

# Refit with the same formula on the widened data frame.
model_bswims.knn_1k <- HLCM_EM(Legendary ~ ., pokedex_1ksynt)

## [+-][+-][+-]( 0 )< 748 , 757 , 55 , 0 >

# Performance statistics for the model fitted with 1000 synthetic features,
# predicted on the data frame it was fitted on.
model_bswims.knn_bs_1k <- predictionStats_binary(
  cbind(
    pokedex_1ksynt[["Legendary"]],
    predict(model_bswims.knn_1k, pokedex_1ksynt)
  ),
  "LC_BSWIMS.KNN 1k",
  cex = 0.89
)

## LC_BSWIMS.KNN 1k

## Warning in if (class(modelPredictions) == "data.frame") {: the condition has
## length > 1 and only the first element will be used