knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
use_python("/anaconda3/bin/python3.7")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.switch_backend('agg')
#source("vinc_gather_data.R")
# data = gatherData()
# print(data)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
combi <- read_csv("data-clean/final_combi.csv")
## Parsed with column specification:
## cols(
## asx = col_double(),
## oecd_li = col_double(),
## abs_imports = col_double(),
## abs_exports = col_double(),
## gold_price_london_fixing = col_double(),
## unemployment = col_double(),
## rba_cash_rate = col_double(),
## yearly_inflation = col_double(),
## quarterly_inflation = col_double(),
## exchange_rate = col_double(),
## djia = col_double(),
## pe_ratio = col_double(),
## dividend = col_double(),
## iron = col_double(),
## oil = col_double(),
## Date = col_date(format = "")
## )
combizs <- read_csv("data-clean/final_combi_zs.csv")
## Parsed with column specification:
## cols(
## asx = col_double(),
## oecd_li = col_double(),
## abs_imports = col_double(),
## abs_exports = col_double(),
## gold_price_london_fixing = col_double(),
## unemployment = col_double(),
## rba_cash_rate = col_double(),
## yearly_inflation = col_double(),
## quarterly_inflation = col_double(),
## exchange_rate = col_double(),
## djia = col_double(),
## pe_ratio = col_double(),
## dividend = col_double(),
## iron = col_double(),
## oil = col_double(),
## binary_asx = col_double()
## )
# Format date for combi
combi$Date = as.Date(combi$Date, "%Y-%m-%d")
glimpse(combi)
## Observations: 175
## Variables: 16
## $ asx <dbl> 4107, 4173, 4110, 3983, 4106, 4278, 438…
## $ oecd_li <dbl> 100.21160, 100.14570, 100.06870, 100.01…
## $ abs_imports <dbl> 11154780, 11123461, 12699350, 12569908,…
## $ abs_exports <dbl> 9232638, 9503409, 10451659, 11566650, 1…
## $ gold_price_london_fixing <dbl> 438.00, 423.80, 436.55, 427.50, 433.20,…
## $ unemployment <dbl> 5.056507, 5.660385, 5.796897, 5.503269,…
## $ rba_cash_rate <dbl> 5.25, 5.25, 5.50, 5.50, 5.50, 5.50, 5.5…
## $ yearly_inflation <dbl> 2.5, 2.5, 2.5, 2.4, 2.4, 2.4, 2.5, 2.5,…
## $ quarterly_inflation <dbl> 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.6, 0.6,…
## $ exchange_rate <dbl> 0.7744, 0.7905, 0.7719, 0.7811, 0.7557,…
## $ djia <dbl> 10490, 10766, 10504, 10193, 10467, 1027…
## $ pe_ratio <dbl> 17.0, 15.1, 14.9, 14.3, 14.3, 15.2, 15.…
## $ dividend <dbl> 3.6, 3.8, 3.9, 4.0, 4.0, 3.8, 3.7, 3.4,…
## $ iron <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,…
## $ oil <dbl> 47, 48, 54, 53, 50, 56, 59, 65, 66, 62,…
## $ Date <date> 2005-01-01, 2005-02-01, 2005-03-01, 20…
# Add date to combi zs
date_seq = seq(as.Date("2005-01-01"), by = "1 month", length.out = nrow(combizs))
combizs_d = combizs
combizs_d$Date = date_seq
After munging and merging the data, we have 15 indicator variables (plus a Date column) and 175 observations, one per month from January 2005 through to 2019.
Missing data can have a large impact on modelling, so we recheck for missing values using a missingness map, with the variables on the y-axis and the observations on the x-axis. In the plot below missing cells would appear in red; almost none do, so there is essentially no missing data in this dataset.
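As a numeric cross-check of the map, missing values can also be counted per column with dplyr (a minimal sketch):
# Count NA values in each column of the merged data
combi %>% summarise_all(~ sum(is.na(.)))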
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
missmap(combi, col=c("red", "lightgreen"), legend=FALSE)
## Warning in if (class(obj) == "amelia") {: the condition has length > 1 and
## only the first element will be used
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.
# library(nycflights13)
# missmap(flights, col=c("blue", "lightgreen"), legend=FALSE)
# combi %>% summarise_all(funs(sum(is.na(.))))
# combi %>%
# rowid_to_column() %>%
# filter(is.na(abs_imports))
combi <- combi %>% drop_na()
glimpse(combi)
## Observations: 174
## Variables: 16
## $ asx <dbl> 4107, 4173, 4110, 3983, 4106, 4278, 438…
## $ oecd_li <dbl> 100.21160, 100.14570, 100.06870, 100.01…
## $ abs_imports <dbl> 11154780, 11123461, 12699350, 12569908,…
## $ abs_exports <dbl> 9232638, 9503409, 10451659, 11566650, 1…
## $ gold_price_london_fixing <dbl> 438.00, 423.80, 436.55, 427.50, 433.20,…
## $ unemployment <dbl> 5.056507, 5.660385, 5.796897, 5.503269,…
## $ rba_cash_rate <dbl> 5.25, 5.25, 5.50, 5.50, 5.50, 5.50, 5.5…
## $ yearly_inflation <dbl> 2.5, 2.5, 2.5, 2.4, 2.4, 2.4, 2.5, 2.5,…
## $ quarterly_inflation <dbl> 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.6, 0.6,…
## $ exchange_rate <dbl> 0.7744, 0.7905, 0.7719, 0.7811, 0.7557,…
## $ djia <dbl> 10490, 10766, 10504, 10193, 10467, 1027…
## $ pe_ratio <dbl> 17.0, 15.1, 14.9, 14.3, 14.3, 15.2, 15.…
## $ dividend <dbl> 3.6, 3.8, 3.9, 4.0, 4.0, 3.8, 3.7, 3.4,…
## $ iron <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,…
## $ oil <dbl> 47, 48, 54, 53, 50, 56, 59, 65, 66, 62,…
## $ Date <date> 2005-01-01, 2005-02-01, 2005-03-01, 20…
After dropping rows with missing values, a single observation is removed, leaving 174 monthly observations of the same 15 indicator variables (plus Date).
As the goal is to predict whether asx moves up or down, we add a column called ‘direction’, with value 1 for up and 0 for down. Throughout this report direction is the response variable: it indicates whether asx rose or fell relative to the previous month.
#--- Remove 1 row at top ---#
#asxt <- combi[-1,]
#asxt[nrow(asxt) + 1,] = asxt[nrow(asxt),]
#--- Duplicate row 1 at the top and drop the last row: asxt$asx becomes asx lagged by one month ---#
asxt <- rbind(combi[1,], combi)
asxt <- asxt[-nrow(asxt),]
combi$direction = combi$asx - asxt$asx
for (i in seq_along(combi$direction)) {
t = combi$direction[[i]]
if (t > 0) {
combi$direction[[i]] = 1
} else {
combi$direction[[i]] = 0
}
}
# Switch to factor direction for combi
combi$direction = as.factor(combi$direction)
# Switch to factor direction for combi zs
colnames(combizs)[colnames(combizs)=="binary_asx"] <- "direction"
combizs_nofactor = combizs
combizs$direction = as.factor(combizs$direction)
head(combi %>% select(asx, direction))
## # A tibble: 6 x 2
## asx direction
## <dbl> <fct>
## 1 4107 0
## 2 4173 1
## 3 4110 0
## 4 3983 0
## 5 4106 1
## 6 4278 1
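For reference, the same direction column could be built more concisely with dplyr::lag(); the sketch below is equivalent to the construction above (the first month has no previous value and is coded 0 to match) but is not used elsewhere in the report.
# Equivalent derivation: compare each month's asx with the previous month's value
combi_check <- combi %>%
  mutate(direction_alt = factor(if_else(asx > lag(asx), 1, 0, missing = 0)))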
Starting with histograms gives us an indication of how each variable is distributed.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
combi %>%
keep(is.numeric) %>%
multi.hist()
Dividend and direction are clearly skewed to one side, while asx, abs_imports and unemployment look roughly normally distributed; most of the remaining variables are closer to bi-modal. One small takeaway from this plot is that unemployment and abs_imports have distributions fairly similar to asx, and up months are noticeably more common than down months.
par(mfrow=c(1,8))
for(i in 1:8) {
boxplot(combi[,i], main=names(combi)[i])
}
par(mfrow=c(1,8))
for(i in 9:15) {
boxplot(combi[,i], main=names(combi)[i])
}
Distribution can also be examined with box plots. Here oecd_li, pe_ratio and dividend show a few outliers, while rba_cash_rate and iron are fairly spread out. How we should deal with these outliers is still an open question at this stage.
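To inform that question, one option is simply to count how many points each variable flags under the usual 1.5 × IQR rule (a sketch; no outlier treatment is applied in this report):
# Number of 1.5*IQR outliers per numeric variable
combi %>%
  keep(is.numeric) %>%
  summarise_all(~ sum(. < quantile(., 0.25) - 1.5 * IQR(.) |
                        . > quantile(., 0.75) + 1.5 * IQR(.)))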
Now, we plot the correlation between each pair of numeric variables to give an idea of which variables change together.
library(corrplot)
## corrplot 0.84 loaded
correlations <- cor(combi[,1:15])
corrplot(correlations, method="circle")
In the correlation matrix, blue represents positive correlation, red negative, and larger dots indicate stronger correlation. In this report we focus on the correlations between asx and the other variables: asx appears positively correlated with djia, pe_ratio, oecd_li, abs_imports and abs_exports, and negatively correlated with dividend, yearly_inflation, iron and exchange_rate.
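To quantify that ordering, the asx row of the correlation matrix can be sorted directly (a small sketch using the correlations object computed above):
# Correlation of every other numeric variable with asx, strongest positive first
sort(correlations["asx", colnames(correlations) != "asx"], decreasing = TRUE)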
Given these highly correlated variables, we put them into a scatter-plot matrix with direction as an indicator: blue for up and red for down.
group <- NA
group[combi$direction == 1] <- 1
group[combi$direction == 0] <- 2
pairs(combi%>%select(asx, djia, pe_ratio, oecd_li, abs_imports, abs_exports, direction), col=c("blue", "red")[group])
# dividend, yearly_inflation, iron, exchange_rate
The djia and pe_ratio panels hint at a positive relationship with asx, but the signal is not particularly strong.
Next, we look at the distribution of each variable broken down by direction. The plot below shows that the up and down distributions are quite similar for every variable. There are slight differences for iron, oil, quarterly_inflation and abs_exports, but nothing strongly predictive on its own; in general it would be hard to predict up versus down using only one or two variables.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Density of each predictor by direction, with free axes per panel
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=combi[,1:15], y=combi$direction, plot="density", scales=scales, auto.key=list(columns=3))
## Warning in draw.key(simpleKey(...), draw = FALSE): not enough rows for
## columns
Next, we explore how all of the variables trend alongside asx. We log-scale the y-axis so that series with very different magnitudes can share one plot without losing their individual trends.
combi_t <- combi[,c(1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16)] %>%
gather(key = "indicator", value = "value", -Date)
ggplot(combi_t, aes(x = Date, y = value)) +
geom_line(aes(color = indicator), size = 1) +
theme_minimal() +
scale_y_log10()
Even on the log scale it is hard to pick out relationships between the series; no variable appears to follow the same trend as asx.
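An alternative to the log scale is to index every series to 100 at the first month, so the trends are directly comparable; the sketch below shows this but is not used in the rest of the report.
# Rebase each indicator to 100 at January 2005 and re-plot the trends
combi_idx <- combi_t %>%
  group_by(indicator) %>%
  mutate(value_index = 100 * value / first(value)) %>%
  ungroup()
ggplot(combi_idx, aes(x = Date, y = value_index)) +
  geom_line(aes(color = indicator), size = 1) +
  theme_minimal()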
library(lattice)
library(munsell)
library(ggplot2)
library(caret)
#combi_ml <- combi[,c(-16, -1)]
combi_ml <- combizs[,-1]
# Run algorithms using 10-fold cross validation
# control <- trainControl(method="cv", number=10)
control <- trainControl(method = "cv",
number = 10,
search = "grid")
metric <- "Accuracy"
# a) linear algorithms
set.seed(100)
fit.lda <- train(direction~., data=combi_ml, method="lda", metric=metric, trControl=control)
# b) nonlinear algorithms
# CART
set.seed(100)
fit.cart <- train(direction~., data=combi_ml, method="rpart", metric=metric, trControl=control)
#
# kNN
set.seed(100)
fit.knn <- train(direction~., data=combi_ml, method="knn", metric=metric, trControl=control)
#
# c) advanced algorithms
# SVM
set.seed(100)
fit.svm <- train(direction~., data=combi_ml, method="svmRadial", metric=metric, trControl=control)
#
# Random Forest
set.seed(100)
fit.rf <- train(direction~., data=combi_ml, method="rf", metric=metric, trControl=control)
#
# Logistic Regression
set.seed(100)
fit.glm <- train(direction~., data=combi_ml, method="glm", metric=metric, trControl=control)
# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf, glm=fit.glm))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf, glm
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.5000000 0.6617647 0.7058824 0.7081699 0.7638889 0.8823529 0
## cart 0.5294118 0.5939542 0.6846405 0.6830065 0.7540850 0.8823529 0
## knn 0.4117647 0.6078431 0.6862745 0.6604575 0.7058824 0.8333333 0
## svm 0.4444444 0.6470588 0.6862745 0.6669935 0.7540850 0.8235294 0
## rf 0.4444444 0.6470588 0.7140523 0.6901961 0.7540850 0.8235294 0
## glm 0.5000000 0.6519608 0.7140523 0.7019608 0.7647059 0.8823529 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda -0.02531646 0.2913577 0.3783697 0.3910914 0.5200949 0.7462687 0
## cart -0.06250000 0.1242578 0.3345929 0.3341357 0.4805141 0.7571429 0
## knn -0.21428571 0.1410707 0.3294187 0.2812441 0.3795620 0.6582278 0
## svm -0.12500000 0.2120452 0.3380725 0.2953258 0.4769979 0.6277372 0
## rf -0.12500000 0.2388060 0.3833647 0.3462883 0.4769979 0.6433566 0
## glm -0.02531646 0.2848214 0.4049709 0.3851978 0.5088486 0.7462687 0
dotplot(results)
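caret can also summarise the pairwise differences between the resampled accuracies, which helps judge whether the gaps between models are meaningful (a short sketch using the results object above):
# Pairwise differences in resampled Accuracy and Kappa between the six models
model_diffs <- diff(results)
summary(model_diffs)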
Running logistic regression with a random train/test partition
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(e1071)
combi_ml <- combizs[,-1]
t <- createDataPartition(combi_ml$direction, p = 0.8, list=FALSE)
combi_train <- combi_ml[t,]
combi_test <- combi_ml[-t,]
# Logistic regression with train set
glm1 = glm(direction ~ ., family=binomial(logit), data = combi_train)
summary(glm1)
##
## Call:
## glm(formula = direction ~ ., family = binomial(logit), data = combi_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1880 -0.7887 0.2982 0.7460 2.2477
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.208811 0.227821 0.917 0.3594
## oecd_li -0.462097 0.362575 -1.274 0.2025
## abs_imports 0.343740 0.248223 1.385 0.1661
## abs_exports 0.135808 0.239707 0.567 0.5710
## gold_price_london_fixing 0.016457 0.257003 0.064 0.9489
## unemployment -0.383793 0.325225 -1.180 0.2380
## rba_cash_rate 0.398722 0.620285 0.643 0.5204
## yearly_inflation -0.346929 0.373206 -0.930 0.3526
## quarterly_inflation -0.549619 0.294286 -1.868 0.0618 .
## exchange_rate -0.173940 0.285152 -0.610 0.5419
## djia 1.767087 0.378839 4.664 3.09e-06 ***
## pe_ratio 0.119773 0.639629 0.187 0.8515
## dividend -0.676071 0.660572 -1.023 0.3061
## iron 0.039304 0.243980 0.161 0.8720
## oil 0.008865 0.262081 0.034 0.9730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.21 on 139 degrees of freedom
## Residual deviance: 130.72 on 125 degrees of freedom
## AIC: 160.72
##
## Number of Fisher Scoring iterations: 5
plot(glm1)
# predict on the test set
probability<-predict(glm1, newdata = combi_test, type="response")
plot(probability)
#setting a threshold value of 0.5 for positive...
#you may want to see if there are better settings that you
#could use here (Hint: do a search for "ROC curve")
prediction <- ifelse(probability > 0.5, 1, 0)
# building a contingency table of the counts at each combination of factor levels
confusion <- table(combi_test$direction, prediction)
confusion
## prediction
## 0 1
## 0 9 5
## 1 5 15
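As a quick check, the overall accuracy can be read straight off this contingency table, and the ROC hint above can be followed with the pROC package (assumed to be installed; not used elsewhere in this report):
# Overall accuracy from the contingency table above
sum(diag(confusion)) / sum(confusion)
# ROC curve and AUC for the fitted probabilities
library(pROC)
roc_obj <- roc(combi_test$direction, probability)
plot(roc_obj)
auc(roc_obj)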
Running the default k-fold cross-validation
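The code for this chunk is not echoed; the summary below is presumably just the printed caret object from the model comparison above, i.e. something like:
# Resampling summary of the 10-fold cross-validated logistic regression
print(fit.glm)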
## Generalized Linear Model
##
## 174 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 156, 157, 156, 157, 157, 157, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7019608 0.3851978
Running k-fold cross-validation manually
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
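The chunk that builds acc, fpr and fnr is also not echoed; the sketch below shows one way such a manual 10-fold loop could look (the fold assignment, the 0.5 threshold and the vector names are assumptions chosen to match the calls that follow):
# Manual 10-fold cross-validation of the logistic model
set.seed(100)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(combi_ml)))
acc <- fpr <- fnr <- numeric(k)
for (i in 1:k) {
  fit_i  <- glm(direction ~ ., family = binomial(logit), data = combi_ml[folds != i, ])
  prob_i <- predict(fit_i, newdata = combi_ml[folds == i, ], type = "response")
  pred_i <- factor(ifelse(prob_i > 0.5, 1, 0), levels = c(0, 1))
  truth  <- factor(combi_ml$direction[folds == i], levels = c(0, 1))
  tab    <- table(truth, pred_i)
  acc[i] <- sum(diag(tab)) / sum(tab)
  fpr[i] <- tab["0", "1"] / sum(tab["0", ])   # down months predicted as up
  fnr[i] <- tab["1", "0"] / sum(tab["1", ])   # up months predicted as down
}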
# Average accuracy of the model
mean(acc)
## [1] 0.6333333
# Histogram of accuracy
hist(acc,xlab='Accuracy',ylab='Freq',
col='cyan',border='blue',density=30)
# Boxplot of accuracy
boxplot(acc,col='cyan',border='blue',horizontal=T,xlab='Accuracy',
main='Accuracy CV')
# Confusion matrix and plots of fpr and fnr
mean(fpr)
## [1] 0.4116667
mean(fnr)
## [1] 0.3483333
hist(fpr,xlab='% of fpr',ylab='Freq',main='FPR',
col='cyan',border='blue',density=30)
hist(fnr,xlab='% of fnr',ylab='Freq',main='FNR',
col='cyan',border='blue',density=30)
# Overall misclassification rate per fold is 1 - accuracy
hist(1-acc,xlab='% of misclassification',ylab='Freq',main='Misclassification',
col='cyan',border='blue',density=30)
Step 1. Train the model with the default settings
Build the model with the default values:
library(randomForest)
library(caret)
library(e1071)
combi_ml <- combizs[,-1]
t <- createDataPartition(combi_ml$direction, p = 0.8, list=FALSE)
combi_train <- combi_ml[t,]
combi_test <- combi_ml[-t,]
# Define the control
trControl <- trainControl(method = "cv",
number = 10,
search = "grid")
set.seed(1234)
# Run the model
rf_default <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
trControl = trControl)
# Print the results
plot(rf_default)
The algorithm tested three different values of mtry: 2, 8 and 14. The optimum value is 14, with accuracy of about 0.64. The next step is to search for a better mtry.
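The tested mtry values and the selected one can also be read directly from the fitted caret object (the plot above shows the same information):
# Resampled accuracy for each mtry tried by the default grid, and the chosen value
print(rf_default)
rf_default$bestTune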
Step 2. Find a better mtry
Test the model with values of mtry from 1 to 14.
set.seed(100)
tuneGrid <- expand.grid(.mtry = c(1: 14))
rf_mtry <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300)
plot(rf_mtry)
best_mtry <- rf_mtry$bestTune$mtry
max(rf_mtry$results$Accuracy)
## [1] 0.65
We find that the optimum value of mtry is 10, with accuracy of about 0.65, so we go with mtry = 10.
Step 3. Search for the best maxnodes
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5: 20)) {
set.seed(100)
rf_maxnode <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 300)
current_iteration <- toString(maxnodes)
store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.4285714 0.5714286 0.6071429 0.6357143 0.7142857 0.8571429 0
## 6 0.5000000 0.5714286 0.6071429 0.6357143 0.6428571 0.9285714 0
## 7 0.5000000 0.5714286 0.6071429 0.6500000 0.7678571 0.8571429 0
## 8 0.4285714 0.5714286 0.5714286 0.6285714 0.7678571 0.7857143 0
## 9 0.4285714 0.5714286 0.6071429 0.6500000 0.7857143 0.8571429 0
## 10 0.5000000 0.5714286 0.6071429 0.6642857 0.7678571 0.9285714 0
## 11 0.4285714 0.5714286 0.5714286 0.6357143 0.7142857 0.9285714 0
## 12 0.5000000 0.5714286 0.6071429 0.6642857 0.7857143 0.9285714 0
## 13 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 14 0.5000000 0.5714286 0.6071429 0.6500000 0.7142857 0.9285714 0
## 15 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 16 0.5000000 0.5714286 0.5714286 0.6428571 0.7142857 0.9285714 0
## 17 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 18 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 19 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 20 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 -0.16666667 0.09646739 0.1911111 0.2439119 0.4370629 0.6956522 0
## 6 -0.04255319 0.14182692 0.2222222 0.2581137 0.2470449 0.8510638 0
## 7 -0.04255319 0.09646739 0.2238134 0.2840887 0.5153846 0.6956522 0
## 8 -0.16666667 0.08695652 0.1586538 0.2343539 0.5153846 0.5333333 0
## 9 -0.12000000 0.09646739 0.2238134 0.2847584 0.5333333 0.6956522 0
## 10 0.03921569 0.12500000 0.2072650 0.3169971 0.5153846 0.8510638 0
## 11 -0.16666667 0.08695652 0.1586538 0.2519241 0.4439799 0.8510638 0
## 12 -0.04255319 0.09646739 0.2072650 0.3089899 0.5333333 0.8510638 0
## 13 -0.04255319 0.09646739 0.2072650 0.2920649 0.5153846 0.8510638 0
## 14 -0.04255319 0.08695652 0.2193627 0.2764772 0.4439799 0.8510638 0
## 15 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 16 -0.04255319 0.08695652 0.1586538 0.2643354 0.4439799 0.8510638 0
## 17 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 18 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 19 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 20 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
Running maxnodes from 5 to 20, the optimum value is 13.
Step 4. Search for the best ntree
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
set.seed(100)
rf_maxtrees <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = 13,
ntree = ntree)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.5000000 0.5714286 0.6071429 0.6571429 0.7142857 0.9285714 0
## 300 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 350 0.5000000 0.5714286 0.6071429 0.6500000 0.7500000 0.9285714 0
## 400 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 450 0.5000000 0.5714286 0.6071429 0.6500000 0.7500000 0.9285714 0
## 500 0.5000000 0.5178571 0.6071429 0.6500000 0.7678571 0.9285714 0
## 550 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 600 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 800 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
## 1000 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
## 2000 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 -0.04255319 0.1250000000 0.207265 0.2946173 0.4439799 0.8510638 0
## 300 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 350 -0.04255319 0.0964673913 0.207265 0.2798733 0.4849057 0.8510638 0
## 400 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 450 -0.04255319 0.0964673913 0.207265 0.2798733 0.4849057 0.8510638 0
## 500 -0.08888889 -0.0006648936 0.207265 0.2744804 0.5153846 0.8510638 0
## 550 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 600 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 800 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
## 1000 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
## 2000 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
The best value of ntree is 300. So finally we have:
- mtry = 10
- maxnodes = 13
- ntree = 300
Predict with the tuned model
Now we evaluate the model on the held-out test set:
fit_rf <- train(direction~.,
combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300,
maxnodes = 13)
prediction <-predict(fit_rf, combi_test)
confusionMatrix(prediction, combi_test$direction)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10 2
## 1 4 18
##
## Accuracy : 0.8235
## 95% CI : (0.6547, 0.9324)
## No Information Rate : 0.5882
## P-Value [Acc > NIR] : 0.003193
##
## Kappa : 0.6277
##
## Mcnemar's Test P-Value : 0.683091
##
## Sensitivity : 0.7143
## Specificity : 0.9000
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8182
## Prevalence : 0.4118
## Detection Rate : 0.2941
## Detection Prevalence : 0.3529
## Balanced Accuracy : 0.8071
##
## 'Positive' Class : 0
##
We get an accuracy of 0.8235, which is higher than with the default settings.
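For comparison, the best resampled accuracy of the untuned random forest from Step 1 can be pulled from its results table (a quick sketch):
# Best cross-validated accuracy of the default model, for comparison
max(rf_default$results$Accuracy)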
Visualise the result
varimp_rf <- varImp(fit_rf)
plot(varimp_rf, main="Variable Importance (Random Forest)")