knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
use_python("/anaconda3/bin/python3.7")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.switch_backend('agg')
#source("vinc_gather_data.R")
# data = gatherData()
# print(data)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
combi <- read_csv("data-clean/final_combi.csv")
## Parsed with column specification:
## cols(
## asx = col_double(),
## oecd_li = col_double(),
## abs_imports = col_double(),
## abs_exports = col_double(),
## gold_price_london_fixing = col_double(),
## unemployment = col_double(),
## rba_cash_rate = col_double(),
## yearly_inflation = col_double(),
## quarterly_inflation = col_double(),
## exchange_rate = col_double(),
## djia = col_double(),
## pe_ratio = col_double(),
## dividend = col_double(),
## iron = col_double(),
## oil = col_double(),
## Date = col_date(format = "")
## )
combizs <- read_csv("data-clean/final_combi_zs.csv")
## Parsed with column specification:
## cols(
## asx = col_double(),
## oecd_li = col_double(),
## abs_imports = col_double(),
## abs_exports = col_double(),
## gold_price_london_fixing = col_double(),
## unemployment = col_double(),
## rba_cash_rate = col_double(),
## yearly_inflation = col_double(),
## quarterly_inflation = col_double(),
## exchange_rate = col_double(),
## djia = col_double(),
## pe_ratio = col_double(),
## dividend = col_double(),
## iron = col_double(),
## oil = col_double(),
## binary_asx = col_double()
## )
# Format date for combi
combi$Date = as.Date(combi$Date, "%Y-%m-%d")
glimpse(combi)
## Observations: 175
## Variables: 16
## $ asx <dbl> 4107, 4173, 4110, 3983, 4106, 4278, 438…
## $ oecd_li <dbl> 100.21160, 100.14570, 100.06870, 100.01…
## $ abs_imports <dbl> 11154780, 11123461, 12699350, 12569908,…
## $ abs_exports <dbl> 9232638, 9503409, 10451659, 11566650, 1…
## $ gold_price_london_fixing <dbl> 438.00, 423.80, 436.55, 427.50, 433.20,…
## $ unemployment <dbl> 5.056507, 5.660385, 5.796897, 5.503269,…
## $ rba_cash_rate <dbl> 5.25, 5.25, 5.50, 5.50, 5.50, 5.50, 5.5…
## $ yearly_inflation <dbl> 2.5, 2.5, 2.5, 2.4, 2.4, 2.4, 2.5, 2.5,…
## $ quarterly_inflation <dbl> 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.6, 0.6,…
## $ exchange_rate <dbl> 0.7744, 0.7905, 0.7719, 0.7811, 0.7557,…
## $ djia <dbl> 10490, 10766, 10504, 10193, 10467, 1027…
## $ pe_ratio <dbl> 17.0, 15.1, 14.9, 14.3, 14.3, 15.2, 15.…
## $ dividend <dbl> 3.6, 3.8, 3.9, 4.0, 4.0, 3.8, 3.7, 3.4,…
## $ iron <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,…
## $ oil <dbl> 47, 48, 54, 53, 50, 56, 59, 65, 66, 62,…
## $ Date <date> 2005-01-01, 2005-02-01, 2005-03-01, 20…
# Add date to combi zs
date_seq = seq(as.Date("2005-01-01"), by = "1 month", length.out = nrow(combizs))
combizs_d = combizs
combizs_d$Date = date_seq
After munging and merging the data, we have 15 indicator variables (plus a Date column) and 175 observations, one per month from January 2005 through to 2019.
Missing data can have a large impact on modelling, so we recheck for missing values using a missingness map, with the variables on the y-axis and the observations on the x-axis. In the plot below missing cells would appear in red; almost none do, so there is essentially no missing data in this dataset.
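As a numeric cross-check of the map, missing values can also be counted per column with dplyr (a minimal sketch):
# Count NA values in each column of the merged data
combi %>% summarise_all(~ sum(is.na(.)))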
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
missmap(combi, col=c("red", "lightgreen"), legend=FALSE)
## Warning in if (class(obj) == "amelia") {: the condition has length > 1 and
## only the first element will be used
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.
# library(nycflights13)
# missmap(flights, col=c("blue", "lightgreen"), legend=FALSE)
# combi %>% summarise_all(funs(sum(is.na(.))))
# combi %>%
# rowid_to_column() %>%
# filter(is.na(abs_imports))
combi <- combi %>% drop_na()
glimpse(combi)
## Observations: 174
## Variables: 16
## $ asx <dbl> 4107, 4173, 4110, 3983, 4106, 4278, 438…
## $ oecd_li <dbl> 100.21160, 100.14570, 100.06870, 100.01…
## $ abs_imports <dbl> 11154780, 11123461, 12699350, 12569908,…
## $ abs_exports <dbl> 9232638, 9503409, 10451659, 11566650, 1…
## $ gold_price_london_fixing <dbl> 438.00, 423.80, 436.55, 427.50, 433.20,…
## $ unemployment <dbl> 5.056507, 5.660385, 5.796897, 5.503269,…
## $ rba_cash_rate <dbl> 5.25, 5.25, 5.50, 5.50, 5.50, 5.50, 5.5…
## $ yearly_inflation <dbl> 2.5, 2.5, 2.5, 2.4, 2.4, 2.4, 2.5, 2.5,…
## $ quarterly_inflation <dbl> 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.6, 0.6,…
## $ exchange_rate <dbl> 0.7744, 0.7905, 0.7719, 0.7811, 0.7557,…
## $ djia <dbl> 10490, 10766, 10504, 10193, 10467, 1027…
## $ pe_ratio <dbl> 17.0, 15.1, 14.9, 14.3, 14.3, 15.2, 15.…
## $ dividend <dbl> 3.6, 3.8, 3.9, 4.0, 4.0, 3.8, 3.7, 3.4,…
## $ iron <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,…
## $ oil <dbl> 47, 48, 54, 53, 50, 56, 59, 65, 66, 62,…
## $ Date <date> 2005-01-01, 2005-02-01, 2005-03-01, 20…
After dropping rows with missing values, a single observation is removed, leaving 174 monthly observations of the same 15 indicator variables (plus Date).
As the goal is to predict whether asx moves up or down, we add a column called ‘direction’, with value 1 for up and 0 for down. Throughout this report direction is the response variable: it indicates whether asx rose or fell relative to the previous month.
#--- Remove 1 row at top ---#
#asxt <- combi[-1,]
#asxt[nrow(asxt) + 1,] = asxt[nrow(asxt),]
#--- Duplicate row 1 at the top and drop the last row: asxt$asx becomes asx lagged by one month ---#
asxt <- rbind(combi[1,], combi)
asxt <- asxt[-nrow(asxt),]
combi$direction = combi$asx - asxt$asx
for (i in seq_along(combi$direction)) {
t = combi$direction[[i]]
if (t > 0) {
combi$direction[[i]] = 1
} else {
combi$direction[[i]] = 0
}
}
# Switch to factor direction for combi
combi$direction = as.factor(combi$direction)
# Switch to factor direction for combi zs
colnames(combizs)[colnames(combizs)=="binary_asx"] <- "direction"
combizs_nofactor = combizs
combizs$direction = as.factor(combizs$direction)
head(combi %>% select(asx, direction))
## # A tibble: 6 x 2
## asx direction
## <dbl> <fct>
## 1 4107 0
## 2 4173 1
## 3 4110 0
## 4 3983 0
## 5 4106 1
## 6 4278 1
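For reference, the same direction column could be built more concisely with dplyr::lag(); the sketch below is equivalent to the construction above (the first month has no previous value and is coded 0 to match) but is not used elsewhere in the report.
# Equivalent derivation: compare each month's asx with the previous month's value
combi_check <- combi %>%
  mutate(direction_alt = factor(if_else(asx > lag(asx), 1, 0, missing = 0)))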
Starting with histograms gives us an indication of how each variable is distributed.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
combi %>%
keep(is.numeric) %>%
multi.hist()
Dividend and direction are clearly skewed to one side, while asx, abs_imports and unemployment look roughly normally distributed; most of the remaining variables are closer to bi-modal. One small takeaway from this plot is that unemployment and abs_imports have distributions fairly similar to asx, and up months are noticeably more common than down months.
par(mfrow=c(1,8))
for(i in 1:8) {
boxplot(combi[,i], main=names(combi)[i])
}
par(mfrow=c(1,8))
for(i in 9:15) {
boxplot(combi[,i], main=names(combi)[i])
}
Distribution can also be examined with box plots. Here oecd_li, pe_ratio and dividend show a few outliers, while rba_cash_rate and iron are fairly spread out. How we should deal with these outliers is still an open question at this stage.
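To inform that question, one option is simply to count how many points each variable flags under the usual 1.5 × IQR rule (a sketch; no outlier treatment is applied in this report):
# Number of 1.5*IQR outliers per numeric variable
combi %>%
  keep(is.numeric) %>%
  summarise_all(~ sum(. < quantile(., 0.25) - 1.5 * IQR(.) |
                        . > quantile(., 0.75) + 1.5 * IQR(.)))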
Now, we plot the correlation between each pair of numeric variables to give an idea of which variables change together.
library(corrplot)
## corrplot 0.84 loaded
correlations <- cor(combi[,1:15])
corrplot(correlations, method="circle")
In the correlation matrix, blue represents positive correlation, red negative, and larger dots indicate stronger correlation. In this report we focus on the correlations between asx and the other variables: asx appears positively correlated with djia, pe_ratio, oecd_li, abs_imports and abs_exports, and negatively correlated with dividend, yearly_inflation, iron and exchange_rate.
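To quantify that ordering, the asx row of the correlation matrix can be sorted directly (a small sketch using the correlations object computed above):
# Correlation of every other numeric variable with asx, strongest positive first
sort(correlations["asx", colnames(correlations) != "asx"], decreasing = TRUE)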
Given these highly correlated variables, we put them into a scatter-plot matrix with direction as an indicator: blue for up and red for down.
group <- NA
group[combi$direction == 1] <- 1
group[combi$direction == 0] <- 2
pairs(combi%>%select(asx, djia, pe_ratio, oecd_li, abs_imports, abs_exports, direction), col=c("blue", "red")[group])
# dividend, yearly_inflation, iron, exchange_rate
The djia and pe_ratio panels hint at a positive relationship with asx, but the signal is not particularly strong.
Next, we look at the distribution of each variable broken down by direction. The plot below shows that the up and down distributions are quite similar for every variable. There are slight differences for iron, oil, quarterly_inflation and abs_exports, but nothing strongly predictive on its own; in general it would be hard to predict up versus down using only one or two variables.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Density of each predictor by direction, with free axes per panel
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=combi[,1:15], y=combi$direction, plot="density", scales=scales, auto.key=list(columns=3))
## Warning in draw.key(simpleKey(...), draw = FALSE): not enough rows for
## columns
Next, we explore how all of the variables trend alongside asx. We log-scale the y-axis so that series with very different magnitudes can share one plot without losing their individual trends.
combi_t <- combi[,c(1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16)] %>%
gather(key = "indicator", value = "value", -Date)
ggplot(combi_t, aes(x = Date, y = value)) +
geom_line(aes(color = indicator), size = 1) +
theme_minimal() +
scale_y_log10()
Even on the log scale it is hard to pick out relationships between the series; no variable appears to follow the same trend as asx.
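An alternative to the log scale is to index every series to 100 at the first month, so the trends are directly comparable; the sketch below shows this but is not used in the rest of the report.
# Rebase each indicator to 100 at January 2005 and re-plot the trends
combi_idx <- combi_t %>%
  group_by(indicator) %>%
  mutate(value_index = 100 * value / first(value)) %>%
  ungroup()
ggplot(combi_idx, aes(x = Date, y = value_index)) +
  geom_line(aes(color = indicator), size = 1) +
  theme_minimal()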
library(lattice)
library(munsell)
library(ggplot2)
library(caret)
#combi_ml <- combi[,c(-16, -1)]
combi_ml <- combizs[,-1]
# Run algorithms using 10-fold cross validation
# control <- trainControl(method="cv", number=10)
control <- trainControl(method = "cv",
number = 10,
search = "grid")
metric <- "Accuracy"
# a) linear algorithms
set.seed(100)
fit.lda <- train(direction~., data=combi_ml, method="lda", metric=metric, trControl=control)
# b) nonlinear algorithms
# CART
set.seed(100)
fit.cart <- train(direction~., data=combi_ml, method="rpart", metric=metric, trControl=control)
#
# kNN
set.seed(100)
fit.knn <- train(direction~., data=combi_ml, method="knn", metric=metric, trControl=control)
#
# c) advanced algorithms
# SVM
set.seed(100)
fit.svm <- train(direction~., data=combi_ml, method="svmRadial", metric=metric, trControl=control)
#
# Random Forest
set.seed(100)
fit.rf <- train(direction~., data=combi_ml, method="rf", metric=metric, trControl=control)
#
# Logistic Regression
set.seed(100)
fit.glm <- train(direction~., data=combi_ml, method="glm", metric=metric, trControl=control)
# summarize accuracy of models
results <- resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf, glm=fit.glm))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, svm, rf, glm
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.5000000 0.6617647 0.7058824 0.7081699 0.7638889 0.8823529 0
## cart 0.5294118 0.5939542 0.6846405 0.6830065 0.7540850 0.8823529 0
## knn 0.4117647 0.6078431 0.6862745 0.6604575 0.7058824 0.8333333 0
## svm 0.4444444 0.6470588 0.6862745 0.6669935 0.7540850 0.8235294 0
## rf 0.4444444 0.6470588 0.7140523 0.6901961 0.7540850 0.8235294 0
## glm 0.5000000 0.6519608 0.7140523 0.7019608 0.7647059 0.8823529 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda -0.02531646 0.2913577 0.3783697 0.3910914 0.5200949 0.7462687 0
## cart -0.06250000 0.1242578 0.3345929 0.3341357 0.4805141 0.7571429 0
## knn -0.21428571 0.1410707 0.3294187 0.2812441 0.3795620 0.6582278 0
## svm -0.12500000 0.2120452 0.3380725 0.2953258 0.4769979 0.6277372 0
## rf -0.12500000 0.2388060 0.3833647 0.3462883 0.4769979 0.6433566 0
## glm -0.02531646 0.2848214 0.4049709 0.3851978 0.5088486 0.7462687 0
dotplot(results)
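caret can also summarise the pairwise differences between the resampled accuracies, which helps judge whether the gaps between models are meaningful (a short sketch using the results object above):
# Pairwise differences in resampled Accuracy and Kappa between the six models
model_diffs <- diff(results)
summary(model_diffs)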
Running logistic regression with a random train/test partition
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(e1071)
combi_ml <- combizs[,-1]
t <- createDataPartition(combi_ml$direction, p = 0.8, list=FALSE)
combi_train <- combi_ml[t,]
combi_test <- combi_ml[-t,]
# Logistic regression with train set
glm1 = glm(direction ~ ., family=binomial(logit), data = combi_train)
summary(glm1)
##
## Call:
## glm(formula = direction ~ ., family = binomial(logit), data = combi_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1880 -0.7887 0.2982 0.7460 2.2477
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.208811 0.227821 0.917 0.3594
## oecd_li -0.462097 0.362575 -1.274 0.2025
## abs_imports 0.343740 0.248223 1.385 0.1661
## abs_exports 0.135808 0.239707 0.567 0.5710
## gold_price_london_fixing 0.016457 0.257003 0.064 0.9489
## unemployment -0.383793 0.325225 -1.180 0.2380
## rba_cash_rate 0.398722 0.620285 0.643 0.5204
## yearly_inflation -0.346929 0.373206 -0.930 0.3526
## quarterly_inflation -0.549619 0.294286 -1.868 0.0618 .
## exchange_rate -0.173940 0.285152 -0.610 0.5419
## djia 1.767087 0.378839 4.664 3.09e-06 ***
## pe_ratio 0.119773 0.639629 0.187 0.8515
## dividend -0.676071 0.660572 -1.023 0.3061
## iron 0.039304 0.243980 0.161 0.8720
## oil 0.008865 0.262081 0.034 0.9730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 191.21 on 139 degrees of freedom
## Residual deviance: 130.72 on 125 degrees of freedom
## AIC: 160.72
##
## Number of Fisher Scoring iterations: 5
plot(glm1)
# predict on the test set
probability<-predict(glm1, newdata = combi_test, type="response")
plot(probability)
#setting a threshold value of 0.5 for positive...
#you may want to see if there are better settings that you
#could use here (Hint: do a search for "ROC curve")
prediction <- ifelse(probability > 0.5, 1, 0)
# building a contingency table of the counts at each combination of factor levels
confusion <- table(combi_test$direction, prediction)
confusion
## prediction
## 0 1
## 0 9 5
## 1 5 15
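As a quick check, the overall accuracy can be read straight off this contingency table, and the ROC hint above can be followed with the pROC package (assumed to be installed; not used elsewhere in this report):
# Overall accuracy from the contingency table above
sum(diag(confusion)) / sum(confusion)
# ROC curve and AUC for the fitted probabilities
library(pROC)
roc_obj <- roc(combi_test$direction, probability)
plot(roc_obj)
auc(roc_obj)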
Running the default k-fold cross-validation
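The code for this chunk is not echoed; the summary below is presumably just the printed caret object from the model comparison above, i.e. something like:
# Resampling summary of the 10-fold cross-validated logistic regression
print(fit.glm)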
## Generalized Linear Model
##
## 174 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 156, 157, 156, 157, 157, 157, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7019608 0.3851978
Running k-fold cross-validation manually
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
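The chunk that builds acc, fpr and fnr is also not echoed; the sketch below shows one way such a manual 10-fold loop could look (the fold assignment, the 0.5 threshold and the vector names are assumptions chosen to match the calls that follow):
# Manual 10-fold cross-validation of the logistic model
set.seed(100)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(combi_ml)))
acc <- fpr <- fnr <- numeric(k)
for (i in 1:k) {
  fit_i  <- glm(direction ~ ., family = binomial(logit), data = combi_ml[folds != i, ])
  prob_i <- predict(fit_i, newdata = combi_ml[folds == i, ], type = "response")
  pred_i <- factor(ifelse(prob_i > 0.5, 1, 0), levels = c(0, 1))
  truth  <- factor(combi_ml$direction[folds == i], levels = c(0, 1))
  tab    <- table(truth, pred_i)
  acc[i] <- sum(diag(tab)) / sum(tab)
  fpr[i] <- tab["0", "1"] / sum(tab["0", ])   # down months predicted as up
  fnr[i] <- tab["1", "0"] / sum(tab["1", ])   # up months predicted as down
}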
# Average accuracy of the model
mean(acc)
## [1] 0.6333333
# Histogram of accuracy
hist(acc,xlab='Accuracy',ylab='Freq',
col='cyan',border='blue',density=30)
# Boxplot of accuracy
boxplot(acc,col='cyan',border='blue',horizontal=T,xlab='Accuracy',
main='Accuracy CV')
# Confusion matrix and plots of fpr and fnr
mean(fpr)
## [1] 0.4116667
mean(fnr)
## [1] 0.3483333
hist(fpr,xlab='% of fpr',ylab='Freq',main='FPR',
col='cyan',border='blue',density=30)
hist(fnr,xlab='% of fnr',ylab='Freq',main='FNR',
col='cyan',border='blue',density=30)
# Overall misclassification rate per fold is 1 - accuracy
hist(1-acc,xlab='% of misclassification',ylab='Freq',main='Misclassification',
col='cyan',border='blue',density=30)
Step 1. Train the model with the default settings
Build the model with the default values:
library(randomForest)
library(caret)
library(e1071)
combi_ml <- combizs[,-1]
t <- createDataPartition(combi_ml$direction, p = 0.8, list=FALSE)
combi_train <- combi_ml[t,]
combi_test <- combi_ml[-t,]
# Define the control
trControl <- trainControl(method = "cv",
number = 10,
search = "grid")
set.seed(1234)
# Run the model
rf_default <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
trControl = trControl)
# Print the results
plot(rf_default)
The algorithm tested three different values of mtry: 2, 8 and 14. The optimum value is 14, with accuracy of about 0.64. The next step is to search for a better mtry.
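The tested mtry values and the selected one can also be read directly from the fitted caret object (the plot above shows the same information):
# Resampled accuracy for each mtry tried by the default grid, and the chosen value
print(rf_default)
rf_default$bestTune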
Step 2. Find a better mtry
Test the model with values of mtry from 1 to 14.
set.seed(100)
tuneGrid <- expand.grid(.mtry = c(1: 14))
rf_mtry <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300)
plot(rf_mtry)
best_mtry <- rf_mtry$bestTune$mtry
max(rf_mtry$results$Accuracy)
## [1] 0.65
We find that the optimum value of mtry is 10, with accuracy of about 0.65, so we go with mtry = 10.
Step 3. Search for the best maxnodes
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5: 20)) {
set.seed(100)
rf_maxnode <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 300)
current_iteration <- toString(maxnodes)
store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.4285714 0.5714286 0.6071429 0.6357143 0.7142857 0.8571429 0
## 6 0.5000000 0.5714286 0.6071429 0.6357143 0.6428571 0.9285714 0
## 7 0.5000000 0.5714286 0.6071429 0.6500000 0.7678571 0.8571429 0
## 8 0.4285714 0.5714286 0.5714286 0.6285714 0.7678571 0.7857143 0
## 9 0.4285714 0.5714286 0.6071429 0.6500000 0.7857143 0.8571429 0
## 10 0.5000000 0.5714286 0.6071429 0.6642857 0.7678571 0.9285714 0
## 11 0.4285714 0.5714286 0.5714286 0.6357143 0.7142857 0.9285714 0
## 12 0.5000000 0.5714286 0.6071429 0.6642857 0.7857143 0.9285714 0
## 13 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 14 0.5000000 0.5714286 0.6071429 0.6500000 0.7142857 0.9285714 0
## 15 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 16 0.5000000 0.5714286 0.5714286 0.6428571 0.7142857 0.9285714 0
## 17 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 18 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 19 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
## 20 0.5000000 0.5178571 0.5714286 0.6357143 0.7142857 0.9285714 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 -0.16666667 0.09646739 0.1911111 0.2439119 0.4370629 0.6956522 0
## 6 -0.04255319 0.14182692 0.2222222 0.2581137 0.2470449 0.8510638 0
## 7 -0.04255319 0.09646739 0.2238134 0.2840887 0.5153846 0.6956522 0
## 8 -0.16666667 0.08695652 0.1586538 0.2343539 0.5153846 0.5333333 0
## 9 -0.12000000 0.09646739 0.2238134 0.2847584 0.5333333 0.6956522 0
## 10 0.03921569 0.12500000 0.2072650 0.3169971 0.5153846 0.8510638 0
## 11 -0.16666667 0.08695652 0.1586538 0.2519241 0.4439799 0.8510638 0
## 12 -0.04255319 0.09646739 0.2072650 0.3089899 0.5333333 0.8510638 0
## 13 -0.04255319 0.09646739 0.2072650 0.2920649 0.5153846 0.8510638 0
## 14 -0.04255319 0.08695652 0.2193627 0.2764772 0.4439799 0.8510638 0
## 15 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 16 -0.04255319 0.08695652 0.1586538 0.2643354 0.4439799 0.8510638 0
## 17 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 18 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 19 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
## 20 -0.08888889 -0.01017576 0.1586538 0.2467509 0.4439799 0.8510638 0
Running maxnodes from 5 to 20, the optimum value is 13.
Step 4. Search for the best ntree
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
set.seed(100)
rf_maxtrees <- train(direction~.,
data = combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = 13,
ntree = ntree)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.5000000 0.5714286 0.6071429 0.6571429 0.7142857 0.9285714 0
## 300 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 350 0.5000000 0.5714286 0.6071429 0.6500000 0.7500000 0.9285714 0
## 400 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 450 0.5000000 0.5714286 0.6071429 0.6500000 0.7500000 0.9285714 0
## 500 0.5000000 0.5178571 0.6071429 0.6500000 0.7678571 0.9285714 0
## 550 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 600 0.5000000 0.5714286 0.6071429 0.6571429 0.7678571 0.9285714 0
## 800 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
## 1000 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
## 2000 0.4285714 0.5714286 0.6071429 0.6428571 0.7142857 0.9285714 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 -0.04255319 0.1250000000 0.207265 0.2946173 0.4439799 0.8510638 0
## 300 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 350 -0.04255319 0.0964673913 0.207265 0.2798733 0.4849057 0.8510638 0
## 400 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 450 -0.04255319 0.0964673913 0.207265 0.2798733 0.4849057 0.8510638 0
## 500 -0.08888889 -0.0006648936 0.207265 0.2744804 0.5153846 0.8510638 0
## 550 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 600 -0.04255319 0.0964673913 0.207265 0.2920649 0.5153846 0.8510638 0
## 800 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
## 1000 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
## 2000 -0.16666667 0.0964673913 0.207265 0.2654507 0.4439799 0.8510638 0
The best value of ntree is 300. So finally we have:
- mtry = 10
- maxnodes = 13
- ntree = 300
Predict with the tuned model
Now we evaluate the model on the held-out test set:
fit_rf <- train(direction~.,
combi_train,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300,
maxnodes = 13)
prediction <-predict(fit_rf, combi_test)
confusionMatrix(prediction, combi_test$direction)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10 2
## 1 4 18
##
## Accuracy : 0.8235
## 95% CI : (0.6547, 0.9324)
## No Information Rate : 0.5882
## P-Value [Acc > NIR] : 0.003193
##
## Kappa : 0.6277
##
## Mcnemar's Test P-Value : 0.683091
##
## Sensitivity : 0.7143
## Specificity : 0.9000
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8182
## Prevalence : 0.4118
## Detection Rate : 0.2941
## Detection Prevalence : 0.3529
## Balanced Accuracy : 0.8071
##
## 'Positive' Class : 0
##
We get an accuracy of 0.8235, which is higher than with the default settings.
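For comparison, the best resampled accuracy of the untuned random forest from Step 1 can be pulled from its results table (a quick sketch):
# Best cross-validated accuracy of the default model, for comparison
max(rf_default$results$Accuracy)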
Visualise the result
varimp_rf <- varImp(fit_rf)
plot(varimp_rf, main="Variable Importance (Random Forest)")