If any issues, questions or suggestions feel free to reach me out via e-mail or Linkedin. You can also visit my Github.

if(!require(pacman)) install.packages('pacman')
pacman::p_load(dplyr, tidyr, ggplot2, pROC, broom, gridExtra, ggpubr, car, stringr)

In this post we fit logistic regression model in order to predict increase or decrease of stock market indices based on technical analysis indicators. Our goal will be selection of the most informative technical indicators for making investment decisions for 1-day ahead. In the blog post Technical Indicators - Data Preparation we downloaded data, calculated technical indicators and spllited the data into train set and test set.

train_set <- dget('technical_indicators_train_set.txt')
train_set <- lapply(train_set, function(x) lapply(x, function(y) if(is.character(y)) as.factor(y) else y))
train_set <- lapply(train_set, as.data.frame)
test_set <- dget('technical_indicators_test_set.txt')
test_set <- lapply(test_set, function(x) lapply(x, function(y) if(is.character(y)) as.factor(y) else y))
test_set <- lapply(test_set, as.data.frame)

In the below loop we do the following steps for stock indices in the dataset:
- fit logistic regression model using all predictors
- apply two-directional stepwise algorithm to select model with minimum AIC
- predict probabilities on the test set
- if predicted probability is higher than \(0.5\) then we assume Increase (i.e. higher probability of increase of stock index), otherwise Decrease
- create missclasification table
- calculate accuracy, specificity and sensititivity
- calculate ROC.

tickers <- names(train_set)

logistic_models <- list()
logistic_predictions <- list()
misclass_tables <- list()
evaluation_metrics <- list()
evaluation_roc_train <- list()
evaluation_roc_test <- list()

for (i in 1:length(tickers)){
  initial_model <- glm(formula = Class_Returns ~., family = binomial, data = train_set[[i]])
  step_model <- step(initial_model, direction = 'both', trace = 0)
  logistic_models[[tickers[i]]] <- step_model
  
  prediction <- data.frame(Probability = predict(step_model, test_set[[i]], type = 'response')) %>%
    mutate(Class_Returns = if_else(Probability > 0.5, 'Increase', 'Deacrease') %>%
             factor(levels = c('Deacrease', 'Increase')))
  logistic_predictions[[tickers[i]]] <- prediction
  
  misclass_table <- table(Predicted = prediction$Class_Returns
                          ,Actual = test_set[[i]]$Class_Returns)
  misclass_tables[[tickers[i]]] <- misclass_table
  
  accuracy <- round(100*sum(diag(misclass_table)) / sum(misclass_table), 2)
  sensitivity <- round(100*misclass_table[2,2] / (misclass_table[2,2] + misclass_table[1,2]), 2)
  specificity <- round(100*misclass_table[1,1] / (misclass_table[1,1] + misclass_table[2,1]), 2)
  evaluation_metrics[[tickers[i]]] <- c(accuracy = accuracy
                                        ,sensitivity = sensitivity
                                        ,specificity = specificity)
  
  roc_train <- roc(train_set[[i]]$Class_Returns, step_model$fitted.values)
  evaluation_roc_train[[tickers[i]]] <- roc_train
  
  roc_test <- roc(test_set[[i]]$Class_Returns, prediction$Probability)
  evaluation_roc_test[[tickers[i]]] <- roc_test
}

The most informative variables in our study are Parkinson_Volatility which appeared in 4 models and position of EMA20 relative to close prices which appeared in 3 models,

lapply(seq_along(logistic_models)
       ,function(x) names(logistic_models[[x]]$coefficients[-1])) %>%
  unlist() %>%
  table() %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = Freq, y = reorder(., Freq)), stat = 'identity'
           ,fill = 'gray', color = 'black') +
  theme_bw() +
  labs(x = '', y = '', title = 'How often indicators appeared in logistic models?')

Overall accuracy in our models is about \(46 \%\), sensitivity about \(18 \%\) and specificity \(78 \%\) which means the logistic models perform well in predicting decreases of stock indices and perform poorly in predicting increases of stock indices.

as.data.frame(evaluation_metrics) %>%
  t() %>%
  as.data.frame() %>%
  pivot_longer(cols = 1:3, names_to = 'Metric', values_to = 'Value') %>%
  ggplot() +
  geom_boxplot(aes(x = Metric, y = Value, fill = Metric)) +
  facet_wrap(vars(Metric), scales = 'free') +
  theme_bw() +
  theme(legend.position = 'none', axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(x = '', y = '')

NA

In all cases AUC is about \(50 \%\) which indicates that logistic models wouldn’t outperform guessing by change if stock indices increase either decrease.

for (i in 1:7) {
  
  if (!(i %% 2 == 0)) par(mfrow = c(1,2))
  
  plot(x = 1 - evaluation_roc_test[[i]]$specificities
     ,y = evaluation_roc_test[[i]]$sensitivities
     ,type = 'l'
     ,lwd = 2
     ,col = 'orange'
     ,xlab = ''
     ,ylab = '')
par(new = TRUE)
plot(x = 1 - evaluation_roc_train[[i]]$specificities
     ,y = evaluation_roc_train[[i]]$sensitivities
     ,type = 'l'
     ,lwd = 2
     ,col = 'cyan'
     ,xlab = 'FPR'
     ,ylab = 'TPR'
     ,main = names(evaluation_roc_train)[[i]])
abline(a = 0
       ,b = 1)
legend('bottomright'
       ,legend = c(paste0('Train ROC, AUC = ', round(auc(evaluation_roc_train[[i]])[1], 4))
                   ,paste0('Test ROC, AUC = ', round(auc(evaluation_roc_test[[i]])[1], 4)))
       ,col = c('orange'
                ,'cyan')
       ,lty = 1
       ,lwd = 2)
  
}

NA

Let’s do some diagnostics. Below we plot standardized residuals from our models. Standardized residuals with abolute value freater tghan 3 would indicate influential outliers in the continuous predictors. This ain’t the case in our models.

Variance inflation factor (VIF) above 10 would indicate intercorrelation among predictors. This also ain’t the case in our models.

lapply(logistic_models, function(x) tryCatch(
  expr = {vif(x)}
  ,error = function(e){'No predictors in the model'}
))
$WIG20
             Stoch_K Parkinson_Volatility 
            1.001808             1.001808 

$mWIG40
                 RSI              Stoch_D Parkinson_Volatility 
            3.682298             3.679802             1.044789 

$sWIG80
               EMA15               EMA200                  RSI Parkinson_Volatility 
            3.323025             1.245302             3.456034             1.035835 

$`^SPX`
[1] "No predictors in the model"

$`^DAX`
    Close_Volatility Parkinson_Volatility 
            6.213605             6.213605 

$`^NDQ`
[1] "No predictors in the model"

$EEM.US
          EMA200              ADX          Stoch_K Close_Volatility 
        1.314077         1.012254         1.137232         1.178884 

In most cases only MACD Histogram is in linear relationship to the logit of the outcome in the traning set. In further research we will try polynomial transformation to technical indicators like Stoch, ADX, RSI.

for (i in 1:7) {
  
  if (!((logistic_models[[i]]$coefficients[-1] %>%
         names() %>%
         str_detect('EMA.+', negate = TRUE) %>%
         sum()) == 0)){
    
  continuous_predictors <- train_set[[i]] %>%
    select_if(is.numeric) %>%
    mutate(prob = predict(logistic_models[[i]], train_set[[i]], type = 'response')) %>%
    mutate(logit = log(prob / (1 - prob))) %>%
    select(-prob) %>%
    pivot_longer(ADX:Parkinson_Volatility, names_to = 'Indicator', values_to = 'Value')
  
  linearity_plots <- ggplot(continuous_predictors, aes(logit, Value)) +
    geom_point(size = 0.5) +
    geom_smooth(method = 'loess') + 
    facet_wrap(vars(Indicator), scales = 'free') +
    labs(title = names(train_set)[i])
  
  print(linearity_plots)
  
  }
  
}

---
title: "Technical Indicators - Logistic Regression"
output: html_notebook
---

If any issues, questions or suggestions feel free to reach me out via e-mail <wieczynskipawel@gmail.com> or [Linkedin](https://www.linkedin.com/in/pawel-wieczynski/). You can also visit my [Github](https://github.com/pawel-wieczynski).

```{r libraries, warning=FALSE, message=FALSE}
if(!require(pacman)) install.packages('pacman')
pacman::p_load(dplyr, tidyr, ggplot2, pROC, broom, gridExtra, ggpubr, car, stringr)
```

In this post we fit logistic regression model in order to predict increase or decrease of stock market indices based on technical analysis indicators. Our goal will be selection of the most informative technical indicators for making investment decisions for 1-day ahead. In the blog post [Technical Indicators - Data Preparation](https://rpubs.com/pawel-wieczynski/871393) we downloaded data, calculated technical indicators and spllited the data into train set and test set.

```{r load_data}
train_set <- dget('technical_indicators_train_set.txt')
train_set <- lapply(train_set, function(x) lapply(x, function(y) if(is.character(y)) as.factor(y) else y))
train_set <- lapply(train_set, as.data.frame)
test_set <- dget('technical_indicators_test_set.txt')
test_set <- lapply(test_set, function(x) lapply(x, function(y) if(is.character(y)) as.factor(y) else y))
test_set <- lapply(test_set, as.data.frame)
```

In the below loop we do the following steps for stock indices in the dataset: \
 - fit logistic regression model using all predictors \
 - apply two-directional stepwise algorithm to select model with minimum AIC \
 - predict probabilities on the test set \
 - if predicted probability is higher than $0.5$ then we assume *Increase* (i.e. higher probability of increase of stock index), otherwise *Decrease* \
 - create missclasification table \
 - calculate accuracy, specificity and sensititivity \
 - calculate ROC.
```{r build_models_predict_and_evaluate, warning=FALSE, message=FALSE}
tickers <- names(train_set)

logistic_models <- list()
logistic_predictions <- list()
misclass_tables <- list()
evaluation_metrics <- list()
evaluation_roc_train <- list()
evaluation_roc_test <- list()

for (i in 1:length(tickers)){
  initial_model <- glm(formula = Class_Returns ~., family = binomial, data = train_set[[i]])
  step_model <- step(initial_model, direction = 'both', trace = 0)
  logistic_models[[tickers[i]]] <- step_model
  
  prediction <- data.frame(Probability = predict(step_model, test_set[[i]], type = 'response')) %>%
    mutate(Class_Returns = if_else(Probability > 0.5, 'Increase', 'Deacrease') %>%
             factor(levels = c('Deacrease', 'Increase')))
  logistic_predictions[[tickers[i]]] <- prediction
  
  misclass_table <- table(Predicted = prediction$Class_Returns
                          ,Actual = test_set[[i]]$Class_Returns)
  misclass_tables[[tickers[i]]] <- misclass_table
  
  accuracy <- round(100*sum(diag(misclass_table)) / sum(misclass_table), 2)
  sensitivity <- round(100*misclass_table[2,2] / (misclass_table[2,2] + misclass_table[1,2]), 2)
  specificity <- round(100*misclass_table[1,1] / (misclass_table[1,1] + misclass_table[2,1]), 2)
  evaluation_metrics[[tickers[i]]] <- c(accuracy = accuracy
                                        ,sensitivity = sensitivity
                                        ,specificity = specificity)
  
  roc_train <- roc(train_set[[i]]$Class_Returns, step_model$fitted.values)
  evaluation_roc_train[[tickers[i]]] <- roc_train
  
  roc_test <- roc(test_set[[i]]$Class_Returns, prediction$Probability)
  evaluation_roc_test[[tickers[i]]] <- roc_test
}

```

The most informative variables in our study are *Parkinson_Volatility* which appeared in 4 models and position of *EMA20* relative to close prices which appeared in 3 models, 
```{r informative_variables}
lapply(seq_along(logistic_models)
       ,function(x) names(logistic_models[[x]]$coefficients[-1])) %>%
  unlist() %>%
  table() %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = Freq, y = reorder(., Freq)), stat = 'identity'
           ,fill = 'gray', color = 'black') +
  theme_bw() +
  labs(x = '', y = '', title = 'How often indicators appeared in logistic models?')
```

Overall accuracy in our models is about $46 \%$, sensitivity about $18 \%$ and specificity $78 \%$ which means the logistic models perform well in predicting decreases of stock indices and perform poorly in predicting increases of stock indices.
```{r metrics_visualization}
as.data.frame(evaluation_metrics) %>%
  t() %>%
  as.data.frame() %>%
  pivot_longer(cols = 1:3, names_to = 'Metric', values_to = 'Value') %>%
  ggplot() +
  geom_boxplot(aes(x = Metric, y = Value, fill = Metric)) +
  facet_wrap(vars(Metric), scales = 'free') +
  theme_bw() +
  theme(legend.position = 'none', axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(x = '', y = '')
  
```

In all cases AUC is about $50 \%$ which indicates that logistic models wouldn't outperform guessing by change if stock indices increase either decrease.
```{r roc, warning=FALSE, message=FALSE}
for (i in 1:7) {
  
  if (!(i %% 2 == 0)) par(mfrow = c(1,2))
  
  plot(x = 1 - evaluation_roc_test[[i]]$specificities
     ,y = evaluation_roc_test[[i]]$sensitivities
     ,type = 'l'
     ,lwd = 2
     ,col = 'orange'
     ,xlab = ''
     ,ylab = '')
par(new = TRUE)
plot(x = 1 - evaluation_roc_train[[i]]$specificities
     ,y = evaluation_roc_train[[i]]$sensitivities
     ,type = 'l'
     ,lwd = 2
     ,col = 'cyan'
     ,xlab = 'FPR'
     ,ylab = 'TPR'
     ,main = names(evaluation_roc_train)[[i]])
abline(a = 0
       ,b = 1)
legend('bottomright'
       ,legend = c(paste0('Train ROC, AUC = ', round(auc(evaluation_roc_train[[i]])[1], 4))
                   ,paste0('Test ROC, AUC = ', round(auc(evaluation_roc_test[[i]])[1], 4)))
       ,col = c('orange'
                ,'cyan')
       ,lty = 1
       ,lwd = 2)
  
}

```

Let's do some diagnostics. Below we plot standardized residuals from our models. Standardized residuals with abolute value freater tghan 3 would indicate influential outliers in the continuous predictors. This ain't the case in our models.
```{r diagnostics1, message=FALSE}
data_with_model <- lapply(logistic_models, function(x) augment(x) %>%
                            mutate(index = 1:n()))

outlier_plots <- list()
for (i in 1:7) {
  outlier_plots[[i]] <- ggplot(data_with_model[[i]], aes(x = index, y = .std.resid)) +
    geom_point(aes(color = Class_Returns)) +
    theme(axis.text.x = element_blank()
          ,axis.title.x = element_blank()
          ,axis.ticks.x = element_blank()
          ,axis.title.y = element_blank()) +
    labs(title = names(data_with_model)[[i]])
}

p <- ggarrange(plotlist = outlier_plots, ncol = 4, nrow = 2, common.legend = TRUE)
p
```

Variance inflation factor (VIF) above 10 would indicate intercorrelation among predictors. This also ain't the case in our models.
```{r vif}
lapply(logistic_models, function(x) tryCatch(
  expr = {vif(x)}
  ,error = function(e){'No predictors in the model'}
))

```

In most cases only MACD Histogram is in linear relationship to the logit of the outcome in the traning set. In further research we will try polynomial transformation to technical indicators like Stoch, ADX, RSI.
```{r linearity, message=FALSE, warning=FALSE}
for (i in 1:7) {
  
  if (!((logistic_models[[i]]$coefficients[-1] %>%
         names() %>%
         str_detect('EMA.+', negate = TRUE) %>%
         sum()) == 0)){
    
  continuous_predictors <- train_set[[i]] %>%
    select_if(is.numeric) %>%
    mutate(prob = predict(logistic_models[[i]], train_set[[i]], type = 'response')) %>%
    mutate(logit = log(prob / (1 - prob))) %>%
    select(-prob) %>%
    pivot_longer(ADX:Parkinson_Volatility, names_to = 'Indicator', values_to = 'Value')
  
  linearity_plots <- ggplot(continuous_predictors, aes(logit, Value)) +
    geom_point(size = 0.5) +
    geom_smooth(method = 'loess') + 
    facet_wrap(vars(Indicator), scales = 'free') +
    labs(title = names(train_set)[i])
  
  print(linearity_plots)
  
  }
  
}
```