requesting data from yfinance

import yfinance as yf
# download daily price action for SPY over the maximum available period, then save to the working directory
spy_max = yf.download(tickers='SPY',period='max',interval = '1d')
## 
[*********************100%***********************]  1 of 1 completed
spy_max.to_csv('spy_max.csv')

# hourly data over a 2-year period (pulled 5/11/2021)
spy_1h = yf.download(tickers='SPY',period='2y',interval = '1h')
## 
[*********************100%***********************]  1 of 1 completed
spy_1h.to_csv('spy_1h.csv')

loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5

functions used

# computes a trailing moving average of a vector over a given window (times);
# the first 'times' entries are padded with 0
moving_avg = function(arr,times=200){
  len = length(arr)
  avgs = c(rep(0,times))
  for (i in (times+1):len){
    avgs = c(avgs,mean(arr[(i-times):(i-1)]))
  }
  return(avgs)
}
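
Side note: growing avgs with c() inside the loop is quadratic in the length of the input. A vectorized sketch using cumsum that follows the same trailing-window convention (moving_avg_fast is just an illustrative name, and it assumes no NAs in arr):

# vectorized trailing moving average; pads the first 'times' entries with 0,
# matching moving_avg above
moving_avg_fast = function(arr,times=200){
  cs = c(0,cumsum(arr))
  idx = (times+1):length(arr)
  return(c(rep(0,times),(cs[idx]-cs[idx-times])/times))
}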

# counts the number of rows for each unique value of a column
number_instances = function(dataframe,instance){
  dataframe%>%
    group_by_at(instance)%>%
    summarize(count=n())
}

# labels each row within a date group h1, h2, ... (one label per hour of the day)
list_of_instance = function(dataframe,instance){
  instances = number_instances(dataframe,instance)
  hour_label = c()
  for (count in instances$count){
    for (i in seq_len(count)){
      hour_label = c(hour_label,paste0("h",i))
    }
  }
  return(hour_label)
}

# normalizes a vector to z-scores (subtract the mean, divide by the standard deviation)
normalize = function(vector){
  avg = mean(vector)
  std = sd(vector)
  return((vector-avg)/std)
}

# converts a numeric vector to binary (1/0) based on a set threshold
to_binary = function(vector,thresh){
  return(as.integer(vector > thresh))
}
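
A couple of quick sanity checks for the helpers above (output not shown):

# expect: 0 0 1 -- values must be strictly greater than the threshold
to_binary(c(-0.5,0,0.2),0)
# expect: -1 0 1 -- z-scores of an evenly spaced vector
round(normalize(c(1,2,3)),2)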

loading data

# load the downloaded csv file from the working directory into R
spy.max = "spy_max.csv"
daily.spy= read.csv(spy.max)
head(daily.spy)
##         Date     Open     High      Low    Close Adj.Close  Volume
## 1 1993-01-29 43.96875 43.96875 43.75000 43.93750  25.88418 1003200
## 2 1993-02-01 43.96875 44.25000 43.96875 44.25000  26.06828  480500
## 3 1993-02-02 44.21875 44.37500 44.12500 44.34375  26.12350  201300
## 4 1993-02-03 44.40625 44.84375 44.37500 44.81250  26.39965  529400
## 5 1993-02-04 44.96875 45.09375 44.46875 45.00000  26.51011  531500
## 6 1993-02-05 44.96875 45.06250 44.71875 44.96875  26.49171  492100

tidying and adding features to dataset

# convert the Date column to Date objects
daily.spy$Date = as.Date(daily.spy$Date)

len=dim(daily.spy)[1]

# adding 200 and 50 day moving averages
daily.spy$ma_200=moving_avg(daily.spy$Close,200)
daily.spy$ma_50=moving_avg(daily.spy$Close,50)

# add percent change column
daily.spy$change_day = 100*(daily.spy$Close - daily.spy$Open)/daily.spy$Open

# add change_next column (the next day's percent change); the last row is padded with 0
daily.spy$change_next = c(daily.spy$change_day[2:len],0)

# create binary 'tmro_green' column based on the next day's price action
daily.spy$tmro_green = to_binary(daily.spy$change_next,0)
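
As a quick sanity check, the base rate of green days gives a natural baseline for any classifier (output not shown):

# fraction of days followed by a green day; drop the last row since its
# change_next was padded with 0
mean(daily.spy$tmro_green[1:(len-1)])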

plotting SPY price action

# drop the first 200 rows (no 200-day moving average yet) and the last row (no next day) for plotting
plot.spy = daily.spy[201:(len-1),]

# stocks only go up
plot.spy%>%
  ggplot(aes(x=Date,group=1))+
  geom_line(aes(y=Close))+
  geom_line(aes(y=ma_200),color='red')+
  geom_line(aes(y=ma_50),color='blue')+
  scale_x_date(date_labels = '%Y')+
  labs(title = 'Price Action for SPY', y='Price', x='Date')

Price vs Volume

# as Volume gets larger, the magnitude of the change increases; however, there
# is no clear direction (green or red)
plot.spy%>%
  ggplot(aes(x=log(Volume), y=change_next))+
  geom_point()+
  geom_smooth(method='lm', formula= y~x)+
  labs(y= '% Change Next Day', title = "Price Action vs Volume")
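
One way to quantify this is to correlate volume with the absolute change rather than the signed change (output not shown):

# magnitude vs volume should be positive; direction vs volume should be near 0
cor(log(plot.spy$Volume), abs(plot.spy$change_next))
cor(log(plot.spy$Volume), plot.spy$change_next)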

daily change vs change next day

# daily change does not seem predictive of the next day's change
plot.spy%>%
  ggplot(aes(x=change_day,y=change_next))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Daily % Change')
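
The raw correlation tells the same story as the flat regression line (output not shown):

cor(plot.spy$change_day, plot.spy$change_next)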

Since daily price action did not seem to be predictive of the next day's price action, I will look at hourly data to get more features.

#loading hourly data
spy.1h = 'spy_1h.csv'
hourly = read.csv(spy.1h)

# to date object
hourly$X = as.Date(hourly$X)
# remove today's rows (the session may be incomplete)
now = Sys.Date()
hourly = hourly[hourly$X!=now,]

# create change %
size_hourly = dim(hourly)[1]
hourly$change = 100*(hourly$Close-hourly$Open)/hourly$Open

# create green (binary:1,0) column
hourly$green = to_binary(hourly$change,0)

# normalized change
hourly$change_var = normalize(hourly$change)

#create hour labels
hourly$hour=list_of_instance(hourly,'X')

head(hourly)
##            X     Open   High    Low   Close Adj.Close   Volume     change green
## 1 2019-05-13 282.4200 283.49 281.07 281.720   281.720 30577306 -0.2478621     0
## 2 2019-05-13 281.7100 282.54 280.96 281.230   281.230 12211553 -0.1703811     0
## 3 2019-05-13 281.2287 281.56 280.48 280.720   280.720 10115490 -0.1808839     0
## 4 2019-05-13 280.7100 281.16 279.93 280.025   280.025 10870050 -0.2440232     0
## 5 2019-05-13 280.0200 281.97 279.98 281.550   281.550 16279994  0.5463891     1
## 6 2019-05-13 281.5600 282.24 280.70 280.820   280.820 13127449 -0.2628180     0
##   change_var hour
## 1 -0.6146077   h1
## 2 -0.4243478   h2
## 3 -0.4501383   h3
## 4 -0.6051811   h4
## 5  1.3357294   h5
## 6 -0.6513328   h6

hour vs next hour

# create % change for the next hour; the last row is padded with 0
hourly$next_change = c(hourly$change[2:size_hourly],0)

# hourly % change is not very predictive of the next hour's change
hourly%>%
  ggplot(aes(x=change,y=next_change))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Hourly % Change')
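
As with the daily data, the raw correlation backs this up; the last row is dropped since its next_change was padded with 0 (output not shown):

cor(hourly$change[-size_hourly], hourly$next_change[-size_hourly])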

distribution of change

# the % change variable looks approximately normally distributed
hourly%>%
  ggplot(aes(x=change))+geom_histogram(bins=30)
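
A Q-Q plot is a quick way to check the normality claim (output not shown):

qqnorm(hourly$change, main='Q-Q Plot of Hourly % Change')
qqline(hourly$change)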

Create dataframes for modeling

% Change

change = hourly[c('X','hour','change')]%>%
  pivot_wider(names_from = hour,values_from=change)
change
## # A tibble: 503 x 8
##    X               h1       h2      h3       h4       h5      h6      h7
##    <date>       <dbl>    <dbl>   <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
##  1 2019-05-13 -0.248  -0.170   -0.181  -0.244    0.546   -0.263   0.0534
##  2 2019-05-14  0.514   0.205    0.0510  0.243    0.0124  -0.172  -0.324 
##  3 2019-05-15  0.838   0.347    0.0105 -0.0322   0.234    0.0420 -0.221 
##  4 2019-05-16  0.878   0.0798   0.0936 -0.0831  -0.00347 -0.217  -0.118 
##  5 2019-05-17  0.628   0.410   -0.371   0.139   -0.0296  -0.305  -0.164 
##  6 2019-05-20  0.324  -0.0877   0.0158 -0.0983  -0.116   -0.113   0.0546
##  7 2019-05-21  0.0140  0.217   -0.0454  0.0384   0.126   -0.0680 -0.0523
##  8 2019-05-22  0.217   0.00700 -0.149  -0.0770   0.149    0.0595 -0.147 
##  9 2019-05-23 -0.332  -0.0283  -0.252  -0.0604  -0.139    0.420   0.0674
## 10 2019-05-24 -0.233  -0.240    0.202   0.00706 -0.0565   0.0919 -0.187 
## # ... with 493 more rows

Change normalized

norm = hourly[c('X','hour','change_var')]%>%
  pivot_wider(names_from = hour,values_from=change_var)
head(norm)
## # A tibble: 6 x 8
##   X              h1     h2      h3      h4      h5      h6     h7
##   <date>      <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1 2019-05-13 -0.615 -0.424 -0.450  -0.605   1.34   -0.651   0.125
## 2 2019-05-14  1.26   0.497  0.119   0.590   0.0244 -0.428  -0.800
## 3 2019-05-15  2.05   0.846  0.0199 -0.0850  0.569   0.0972 -0.548
## 4 2019-05-16  2.15   0.190  0.224  -0.210  -0.0145 -0.540  -0.296
## 5 2019-05-17  1.54   1.00  -0.918   0.336  -0.0786 -0.754  -0.409
## 6 2019-05-20  0.789 -0.221  0.0329 -0.247  -0.291  -0.283   0.128

Binary Changes

green = hourly[c('X','hour','green')]%>%
  pivot_wider(names_from = hour,values_from=green)
green
## # A tibble: 503 x 8
##    X             h1    h2    h3    h4    h5    h6    h7
##    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2019-05-13     0     0     0     0     1     0     1
##  2 2019-05-14     1     1     1     1     1     0     0
##  3 2019-05-15     1     1     1     0     1     1     0
##  4 2019-05-16     1     1     1     0     0     0     0
##  5 2019-05-17     1     1     0     1     0     0     0
##  6 2019-05-20     1     0     1     0     0     0     1
##  7 2019-05-21     1     1     0     1     1     0     0
##  8 2019-05-22     1     1     0     0     1     1     0
##  9 2019-05-23     0     0     0     0     0     1     1
## 10 2019-05-24     0     0     1     1     0     1     0
## # ... with 493 more rows

Create a dataframe with the SPY date, volume, and the next day's outcome

next.spy = daily.spy[c('Date','Volume','tmro_green')]

# merge each day's hourly price action with the next day's outcome
hourly.change = next.spy%>%
  inner_join(change,by=(c('Date'='X')))%>%
  replace(is.na(.),0) # days with fewer hourly bars produce NAs; fill them with 0

# create training and evaluation sets (70/30 random split)
size=dim(hourly.change)[1]
set.seed(1111)

training = sample(seq(size),size = round(size*.7))

change.train = hourly.change[training,]
change.test = hourly.change[-training,]
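
Since the split is random, it is worth confirming both sets keep a similar base rate of green days (output not shown):

mean(change.train$tmro_green)
mean(change.test$tmro_green)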

model 1

m.change = glm(tmro_green~.-Date, change.train,family='binomial')
summary(m.change)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = change.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.652  -1.289   0.957   1.061   1.348  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.342e-01  2.217e-01   1.508    0.132
## Volume      -2.961e-10  2.283e-09  -0.130    0.897
## h1          -2.874e-01  2.470e-01  -1.164    0.245
## h2           2.907e-01  2.573e-01   1.130    0.259
## h3          -6.325e-02  3.596e-01  -0.176    0.860
## h4          -5.523e-01  3.500e-01  -1.578    0.115
## h5          -1.137e-01  4.081e-01  -0.279    0.781
## h6          -2.317e-01  3.363e-01  -0.689    0.491
## h7          -1.068e-01  2.452e-01  -0.436    0.663
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 473.73  on 343  degrees of freedom
## AIC: 491.73
## 
## Number of Fisher Scoring iterations: 4

model 2

# merge with normalized change
hourly.norm = next.spy%>%
  inner_join(norm,by=(c('Date'='X')))%>%
  replace(is.na(.),0)

norm.train = hourly.norm[training,]
norm.test = hourly.norm[-training,]

m.norm = glm(tmro_green~.-Date,norm.train,family='binomial')
summary(m.norm)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = norm.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6524  -1.2895   0.9571   1.0608   1.3479  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.316e-01  2.212e-01   1.499    0.134
## Volume      -2.956e-10  2.283e-09  -0.129    0.897
## h1          -1.171e-01  1.006e-01  -1.164    0.245
## h2           1.184e-01  1.048e-01   1.130    0.259
## h3          -2.578e-02  1.464e-01  -0.176    0.860
## h4          -2.249e-01  1.425e-01  -1.578    0.115
## h5          -4.618e-02  1.662e-01  -0.278    0.781
## h6          -9.429e-02  1.369e-01  -0.689    0.491
## h7          -4.348e-02  9.986e-02  -0.435    0.663
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 473.73  on 343  degrees of freedom
## AIC: 491.73
## 
## Number of Fisher Scoring iterations: 4

model 3

# merge with binary change
hourly.green = next.spy%>%
  inner_join(green,by=(c('Date'='X')))%>%
  replace(is.na(.),0)

green.train = hourly.green[training,]
green.test = hourly.green[-training,]

m.green = glm(tmro_green~. -Date,green.train,family='binomial')
summary(m.green)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = green.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6358  -1.2228   0.9088   1.0373   1.3478  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  8.712e-01  4.079e-01   2.136   0.0327 *
## Volume      -8.832e-10  1.987e-09  -0.444   0.6567  
## h1          -2.415e-01  2.233e-01  -1.081   0.2797  
## h2           1.804e-01  2.223e-01   0.811   0.4171  
## h3          -2.187e-01  2.216e-01  -0.987   0.3238  
## h4          -1.526e-01  2.250e-01  -0.678   0.4977  
## h5          -1.782e-01  2.237e-01  -0.797   0.4255  
## h6          -4.171e-01  2.199e-01  -1.897   0.0578 .
## h7           8.246e-02  2.215e-01   0.372   0.7098  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 472.08  on 343  degrees of freedom
## AIC: 490.08
## 
## Number of Fisher Scoring iterations: 4

testing models

#setting up test df
tmro.green = hourly.change[c('tmro_green')]
test = data.frame(tmro_green=tmro.green[-training,])
test$change=to_binary(predict(m.change,change.test,type='response'),0.5)
test$norm=to_binary(predict(m.norm,norm.test,type='response'),0.5)
test$green=to_binary(predict(m.green,green.test,type='response'),0.5)
head(test)
##   tmro_green change norm green
## 1          1      1    1     1
## 2          1      1    1     0
## 3          0      1    1     1
## 4          1      1    1     1
## 5          0      1    1     0
## 6          1      1    1     1
#change data to factors
test = test%>%
  mutate(tmro_green = as.factor(tmro_green),
         change = as.factor(change),
         norm = as.factor(norm),
         green = as.factor(green)
         )
# % change model
confusionMatrix(data=test$change,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  5  7
##          1 66 73
##                                           
##                Accuracy : 0.5166          
##                  95% CI : (0.4339, 0.5985)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.6587          
##                                           
##                   Kappa : -0.0179         
##                                           
##  Mcnemar's Test P-Value : 1.134e-11       
##                                           
##             Sensitivity : 0.07042         
##             Specificity : 0.91250         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.52518         
##              Prevalence : 0.47020         
##          Detection Rate : 0.03311         
##    Detection Prevalence : 0.07947         
##       Balanced Accuracy : 0.49146         
##                                           
##        'Positive' Class : 0               
## 
# normed model
confusionMatrix(data=test$norm,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  5  7
##          1 66 73
##                                           
##                Accuracy : 0.5166          
##                  95% CI : (0.4339, 0.5985)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.6587          
##                                           
##                   Kappa : -0.0179         
##                                           
##  Mcnemar's Test P-Value : 1.134e-11       
##                                           
##             Sensitivity : 0.07042         
##             Specificity : 0.91250         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.52518         
##              Prevalence : 0.47020         
##          Detection Rate : 0.03311         
##    Detection Prevalence : 0.07947         
##       Balanced Accuracy : 0.49146         
##                                           
##        'Positive' Class : 0               
## 
#binary model
confusionMatrix(data=test$green,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16 15
##          1 55 65
##                                           
##                Accuracy : 0.5364          
##                  95% CI : (0.4535, 0.6178)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.4682          
##                                           
##                   Kappa : 0.0391          
##                                           
##  Mcnemar's Test P-Value : 3.141e-06       
##                                           
##             Sensitivity : 0.2254          
##             Specificity : 0.8125          
##          Pos Pred Value : 0.5161          
##          Neg Pred Value : 0.5417          
##              Prevalence : 0.4702          
##          Detection Rate : 0.1060          
##    Detection Prevalence : 0.2053          
##       Balanced Accuracy : 0.5189          
##                                           
##        'Positive' Class : 0               
## 

did we perform better than average?

# baseline: accuracy from always predicting green (the no-information rate) -- the binary model beats it only slightly
only_up = sum(green.test$tmro_green)/dim(test)[1]
only_up
## [1] 0.5298013

roc curve for model 3

# note: this ROC is computed on the training predictions (log-odds scale)
green.roc = roc(green.train$tmro_green,predict(m.green))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# curve maximizes around a threshold of 0.5
plot(green.roc, main='ROC for Binary Model')

#auc
auc(green.roc)
## Area under the curve: 0.5908
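
pROC's coords() can report the operating point that maximizes Youden's J; since predict(m.green) returns log-odds, a 'best' threshold near 0 corresponds to a probability near 0.5 (output not shown):

coords(green.roc, 'best', ret=c('threshold','sensitivity','specificity'))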

naive bayes model

# since our features appear (roughly) independent, let's try naive bayes
n.col = dim(hourly.change)[2]
hourly.nb = hourly.change[3:n.col] # drop the Date and Volume columns
nb.train = hourly.nb[training,]
nb.test = hourly.nb[-training,]

m.nb = naiveBayes(tmro_green~.,nb.train)
test$nb = predict(m.nb,nb.test)
test$nb = as.factor(test$nb)
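
For a peek under the hood, the fitted e1071 model stores the class-conditional mean and standard deviation of each predictor (output not shown):

# mean and sd of the first hour's % change, conditional on tmro_green
m.nb$tables$h1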

# we beat the baseline with 56.29% accuracy! (though the P-Value [Acc > NIR] of 0.23 below shows the edge is not statistically significant)
confusionMatrix(data=test$nb,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52 47
##          1 19 33
##                                           
##                Accuracy : 0.5629          
##                  95% CI : (0.4799, 0.6434)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.231849        
##                                           
##                   Kappa : 0.1417          
##                                           
##  Mcnemar's Test P-Value : 0.000889        
##                                           
##             Sensitivity : 0.7324          
##             Specificity : 0.4125          
##          Pos Pred Value : 0.5253          
##          Neg Pred Value : 0.6346          
##              Prevalence : 0.4702          
##          Detection Rate : 0.3444          
##    Detection Prevalence : 0.6556          
##       Balanced Accuracy : 0.5724          
##                                           
##        'Positive' Class : 0               
## 

test on new data

# (Python) download today's hourly bars
today = yf.download(tickers='SPY',period='1d',interval = '1h')
## 
[*********************100%***********************]  1 of 1 completed
today.to_csv('today.csv')

# (R) read the csv back into R
today = read.csv('today.csv')


pipeline = function(dataframe,model){
  
  # parse the date column into Date objects
  df = dataframe%>%
    mutate(X = as.Date(X))
  
  #create %change field
  df$change = 100*(df$Close-df$Open)/df$Open
  
  #create hour grouping
  df$hour = list_of_instance(df,'X')
  
  # subset the data to only the needed columns
  subset = df[c('X','hour','change')]%>%
    pivot_wider(names_from=hour, values_from=change)
  #return model results
  return(predict(model,subset))
}


pipeline(today,m.nb)
## [1] 1
## Levels: 0 1

conclusion

the stock market is hard to predict :(
- daily price action is not predictive of the next day's price action
- hourly price action is not predictive of the next day's price action
- Volume is not very predictive of price action

naive bayes is a decent predictor since the features appear largely uncorrelated
- stocks are going up tomorrow (according to this model; not financial advice)