requesting data from yfinance

import yfinance as yf
# download daily price action for SPY over the maximum available period, then save to the working directory
spy_max = yf.download(tickers='SPY',period='max',interval = '1d')
## 
[*********************100%***********************]  1 of 1 completed
spy_max.to_csv('spy_max.csv')

# hourly data over a 2-year period (pulled 5/11/2021)
spy_1h = yf.download(tickers='SPY',period='2y',interval = '1h')
## 
[*********************100%***********************]  1 of 1 completed
spy_1h.to_csv('spy_1h.csv')

loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5

functions used

# computes a trailing moving average of a vector over a given window (times);
# the first 'times' entries are padded with 0
moving_avg = function(arr,times=200){
  len = length(arr)
  avgs = c(rep(0,times))
  for (i in (times+1):len){
    avgs = c(avgs,mean(arr[(i-times):(i-1)]))
  }
  return(avgs)
}
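
Side note: growing avgs with c() inside the loop is quadratic in the length of the input. A vectorized sketch using cumsum that follows the same trailing-window convention (moving_avg_fast is just an illustrative name, and it assumes no NAs in arr):

# vectorized trailing moving average; pads the first 'times' entries with 0,
# matching moving_avg above
moving_avg_fast = function(arr,times=200){
  cs = c(0,cumsum(arr))
  idx = (times+1):length(arr)
  return(c(rep(0,times),(cs[idx]-cs[idx-times])/times))
}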

# counts the number of rows for each unique value of a column
number_instances = function(dataframe,instance){
  dataframe%>%
    group_by_at(instance)%>%
    summarize(count=n())
}

# labels each row within a date group h1, h2, ... (one label per hour of the day)
list_of_instance = function(dataframe,instance){
  instances = number_instances(dataframe,instance)
  hour_label = c()
  for (count in instances$count){
    for (i in seq_len(count)){
      hour_label = c(hour_label,paste0("h",i))
    }
  }
  return(hour_label)
}

# normalizes a vector to z-scores (subtract the mean, divide by the standard deviation)
normalize = function(vector){
  avg = mean(vector)
  std = sd(vector)
  return((vector-avg)/std)
}

# converts a numeric vector to binary (1/0) based on a set threshold
to_binary = function(vector,thresh){
  return(as.integer(vector > thresh))
}
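
A couple of quick sanity checks for the helpers above (output not shown):

# expect: 0 0 1 -- values must be strictly greater than the threshold
to_binary(c(-0.5,0,0.2),0)
# expect: -1 0 1 -- z-scores of an evenly spaced vector
round(normalize(c(1,2,3)),2)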

loading data

# load the downloaded csv file from the working directory into R
spy.max = "spy_max.csv"
daily.spy= read.csv(spy.max)
head(daily.spy)
##         Date     Open     High      Low    Close Adj.Close  Volume
## 1 1993-01-29 43.96875 43.96875 43.75000 43.93750  25.88418 1003200
## 2 1993-02-01 43.96875 44.25000 43.96875 44.25000  26.06828  480500
## 3 1993-02-02 44.21875 44.37500 44.12500 44.34375  26.12350  201300
## 4 1993-02-03 44.40625 44.84375 44.37500 44.81250  26.39965  529400
## 5 1993-02-04 44.96875 45.09375 44.46875 45.00000  26.51011  531500
## 6 1993-02-05 44.96875 45.06250 44.71875 44.96875  26.49171  492100

tidying and adding features to dataset

# convert the Date column to Date objects
daily.spy$Date = as.Date(daily.spy$Date)

len=dim(daily.spy)[1]

# adding 200 and 50 day moving averages
daily.spy$ma_200=moving_avg(daily.spy$Close,200)
daily.spy$ma_50=moving_avg(daily.spy$Close,50)

# add percent change column
daily.spy$change_day = 100*(daily.spy$Close - daily.spy$Open)/daily.spy$Open

# add change_next column (the next day's percent change); the last row is padded with 0
daily.spy$change_next = c(daily.spy$change_day[2:len],0)

# create binary 'tmro_green' column based on the next day's price action
daily.spy$tmro_green = to_binary(daily.spy$change_next,0)
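
As a quick sanity check, the base rate of green days gives a natural baseline for any classifier (output not shown):

# fraction of days followed by a green day; drop the last row since its
# change_next was padded with 0
mean(daily.spy$tmro_green[1:(len-1)])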

plotting SPY price action

# drop the first 200 rows (no 200-day moving average yet) and the last row (no next day) for plotting
plot.spy = daily.spy[201:(len-1),]

# stocks only go up
plot.spy%>%
  ggplot(aes(x=Date,group=1))+
  geom_line(aes(y=Close))+
  geom_line(aes(y=ma_200),color='red')+
  geom_line(aes(y=ma_50),color='blue')+
  scale_x_date(date_labels = '%Y')+
  labs(title = 'Price Action for SPY', y='Price', x='Date')

Price vs Volume

# as Volume gets larger, the magnitude of the change increases; however, there
# is no clear direction (green or red)
plot.spy%>%
  ggplot(aes(x=log(Volume), y=change_next))+
  geom_point()+
  geom_smooth(method='lm', formula= y~x)+
  labs(y= '% Change Next Day', title = "Price Action vs Volume")
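
One way to quantify this is to correlate volume with the absolute change rather than the signed change (output not shown):

# magnitude vs volume should be positive; direction vs volume should be near 0
cor(log(plot.spy$Volume), abs(plot.spy$change_next))
cor(log(plot.spy$Volume), plot.spy$change_next)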

daily change vs change next day

# daily change does not seem predictive of the next day's change
plot.spy%>%
  ggplot(aes(x=change_day,y=change_next))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Daily % Change')
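
The raw correlation tells the same story as the flat regression line (output not shown):

cor(plot.spy$change_day, plot.spy$change_next)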

Since daily price action did not seem to be predictive of the next day's price action, I will look at hourly data to get more features.

#loading hourly data
spy.1h = 'spy_1h.csv'
hourly = read.csv(spy.1h)

# to date object
hourly$X = as.Date(hourly$X)
# remove today's rows (the session may be incomplete)
now = Sys.Date()
hourly = hourly[hourly$X!=now,]

# create change %
size_hourly = dim(hourly)[1]
hourly$change = 100*(hourly$Close-hourly$Open)/hourly$Open

# create green (binary:1,0) column
hourly$green = to_binary(hourly$change,0)

# normalized change
hourly$change_var = normalize(hourly$change)

#create hour labels
hourly$hour=list_of_instance(hourly,'X')

head(hourly)
##            X     Open   High    Low   Close Adj.Close   Volume     change green
## 1 2019-05-13 282.4200 283.49 281.07 281.720   281.720 30577306 -0.2478621     0
## 2 2019-05-13 281.7100 282.54 280.96 281.230   281.230 12211553 -0.1703811     0
## 3 2019-05-13 281.2287 281.56 280.48 280.720   280.720 10115490 -0.1808839     0
## 4 2019-05-13 280.7100 281.16 279.93 280.025   280.025 10870050 -0.2440232     0
## 5 2019-05-13 280.0200 281.97 279.98 281.550   281.550 16279994  0.5463891     1
## 6 2019-05-13 281.5600 282.24 280.70 280.820   280.820 13127449 -0.2628180     0
##   change_var hour
## 1 -0.6146077   h1
## 2 -0.4243478   h2
## 3 -0.4501383   h3
## 4 -0.6051811   h4
## 5  1.3357294   h5
## 6 -0.6513328   h6

hour vs next hour

# create % change for the next hour; the last row is padded with 0
hourly$next_change = c(hourly$change[2:size_hourly],0)

# hourly % change is not very predictive of the next hour's change
hourly%>%
  ggplot(aes(x=change,y=next_change))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Hourly % Change')
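
As with the daily data, the raw correlation backs this up; the last row is dropped since its next_change was padded with 0 (output not shown):

cor(hourly$change[-size_hourly], hourly$next_change[-size_hourly])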

distribution of change

# the % change variable looks approximately normally distributed
hourly%>%
  ggplot(aes(x=change))+geom_histogram(bins=30)
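
A Q-Q plot is a quick way to check the normality claim (output not shown):

qqnorm(hourly$change, main='Q-Q Plot of Hourly % Change')
qqline(hourly$change)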

Create dataframes for modeling

% Change

change = hourly[c('X','hour','change')]%>%
  pivot_wider(names_from = hour,values_from=change)
change
## # A tibble: 503 x 8
##    X               h1       h2      h3       h4       h5      h6      h7
##    <date>       <dbl>    <dbl>   <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
##  1 2019-05-13 -0.248  -0.170   -0.181  -0.244    0.546   -0.263   0.0534
##  2 2019-05-14  0.514   0.205    0.0510  0.243    0.0124  -0.172  -0.324 
##  3 2019-05-15  0.838   0.347    0.0105 -0.0322   0.234    0.0420 -0.221 
##  4 2019-05-16  0.878   0.0798   0.0936 -0.0831  -0.00347 -0.217  -0.118 
##  5 2019-05-17  0.628   0.410   -0.371   0.139   -0.0296  -0.305  -0.164 
##  6 2019-05-20  0.324  -0.0877   0.0158 -0.0983  -0.116   -0.113   0.0546
##  7 2019-05-21  0.0140  0.217   -0.0454  0.0384   0.126   -0.0680 -0.0523
##  8 2019-05-22  0.217   0.00700 -0.149  -0.0770   0.149    0.0595 -0.147 
##  9 2019-05-23 -0.332  -0.0283  -0.252  -0.0604  -0.139    0.420   0.0674
## 10 2019-05-24 -0.233  -0.240    0.202   0.00706 -0.0565   0.0919 -0.187 
## # ... with 493 more rows

Change normalized

norm = hourly[c('X','hour','change_var')]%>%
  pivot_wider(names_from = hour,values_from=change_var)
head(norm)
## # A tibble: 6 x 8
##   X              h1     h2      h3      h4      h5      h6     h7
##   <date>      <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
## 1 2019-05-13 -0.615 -0.424 -0.450  -0.605   1.34   -0.651   0.125
## 2 2019-05-14  1.26   0.497  0.119   0.590   0.0244 -0.428  -0.800
## 3 2019-05-15  2.05   0.846  0.0199 -0.0850  0.569   0.0972 -0.548
## 4 2019-05-16  2.15   0.190  0.224  -0.210  -0.0145 -0.540  -0.296
## 5 2019-05-17  1.54   1.00  -0.918   0.336  -0.0786 -0.754  -0.409
## 6 2019-05-20  0.789 -0.221  0.0329 -0.247  -0.291  -0.283   0.128

Binary Changes

green = hourly[c('X','hour','green')]%>%
  pivot_wider(names_from = hour,values_from=green)
green
## # A tibble: 503 x 8
##    X             h1    h2    h3    h4    h5    h6    h7
##    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2019-05-13     0     0     0     0     1     0     1
##  2 2019-05-14     1     1     1     1     1     0     0
##  3 2019-05-15     1     1     1     0     1     1     0
##  4 2019-05-16     1     1     1     0     0     0     0
##  5 2019-05-17     1     1     0     1     0     0     0
##  6 2019-05-20     1     0     1     0     0     0     1
##  7 2019-05-21     1     1     0     1     1     0     0
##  8 2019-05-22     1     1     0     0     1     1     0
##  9 2019-05-23     0     0     0     0     0     1     1
## 10 2019-05-24     0     0     1     1     0     1     0
## # ... with 493 more rows

Create a dataframe with the SPY date, volume, and the next day's outcome

next.spy = daily.spy[c('Date','Volume','tmro_green')]

# merge each day's hourly price action with the next day's outcome
hourly.change = next.spy%>%
  inner_join(change,by=(c('Date'='X')))%>%
  replace(is.na(.),0) # days with fewer hourly bars produce NAs; fill them with 0

# create training and evaluation sets (70/30 random split)
size=dim(hourly.change)[1]
set.seed(1111)

training = sample(seq(size),size = round(size*.7))

change.train = hourly.change[training,]
change.test = hourly.change[-training,]
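
Since the split is random, it is worth confirming both sets keep a similar base rate of green days (output not shown):

mean(change.train$tmro_green)
mean(change.test$tmro_green)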

model 1

m.change = glm(tmro_green~.-Date, change.train,family='binomial')
summary(m.change)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = change.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.652  -1.289   0.957   1.061   1.348  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.342e-01  2.217e-01   1.508    0.132
## Volume      -2.961e-10  2.283e-09  -0.130    0.897
## h1          -2.874e-01  2.470e-01  -1.164    0.245
## h2           2.907e-01  2.573e-01   1.130    0.259
## h3          -6.325e-02  3.596e-01  -0.176    0.860
## h4          -5.523e-01  3.500e-01  -1.578    0.115
## h5          -1.137e-01  4.081e-01  -0.279    0.781
## h6          -2.317e-01  3.363e-01  -0.689    0.491
## h7          -1.068e-01  2.452e-01  -0.436    0.663
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 473.73  on 343  degrees of freedom
## AIC: 491.73
## 
## Number of Fisher Scoring iterations: 4

model 2

# merge with normalized change
hourly.norm = next.spy%>%
  inner_join(norm,by=(c('Date'='X')))%>%
  replace(is.na(.),0)

norm.train = hourly.norm[training,]
norm.test = hourly.norm[-training,]

m.norm = glm(tmro_green~.-Date,norm.train,family='binomial')
summary(m.norm)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = norm.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6524  -1.2895   0.9571   1.0608   1.3479  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.316e-01  2.212e-01   1.499    0.134
## Volume      -2.956e-10  2.283e-09  -0.129    0.897
## h1          -1.171e-01  1.006e-01  -1.164    0.245
## h2           1.184e-01  1.048e-01   1.130    0.259
## h3          -2.578e-02  1.464e-01  -0.176    0.860
## h4          -2.249e-01  1.425e-01  -1.578    0.115
## h5          -4.618e-02  1.662e-01  -0.278    0.781
## h6          -9.429e-02  1.369e-01  -0.689    0.491
## h7          -4.348e-02  9.986e-02  -0.435    0.663
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 473.73  on 343  degrees of freedom
## AIC: 491.73
## 
## Number of Fisher Scoring iterations: 4

model 3

# merge with binary change
hourly.green = next.spy%>%
  inner_join(green,by=(c('Date'='X')))%>%
  replace(is.na(.),0)

green.train = hourly.green[training,]
green.test = hourly.green[-training,]

m.green = glm(tmro_green~. -Date,green.train,family='binomial')
summary(m.green)
## 
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = green.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6358  -1.2228   0.9088   1.0373   1.3478  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  8.712e-01  4.079e-01   2.136   0.0327 *
## Volume      -8.832e-10  1.987e-09  -0.444   0.6567  
## h1          -2.415e-01  2.233e-01  -1.081   0.2797  
## h2           1.804e-01  2.223e-01   0.811   0.4171  
## h3          -2.187e-01  2.216e-01  -0.987   0.3238  
## h4          -1.526e-01  2.250e-01  -0.678   0.4977  
## h5          -1.782e-01  2.237e-01  -0.797   0.4255  
## h6          -4.171e-01  2.199e-01  -1.897   0.0578 .
## h7           8.246e-02  2.215e-01   0.372   0.7098  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 480.27  on 351  degrees of freedom
## Residual deviance: 472.08  on 343  degrees of freedom
## AIC: 490.08
## 
## Number of Fisher Scoring iterations: 4

testing models

#setting up test df
tmro.green = hourly.change[c('tmro_green')]
test = data.frame(tmro_green=tmro.green[-training,])
test$change=to_binary(predict(m.change,change.test,type='response'),0.5)
test$norm=to_binary(predict(m.norm,norm.test,type='response'),0.5)
test$green=to_binary(predict(m.green,green.test,type='response'),0.5)
head(test)
##   tmro_green change norm green
## 1          1      1    1     1
## 2          1      1    1     0
## 3          0      1    1     1
## 4          1      1    1     1
## 5          0      1    1     0
## 6          1      1    1     1
#change data to factors
test = test%>%
  mutate(tmro_green = as.factor(tmro_green),
         change = as.factor(change),
         norm = as.factor(norm),
         green = as.factor(green)
         )
# % change model
confusionMatrix(data=test$change,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  5  7
##          1 66 73
##                                           
##                Accuracy : 0.5166          
##                  95% CI : (0.4339, 0.5985)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.6587          
##                                           
##                   Kappa : -0.0179         
##                                           
##  Mcnemar's Test P-Value : 1.134e-11       
##                                           
##             Sensitivity : 0.07042         
##             Specificity : 0.91250         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.52518         
##              Prevalence : 0.47020         
##          Detection Rate : 0.03311         
##    Detection Prevalence : 0.07947         
##       Balanced Accuracy : 0.49146         
##                                           
##        'Positive' Class : 0               
## 
# normed model
confusionMatrix(data=test$norm,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  5  7
##          1 66 73
##                                           
##                Accuracy : 0.5166          
##                  95% CI : (0.4339, 0.5985)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.6587          
##                                           
##                   Kappa : -0.0179         
##                                           
##  Mcnemar's Test P-Value : 1.134e-11       
##                                           
##             Sensitivity : 0.07042         
##             Specificity : 0.91250         
##          Pos Pred Value : 0.41667         
##          Neg Pred Value : 0.52518         
##              Prevalence : 0.47020         
##          Detection Rate : 0.03311         
##    Detection Prevalence : 0.07947         
##       Balanced Accuracy : 0.49146         
##                                           
##        'Positive' Class : 0               
## 
#binary model
confusionMatrix(data=test$green,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16 15
##          1 55 65
##                                           
##                Accuracy : 0.5364          
##                  95% CI : (0.4535, 0.6178)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.4682          
##                                           
##                   Kappa : 0.0391          
##                                           
##  Mcnemar's Test P-Value : 3.141e-06       
##                                           
##             Sensitivity : 0.2254          
##             Specificity : 0.8125          
##          Pos Pred Value : 0.5161          
##          Neg Pred Value : 0.5417          
##              Prevalence : 0.4702          
##          Detection Rate : 0.1060          
##    Detection Prevalence : 0.2053          
##       Balanced Accuracy : 0.5189          
##                                           
##        'Positive' Class : 0               
## 

did we perform better than average?

# baseline: accuracy from always predicting green (the no-information rate) -- the binary model beats it only slightly
only_up = sum(green.test$tmro_green)/dim(test)[1]
only_up
## [1] 0.5298013

roc curve for model 3

# note: this ROC is computed on the training predictions (log-odds scale)
green.roc = roc(green.train$tmro_green,predict(m.green))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# curve maximizes around a threshold of 0.5
plot(green.roc, main='ROC for Binary Model')

#auc
auc(green.roc)
## Area under the curve: 0.5908
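
pROC's coords() can report the operating point that maximizes Youden's J; since predict(m.green) returns log-odds, a 'best' threshold near 0 corresponds to a probability near 0.5 (output not shown):

coords(green.roc, 'best', ret=c('threshold','sensitivity','specificity'))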

naive bayes model

# since our features appear (roughly) independent, let's try naive bayes
n.col = dim(hourly.change)[2]
hourly.nb = hourly.change[3:n.col] # drop the Date and Volume columns
nb.train = hourly.nb[training,]
nb.test = hourly.nb[-training,]

m.nb = naiveBayes(tmro_green~.,nb.train)
test$nb = predict(m.nb,nb.test)
test$nb = as.factor(test$nb)
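
For a peek under the hood, the fitted e1071 model stores the class-conditional mean and standard deviation of each predictor (output not shown):

# mean and sd of the first hour's % change, conditional on tmro_green
m.nb$tables$h1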

# we beat the baseline with 56.29% accuracy! (though the P-Value [Acc > NIR] of 0.23 below shows the edge is not statistically significant)
confusionMatrix(data=test$nb,reference=test$tmro_green)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52 47
##          1 19 33
##                                           
##                Accuracy : 0.5629          
##                  95% CI : (0.4799, 0.6434)
##     No Information Rate : 0.5298          
##     P-Value [Acc > NIR] : 0.231849        
##                                           
##                   Kappa : 0.1417          
##                                           
##  Mcnemar's Test P-Value : 0.000889        
##                                           
##             Sensitivity : 0.7324          
##             Specificity : 0.4125          
##          Pos Pred Value : 0.5253          
##          Neg Pred Value : 0.6346          
##              Prevalence : 0.4702          
##          Detection Rate : 0.3444          
##    Detection Prevalence : 0.6556          
##       Balanced Accuracy : 0.5724          
##                                           
##        'Positive' Class : 0               
## 

test on new data

# (Python) download today's hourly bars
today = yf.download(tickers='SPY',period='1d',interval = '1h')
## 
[*********************100%***********************]  1 of 1 completed
today.to_csv('today.csv')

# (R) read the csv back into R
today = read.csv('today.csv')


pipeline = function(dataframe,model){
  
  # parse the date column into Date objects
  df = dataframe%>%
    mutate(X = as.Date(X))
  
  #create %change field
  df$change = 100*(df$Close-df$Open)/df$Open
  
  #create hour grouping
  df$hour = list_of_instance(df,'X')
  
  # subset the data to only the needed columns
  subset = df[c('X','hour','change')]%>%
    pivot_wider(names_from=hour, values_from=change)
  #return model results
  return(predict(model,subset))
}


pipeline(today,m.nb)
## [1] 1
## Levels: 0 1

conclusion

the stock market is hard to predict :(
- daily price action is not predictive of the next day's price action
- hourly price action is not predictive of the next day's price action
- Volume is not very predictive of price action

naive bayes is a decent predictor since the features appear largely uncorrelated
- stocks are going up tomorrow (according to this model; not financial advice)