requesting data from yfinance
import yfinance as yf
# pull daily price action for SPY over its full available history and save to the working directory
spy_max = yf.download(tickers='SPY',period='max',interval = '1d')
##
[*********************100%***********************] 1 of 1 completed
spy_max.to_csv('spy_max.csv')
# 2y of hourly bars, pulled on 5/11/2021
spy_1h = yf.download(tickers='SPY',period='2y',interval = '1h')
##
[*********************100%***********************] 1 of 1 completed
spy_1h.to_csv('spy_1h.csv')
loading packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
functions used
# returns a vector of trailing moving averages over a window of size `times`;
# the first `times` entries are padded with 0 since no full window exists yet
moving_avg = function(arr,times=200){
  len = length(arr)
  avgs = c(rep(0,times))
  for (i in (times+1):len){
    avgs = c(avgs,mean(arr[(i-times):(i-1)]))
  }
  return(avgs)
}
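Growing avgs with c() inside the loop gets slow for long series. A vectorized sketch of the same trailing average (moving_avg_fast is a hypothetical name; stats::filter is called with its namespace since dplyr masks filter, per the attach messages above):
# a sketch of a vectorized equivalent (same 0-padding and one-bar lag)
moving_avg_fast = function(arr, times=200){
  filt = stats::filter(arr, rep(1/times, times), sides = 1)
  # shift by one bar so each window ends at the previous observation
  c(rep(0, times), as.numeric(filt[times:(length(arr) - 1)]))
}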
# counts the number of rows per unique value of `instance`
number_instances = function(dataframe,instance){
  dataframe%>%
    group_by_at(instance)%>%
    summarize(count=n())
}
# labels the rows within each instance (here, each trading day) h1, h2, ... in order
list_of_instance = function(dataframe,instance){
  instances = number_instances(dataframe,instance)
  hour_label = c()
  for (count in instances$count){
    for (i in seq_len(count)){
      hour_label = c(hour_label,paste0("h",i))
    }
  }
  return(hour_label)
}
# standardizes a vector to z-scores (subtract the mean, divide by the sd)
normalize = function(vector){
  avg = mean(vector)
  std = sd(vector)
  return((vector-avg)/std)
}
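Base R's scale() computes the same z-scores; a quick equivalence check (a sketch):
# all.equal() allows for floating-point wiggle; scale() returns a matrix
all.equal(normalize(c(1,2,3,4)), as.numeric(scale(c(1,2,3,4))))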
# converts a numeric vector to binary (1/0) based on a set threshold
to_binary = function(vector,thresh){
  binary = c()
  for(i in vector){
    if (i>thresh){
      binary = c(binary,1)
    }else{
      binary = c(binary,0)
    }
  }
  return(binary)
}
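The loop collapses to a single vectorized comparison; a sketch returning the same 1/0 vector (to_binary_vec is a hypothetical name):
# the comparison yields TRUE/FALSE; as.integer() maps those to 1/0
to_binary_vec = function(vector,thresh) as.integer(vector > thresh)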
loading data
# load the downloaded csv file from the working directory into R
spy.max = "spy_max.csv"
daily.spy= read.csv(spy.max)
head(daily.spy)
## Date Open High Low Close Adj.Close Volume
## 1 1993-01-29 43.96875 43.96875 43.75000 43.93750 25.88418 1003200
## 2 1993-02-01 43.96875 44.25000 43.96875 44.25000 26.06828 480500
## 3 1993-02-02 44.21875 44.37500 44.12500 44.34375 26.12350 201300
## 4 1993-02-03 44.40625 44.84375 44.37500 44.81250 26.39965 529400
## 5 1993-02-04 44.96875 45.09375 44.46875 45.00000 26.51011 531500
## 6 1993-02-05 44.96875 45.06250 44.71875 44.96875 26.49171 492100
tidying and adding features to dataset
# convert the Date column from character to a Date object
daily.spy$Date = as.Date(daily.spy$Date)
len=dim(daily.spy)[1]
# adding 200 and 50 day moving averages
daily.spy$ma_200=moving_avg(daily.spy$Close,200)
daily.spy$ma_50=moving_avg(daily.spy$Close,50)
# add percent change column
daily.spy$change_day = 100*(daily.spy$Close - daily.spy$Open)/daily.spy$Open
# add change_next column (percent change next day)
daily.spy$change_next = c(daily.spy$change_day[2:len],0)
# create binary 'tmro_green' column based on the next day's price action
daily.spy$tmro_green = to_binary(daily.spy$change_next,0)
plotting SPY price action
# drop the first 200 rows (no 200-day MA yet) and the last row (no next day) for plotting
plot.spy = daily.spy[201:(len-1),]
# stocks only go up
plot.spy%>%
  ggplot(aes(x=Date,group=1))+
  geom_line(aes(y=Close))+
  geom_line(aes(y=ma_200),color='red')+
  geom_line(aes(y=ma_50),color='blue')+
  scale_x_date(date_labels = '%Y')+
  labs(title = 'Price Action for SPY', y='Price',x='Date')

Price vs Volume
# as Volume gets larger, the magnitude of the change increases; however, there is no clear direction (green or red)
plot.spy%>%
  ggplot(aes(x=log(Volume), y=change_next))+
  geom_point()+
  geom_smooth(method='lm', formula= y~x)+
  labs(y= '% Change Next Day', title = "Price Action vs Volume")
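A quick numeric check on that magnitude claim (a sketch; the exact coefficient depends on the data snapshot):
# correlate log-volume with the absolute size of the next day's move
cor(log(plot.spy$Volume), abs(plot.spy$change_next))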

daily change vs change next day
# daily change does not seem predictive of the next day's change
plot.spy%>%
  ggplot(aes(x=change_day,y=change_next))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Daily % Change')

Since daily price action did not seem predictive of the next day's price action, I will look at hourly data to get more features.
#loading hourly data
spy.1h = 'spy_1h.csv'
hourly = read.csv(spy.1h)
# convert the timestamp column (X) to a Date object
hourly$X = as.Date(hourly$X)
# remove today
now = Sys.Date()
hourly = hourly[hourly$X!=now,]
size_hourly = dim(hourly)[1]
# create % change column
hourly$change = 100*(hourly$Close-hourly$Open)/hourly$Open
# create green (binary:1,0) column
hourly$green = to_binary(hourly$change,0)
# normalized change
hourly$change_var = normalize(hourly$change)
# create hour labels (h1..h7 within each trading day)
hourly$hour=list_of_instance(hourly,'X')
head(hourly)
## X Open High Low Close Adj.Close Volume change green
## 1 2019-05-13 282.4200 283.49 281.07 281.720 281.720 30577306 -0.2478621 0
## 2 2019-05-13 281.7100 282.54 280.96 281.230 281.230 12211553 -0.1703811 0
## 3 2019-05-13 281.2287 281.56 280.48 280.720 280.720 10115490 -0.1808839 0
## 4 2019-05-13 280.7100 281.16 279.93 280.025 280.025 10870050 -0.2440232 0
## 5 2019-05-13 280.0200 281.97 279.98 281.550 281.550 16279994 0.5463891 1
## 6 2019-05-13 281.5600 282.24 280.70 280.820 280.820 13127449 -0.2628180 0
## change_var hour
## 1 -0.6146077 h1
## 2 -0.4243478 h2
## 3 -0.4501383 h3
## 4 -0.6051811 h4
## 5 1.3357294 h5
## 6 -0.6513328 h6
hour vs next hour
# create % change for the next hour
hourly$next_change = c(hourly$change[2:size_hourly],0)
# hourly % change is not very predictive of the next hour's change
hourly%>%
  ggplot(aes(x=change,y=next_change))+
  geom_point()+
  geom_smooth(method='lm',formula=y~x)+
  labs(title='Hourly % Change')

distribution of change
# the % change variable looks approximately normally distributed
hourly%>%
  ggplot(aes(x=change))+
  geom_histogram(bins=30)
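A QQ-plot makes the normality claim easier to judge than a histogram (a quick sketch):
# points hugging the reference line indicate approximate normality
hourly%>%
  ggplot(aes(sample=change))+
  stat_qq()+
  stat_qq_line()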

Create dataframes for modeling
% Change
change = hourly[c('X','hour','change')]%>%
pivot_wider(names_from = hour,values_from=change)
change
## # A tibble: 503 x 8
## X h1 h2 h3 h4 h5 h6 h7
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-05-13 -0.248 -0.170 -0.181 -0.244 0.546 -0.263 0.0534
## 2 2019-05-14 0.514 0.205 0.0510 0.243 0.0124 -0.172 -0.324
## 3 2019-05-15 0.838 0.347 0.0105 -0.0322 0.234 0.0420 -0.221
## 4 2019-05-16 0.878 0.0798 0.0936 -0.0831 -0.00347 -0.217 -0.118
## 5 2019-05-17 0.628 0.410 -0.371 0.139 -0.0296 -0.305 -0.164
## 6 2019-05-20 0.324 -0.0877 0.0158 -0.0983 -0.116 -0.113 0.0546
## 7 2019-05-21 0.0140 0.217 -0.0454 0.0384 0.126 -0.0680 -0.0523
## 8 2019-05-22 0.217 0.00700 -0.149 -0.0770 0.149 0.0595 -0.147
## 9 2019-05-23 -0.332 -0.0283 -0.252 -0.0604 -0.139 0.420 0.0674
## 10 2019-05-24 -0.233 -0.240 0.202 0.00706 -0.0565 0.0919 -0.187
## # ... with 493 more rows
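Days with fewer than seven bars (likely early-close half days) leave NAs in the missing hours after pivot_wider; these get filled with 0 in the joins below. A quick count (a sketch):
# rows (days) with at least one missing hourly bar
sum(!complete.cases(change))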
Change normalized
norm = hourly[c('X','hour','change_var')]%>%
pivot_wider(names_from = hour,values_from=change_var)
head(norm)
## # A tibble: 6 x 8
## X h1 h2 h3 h4 h5 h6 h7
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-05-13 -0.615 -0.424 -0.450 -0.605 1.34 -0.651 0.125
## 2 2019-05-14 1.26 0.497 0.119 0.590 0.0244 -0.428 -0.800
## 3 2019-05-15 2.05 0.846 0.0199 -0.0850 0.569 0.0972 -0.548
## 4 2019-05-16 2.15 0.190 0.224 -0.210 -0.0145 -0.540 -0.296
## 5 2019-05-17 1.54 1.00 -0.918 0.336 -0.0786 -0.754 -0.409
## 6 2019-05-20 0.789 -0.221 0.0329 -0.247 -0.291 -0.283 0.128
Binary Changes
green = hourly[c('X','hour','green')]%>%
pivot_wider(names_from = hour,values_from=green)
green
## # A tibble: 503 x 8
## X h1 h2 h3 h4 h5 h6 h7
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-05-13 0 0 0 0 1 0 1
## 2 2019-05-14 1 1 1 1 1 0 0
## 3 2019-05-15 1 1 1 0 1 1 0
## 4 2019-05-16 1 1 1 0 0 0 0
## 5 2019-05-17 1 1 0 1 0 0 0
## 6 2019-05-20 1 0 1 0 0 0 1
## 7 2019-05-21 1 1 0 1 1 0 0
## 8 2019-05-22 1 1 0 0 1 1 0
## 9 2019-05-23 0 0 0 0 0 1 1
## 10 2019-05-24 0 0 1 1 0 1 0
## # ... with 493 more rows
Create dataframe with SPY date, volume, and next day's outcome
next.spy = daily.spy[c('Date','Volume','tmro_green')]
# merge each day's hourly price action with the next day's outcome
hourly.change = next.spy%>%
inner_join(change,by=(c('Date'='X')))%>%
replace(is.na(.),0)
# create training/evaluation sets (random 70/30 split)
size=dim(hourly.change)[1]
set.seed(1111)
training = sample(seq(size),size = round(size*.7))
change.train = hourly.change[training,]
change.test = hourly.change[-training,]
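One caveat worth flagging: a random sample lets training days fall after test days, so the fits get a peek at the future. A chronological split would avoid that (a sketch with hypothetical names; the models below keep the random split):
# hold out the most recent 30% of days instead of a random sample
cutoff = round(size*.7)
change.train.ts = hourly.change[1:cutoff,]
change.test.ts = hourly.change[(cutoff+1):size,]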
# model 1: logistic regression on raw hourly % changes
m.change = glm(tmro_green~.-Date, change.train,family='binomial')
summary(m.change)
##
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = change.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.652 -1.289 0.957 1.061 1.348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.342e-01 2.217e-01 1.508 0.132
## Volume -2.961e-10 2.283e-09 -0.130 0.897
## h1 -2.874e-01 2.470e-01 -1.164 0.245
## h2 2.907e-01 2.573e-01 1.130 0.259
## h3 -6.325e-02 3.596e-01 -0.176 0.860
## h4 -5.523e-01 3.500e-01 -1.578 0.115
## h5 -1.137e-01 4.081e-01 -0.279 0.781
## h6 -2.317e-01 3.363e-01 -0.689 0.491
## h7 -1.068e-01 2.452e-01 -0.436 0.663
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 480.27 on 351 degrees of freedom
## Residual deviance: 473.73 on 343 degrees of freedom
## AIC: 491.73
##
## Number of Fisher Scoring iterations: 4
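None of the coefficients reach significance. Exponentiating puts them on the more readable odds-ratio scale (a quick sketch):
# odds ratio per one-point move in each hour's % change
exp(coef(m.change))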
model 2
# merge with normalized change
hourly.norm = next.spy%>%
inner_join(norm,by=(c('Date'='X')))%>%
replace(is.na(.),0)
norm.train = hourly.norm[training,]
norm.test = hourly.norm[-training,]
m.norm = glm(tmro_green~.-Date,norm.train,family='binomial')
summary(m.norm)
##
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = norm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6524 -1.2895 0.9571 1.0608 1.3479
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.316e-01 2.212e-01 1.499 0.134
## Volume -2.956e-10 2.283e-09 -0.129 0.897
## h1 -1.171e-01 1.006e-01 -1.164 0.245
## h2 1.184e-01 1.048e-01 1.130 0.259
## h3 -2.578e-02 1.464e-01 -0.176 0.860
## h4 -2.249e-01 1.425e-01 -1.578 0.115
## h5 -4.618e-02 1.662e-01 -0.278 0.781
## h6 -9.429e-02 1.369e-01 -0.689 0.491
## h7 -4.348e-02 9.986e-02 -0.435 0.663
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 480.27 on 351 degrees of freedom
## Residual deviance: 473.73 on 343 degrees of freedom
## AIC: 491.73
##
## Number of Fisher Scoring iterations: 4
model 3
# merge with binary change
hourly.green = next.spy%>%
inner_join(green,by=(c('Date'='X')))%>%
replace(is.na(.),0)
green.train = hourly.green[training,]
green.test = hourly.green[-training,]
m.green = glm(tmro_green~. -Date,green.train,family='binomial')
summary(m.green)
##
## Call:
## glm(formula = tmro_green ~ . - Date, family = "binomial", data = green.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6358 -1.2228 0.9088 1.0373 1.3478
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.712e-01 4.079e-01 2.136 0.0327 *
## Volume -8.832e-10 1.987e-09 -0.444 0.6567
## h1 -2.415e-01 2.233e-01 -1.081 0.2797
## h2 1.804e-01 2.223e-01 0.811 0.4171
## h3 -2.187e-01 2.216e-01 -0.987 0.3238
## h4 -1.526e-01 2.250e-01 -0.678 0.4977
## h5 -1.782e-01 2.237e-01 -0.797 0.4255
## h6 -4.171e-01 2.199e-01 -1.897 0.0578 .
## h7 8.246e-02 2.215e-01 0.372 0.7098
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 480.27 on 351 degrees of freedom
## Residual deviance: 472.08 on 343 degrees of freedom
## AIC: 490.08
##
## Number of Fisher Scoring iterations: 4
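A compact side-by-side of the three fits, using the AIC values already shown in the summaries above (the binary model edges out the other two):
# lower AIC is better; all three are close
AIC(m.change, m.norm, m.green)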
testing models
#setting up test df
tmro.green = hourly.change[c('tmro_green')]
test = data.frame(tmro_green=tmro.green[-training,])
test$change=to_binary(predict(m.change,change.test,type='response'),0.5)
test$norm=to_binary(predict(m.norm,norm.test,type='response'),0.5)
test$green=to_binary(predict(m.green,green.test,type='response'),0.5)
head(test)
## tmro_green change norm green
## 1 1 1 1 1
## 2 1 1 1 0
## 3 0 1 1 1
## 4 1 1 1 1
## 5 0 1 1 0
## 6 1 1 1 1
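Before the full confusion matrices, a one-line comparison of raw accuracies while the columns are still numeric (a sketch):
# proportion of test days each model classifies correctly
sapply(test[c('change','norm','green')], function(p) mean(p == test$tmro_green))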
#change data to factors
test = test%>%
mutate(tmro_green = as.factor(tmro_green),
change = as.factor(change),
norm = as.factor(norm),
green = as.factor(green)
)
# % change model
confusionMatrix(data=test$change,reference=test$tmro_green)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5 7
## 1 66 73
##
## Accuracy : 0.5166
## 95% CI : (0.4339, 0.5985)
## No Information Rate : 0.5298
## P-Value [Acc > NIR] : 0.6587
##
## Kappa : -0.0179
##
## Mcnemar's Test P-Value : 1.134e-11
##
## Sensitivity : 0.07042
## Specificity : 0.91250
## Pos Pred Value : 0.41667
## Neg Pred Value : 0.52518
## Prevalence : 0.47020
## Detection Rate : 0.03311
## Detection Prevalence : 0.07947
## Balanced Accuracy : 0.49146
##
## 'Positive' Class : 0
##
# normed model: identical results to the % change model, as expected, since z-scoring is a linear rescaling the glm simply absorbs into its coefficients
confusionMatrix(data=test$norm,reference=test$tmro_green)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5 7
## 1 66 73
##
## Accuracy : 0.5166
## 95% CI : (0.4339, 0.5985)
## No Information Rate : 0.5298
## P-Value [Acc > NIR] : 0.6587
##
## Kappa : -0.0179
##
## Mcnemar's Test P-Value : 1.134e-11
##
## Sensitivity : 0.07042
## Specificity : 0.91250
## Pos Pred Value : 0.41667
## Neg Pred Value : 0.52518
## Prevalence : 0.47020
## Detection Rate : 0.03311
## Detection Prevalence : 0.07947
## Balanced Accuracy : 0.49146
##
## 'Positive' Class : 0
##
#binary model
confusionMatrix(data=test$green,reference=test$tmro_green)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 15
## 1 55 65
##
## Accuracy : 0.5364
## 95% CI : (0.4535, 0.6178)
## No Information Rate : 0.5298
## P-Value [Acc > NIR] : 0.4682
##
## Kappa : 0.0391
##
## Mcnemar's Test P-Value : 3.141e-06
##
## Sensitivity : 0.2254
## Specificity : 0.8125
## Pos Pred Value : 0.5161
## Neg Pred Value : 0.5417
## Prevalence : 0.4702
## Detection Rate : 0.1060
## Detection Prevalence : 0.2053
## Balanced Accuracy : 0.5189
##
## 'Positive' Class : 0
##
roc curve for model 3
green.roc = roc(green.train$tmro_green,predict(m.green))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# the best sensitivity/specificity trade-off sits near a probability threshold of 0.5 (0 on the log-odds scale that predict() returns here)
plot(green.roc, main='ROC for Binary Model')

#auc
auc(green.roc)
## Area under the curve: 0.5908
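pROC can locate the threshold that maximizes sensitivity + specificity directly (a sketch using coords(); note the score scale is log-odds, so a value near 0 corresponds to a probability of about 0.5):
# 'best' picks the threshold maximizing Youden's J by default
coords(green.roc, 'best', ret=c('threshold','sensitivity','specificity'))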
naive bayes model
# since our variables seem to be independent, let's try naive Bayes
cols = dim(hourly.change)[2]
# keep only tmro_green and the hourly change columns (drop Date and Volume)
hourly.nb = hourly.change[3:cols]
nb.train = hourly.nb[training,]
nb.test = hourly.nb[-training,]
m.nb = naiveBayes(tmro_green~.,nb.train)
test$nb = predict(m.nb,nb.test)
test$nb = as.factor(test$nb)
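naiveBayes can also return posterior probabilities rather than hard labels, which would allow threshold tuning like the ROC analysis above (a sketch):
# per-class posterior probabilities for the first few test days
head(predict(m.nb, nb.test, type='raw'))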
# we beat the no-information rate (52.98%) with 56.29% accuracy! (though the p-value below shows the edge is not statistically significant)
confusionMatrix(data=test$nb,reference=test$tmro_green)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 47
## 1 19 33
##
## Accuracy : 0.5629
## 95% CI : (0.4799, 0.6434)
## No Information Rate : 0.5298
## P-Value [Acc > NIR] : 0.231849
##
## Kappa : 0.1417
##
## Mcnemar's Test P-Value : 0.000889
##
## Sensitivity : 0.7324
## Specificity : 0.4125
## Pos Pred Value : 0.5253
## Neg Pred Value : 0.6346
## Prevalence : 0.4702
## Detection Rate : 0.3444
## Detection Prevalence : 0.6556
## Balanced Accuracy : 0.5724
##
## 'Positive' Class : 0
##
test on new data
# (python) download today's hourly bars
today = yf.download(tickers='SPY',period='1d',interval = '1h')
##
[*********************100%***********************] 1 of 1 completed
today.to_csv('today.csv')
# (R) read them back into R
today = read.csv('today.csv')
pipeline = function(dataframe,model){
  # convert timestamps to Date
  df = dataframe%>%
    mutate(X = as.Date(X))
  # create % change field
  df$change = 100*(df$Close-df$Open)/df$Open
  # create hour grouping
  df$hour = list_of_instance(df,'X')
  # pivot to one row per day with h1..h7 change columns
  subset = df[c('X','hour','change')]%>%
    pivot_wider(names_from=hour, values_from=change)
  # return model predictions (note: only the hourly change columns are fed in,
  # which suits m.nb; the glm models would also need Volume and type='response')
  return(predict(model,subset))
}
pipeline(today,m.nb)
## [1] 1
## Levels: 0 1
conclusion
stock market is hard to predict :(
- daily price action is not predictive of the next day's price action
- hourly price action is not predictive of the next day's price action
- Volume is not very predictive of price action
naive bayes is a decent predictor since the hourly variables are largely uncorrelated, which suits its independence assumption
- stocks are going up tomorrow (according to this model; not financial advice)