Â
library(readr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(tibble)
library(dplyr)
# https://r4ds.had.co.nz/dates-and-times.html
# Schwab
SCHW <- read_csv("data/SCHW.csv")
ggplot(data=SCHW, mapping=aes(x=Date, y=Close))+
geom_point() +
geom_smooth()
#============================
# Data Prep
#
data.prep <- SCHW[,c(1, 5, 6, 7)]
data.prep %>% arrange(ymd(data.prep$Date))
# A tibble: 1,259 x 4
Date Close `Adj Close` Volume
<date> <dbl> <dbl> <dbl>
1 2017-03-15 42.7 40.1 8169800
2 2017-03-16 43.5 40.8 7391600
3 2017-03-17 42.9 40.2 8913900
4 2017-03-20 42.2 39.6 6191400
5 2017-03-21 40.4 37.9 10427900
6 2017-03-22 40.2 37.7 8935000
7 2017-03-23 40.2 37.7 7387800
8 2017-03-24 40.0 37.5 7528300
9 2017-03-27 39.4 37.0 13739500
10 2017-03-28 40.5 38.0 8934400
# ... with 1,249 more rows
data.prep$Trend <- 1:nrow(data.prep)
data.prep$Day <- mday(data.prep$Date)
data.prep$Month <- month(data.prep$Date)
data.prep <- data.prep %>% add_column(jan=0,
feb=0,
mar=0,
apr=0,
may=0,
jun=0,
jul=0,
aug=0,
sep=0,
oct=0,
nov=0,
dec=0)
head(data.prep)
# A tibble: 6 x 19
Date Close `Adj Close` Volume Trend Day Month jan feb mar apr
<date> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017-03-15 42.7 40.1 8.17e6 1 15 3 0 0 0 0
2 2017-03-16 43.5 40.8 7.39e6 2 16 3 0 0 0 0
3 2017-03-17 42.9 40.2 8.91e6 3 17 3 0 0 0 0
4 2017-03-20 42.2 39.6 6.19e6 4 20 3 0 0 0 0
5 2017-03-21 40.4 37.9 1.04e7 5 21 3 0 0 0 0
6 2017-03-22 40.2 37.7 8.94e6 6 22 3 0 0 0 0
# ... with 8 more variables: may <dbl>, jun <dbl>, jul <dbl>, aug <dbl>,
# sep <dbl>, oct <dbl>, nov <dbl>, dec <dbl>
for(i in 1:nrow(data.prep)){
month.count = 1
month = as.numeric(data.prep[i,7])
for(r in 8:19){
if(month == month.count){
data.prep[i,r] = 1
}
month.count = month.count + 1
}
}
data.filter <- data.prep[,c(5,6,8:19,3,4)] # dropping unneccesary variables and re-ordering
names(data.filter)[15] <- "Close" # renaming
head(data.filter)
# A tibble: 6 x 16
Trend Day jan feb mar apr may jun jul aug sep oct nov
<int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 15 0 0 1 0 0 0 0 0 0 0 0
2 2 16 0 0 1 0 0 0 0 0 0 0 0
3 3 17 0 0 1 0 0 0 0 0 0 0 0
4 4 20 0 0 1 0 0 0 0 0 0 0 0
5 5 21 0 0 1 0 0 0 0 0 0 0 0
6 6 22 0 0 1 0 0 0 0 0 0 0 0
# ... with 3 more variables: dec <dbl>, Close <dbl>, Volume <dbl>
###
# 80/20 split
e <- round(.8*nrow(data.filter)) # oldest eighty
data.train <- data.filter[1:e,]
data.test <- data.filter[(e+1):nrow(data.filter), c(2:14)]
test.actuals <- data.filter[(e+1):nrow(data.filter), c(1,15,16)]
#============================
# Model 1
#
m1 <- lm(Close~ Day + jan + feb + mar + apr + may + jun +
jul + aug + sep + oct + nov + dec, data=data.train)
summary(m1)
Call:
lm(formula = Close ~ Day + jan + feb + mar + apr + may + jun +
jul + aug + sep + oct + nov + dec, data = data.train)
Residuals:
Min 1Q Median 3Q Max
-16.6670 -4.2423 -0.8245 2.8935 22.6797
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 47.05103 0.76255 61.702 < 2e-16 ***
Day -0.02572 0.02217 -1.160 0.2463
jan 2.36330 0.95995 2.462 0.0140 *
feb 2.33786 0.97884 2.388 0.0171 *
mar -2.40272 0.94326 -2.547 0.0110 *
apr -5.39618 0.95979 -5.622 2.45e-08 ***
may -4.47244 0.94855 -4.715 2.76e-06 ***
jun -4.88704 0.95126 -5.137 3.35e-07 ***
jul -5.97836 0.95148 -6.283 4.95e-10 ***
aug -7.20749 0.94076 -7.661 4.36e-14 ***
sep -6.29789 0.96577 -6.521 1.11e-10 ***
oct -5.94084 0.93821 -6.332 3.66e-10 ***
nov -2.68790 0.95981 -2.800 0.0052 **
dec NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.145 on 994 degrees of freedom
Multiple R-squared: 0.212, Adjusted R-squared: 0.2024
F-statistic: 22.28 on 12 and 994 DF, p-value: < 2.2e-16
pred1 <- predict(m1, data.test)
pred1.comp <- data.frame(test.actuals[,c(1,2)], pred1)
##==== Function 1 ====
#
# Preps dataset with predicted and actual values allowing easier plots.
#
plot.prep <- function(t,a,p){
trend=as.integer(t)
actual=as.integer(a)
pred=as.integer(p)
prep = as.data.frame(cbind(trend,actual,pred))
data = prep %>%
select("trend", "actual", "pred") %>%
pivot_longer(2:3, names_to="variable", values_to="value")
return(data)
}
## ========
pred1.plot <- plot.prep(pred1.comp$Trend, pred1.comp$Close, pred1.comp$pred1)
ggplot(data=pred1.plot, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
#============================
# Evaluation
#
# Training ===
train.pred <- predict(m1, data.train)
train.comp <- data.frame(data.train[,c(1,15)], train.pred)
train.plot <- plot.prep(train.comp$Trend, train.comp$Close, train.comp$train.pred)
ggplot(data=train.plot, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
# All ===
all.pred <- rbind(train.plot, pred1.plot)
ggplot(data=all.pred, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
#============================
# Model 2 - Auto-regressive model
#
data2 <- data.filter[c(1:15)]
data2 <- data2 %>% add_column(t_1=0, diff=0)
data2[1,16] = data2[1,15]
for(i in 2:nrow(data2)){
num = i-1
data2[i,16] = data2[num,15]
data2[i,17] = data2[i,16] - data2[i,15]
}
###
# 80/20 split
e <- round(.8*nrow(data2)) # oldest eighty
data2.train <- data2[1:e,]
#data2.test <- data2[(e+1):nrow(data2), c(2:14, 16, 17)]
data2.test <- data2[(e+1):nrow(data2), c(1:14, 16)]
test2.actuals <- data2[(e+1):nrow(data2), c(1,15,16)]
#============================
# Model 2
#
#m2 <- lm(Close~ Day + jan + feb + mar + apr + may + jun +
# jul + aug + sep + oct + nov + dec + t_1 + diff, data=data2.train)
m2 <- lm(Close~ Day + jan + feb + mar + apr + may + jun +
jul + aug + sep + oct + nov + dec + t_1, data=data2.train)
summary(m2)
Call:
lm(formula = Close ~ Day + jan + feb + mar + apr + may + jun +
jul + aug + sep + oct + nov + dec + t_1, data = data2.train)
Residuals:
Min 1Q Median 3Q Max
-4.0366 -0.4527 0.0156 0.4456 5.2798
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.361603 0.245549 1.473 0.1412
Day -0.005498 0.003242 -1.696 0.0902 .
jan 0.039282 0.140730 0.279 0.7802
feb 0.052076 0.143468 0.363 0.7167
mar -0.123237 0.138281 -0.891 0.3730
apr 0.049778 0.142586 0.349 0.7271
may -0.108312 0.140142 -0.773 0.4398
jun -0.091871 0.140841 -0.652 0.5144
jul -0.021356 0.141844 -0.151 0.8804
aug -0.110872 0.141467 -0.784 0.4334
sep 0.024313 0.144234 0.169 0.8662
oct -0.029263 0.139899 -0.209 0.8344
nov 0.192977 0.140935 1.369 0.1712
dec NA NA NA NA
t_1 0.994523 0.004661 213.392 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.8982 on 993 degrees of freedom
Multiple R-squared: 0.9832, Adjusted R-squared: 0.983
F-statistic: 4465 on 13 and 993 DF, p-value: < 2.2e-16
pred2 <- predict(m2, data2.test)
pred2.comp <- data.frame(test2.actuals[,c(1,2)], pred2)
pred2.plot <- plot.prep(pred2.comp$Trend, pred2.comp$Close, pred2.comp$pred2)
ggplot(data=pred2.plot, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
#============================
# Evaluation -- Model 2
#
# Training ===
train.pred2 <- predict(m2, data2.train)
train.comp2 <- data.frame(data2.train[,c(1,15)], train.pred2)
train2.plot <- plot.prep(train.comp2$Trend, train.comp2$Close, train.comp2$train.pred)
ggplot(data=train2.plot, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
# All ===
all.pred2 <- rbind(train2.plot, pred2.plot)
ggplot(data=all.pred2, mapping=aes(x=trend, y=value, color=variable))+
geom_point() +
geom_smooth()
#============================
# Model 3 - Next Month Predictions
#
m3 <- lm(Close~ jan + feb + mar + apr + may + jun +
jul + aug + sep + oct + nov + dec + t_1, data=data2.train)
data3 <- data2
same.dates.raw <- data3[503:533,]
same.dates <- same.dates.raw[,2:14]
today <- data3[nrow(data3),1:16]
days <- 1:31
days <- 1259:1289
next.month <- cbind(days, same.dates)
next.month$Pred <- 0
next.month$t_1 <- 0
next.month[1,16] <- today[1,15]
future.pred <- function(day){
tomorrow.pred <- m3 %>% predict(day)
return(tomorrow.pred)
}
for(i in 1:30){
upd = i + 1
next.month[i,15] = future.pred(today)
next.month[upd,16] = next.month[i,15]
today <- next.month[i,]
}
ggplot(data=next.month, mapping=aes(x=days, y=Pred))+
geom_point() +
geom_smooth()