1. Libraries

library(readr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(tibble)
library(dplyr)
# https://r4ds.had.co.nz/dates-and-times.html

2. Reading Data

# Schwab
SCHW <- read_csv("data/SCHW.csv")

ggplot(data=SCHW, mapping=aes(x=Date, y=Close))+
  geom_point() + 
  geom_smooth()

3. Data Prep

#============================
# Data Prep
#

data.prep <- SCHW[,c(1, 5, 6, 7)]
data.prep %>% arrange(ymd(data.prep$Date))
# A tibble: 1,259 x 4
   Date       Close `Adj Close`   Volume
   <date>     <dbl>       <dbl>    <dbl>
 1 2017-03-15  42.7        40.1  8169800
 2 2017-03-16  43.5        40.8  7391600
 3 2017-03-17  42.9        40.2  8913900
 4 2017-03-20  42.2        39.6  6191400
 5 2017-03-21  40.4        37.9 10427900
 6 2017-03-22  40.2        37.7  8935000
 7 2017-03-23  40.2        37.7  7387800
 8 2017-03-24  40.0        37.5  7528300
 9 2017-03-27  39.4        37.0 13739500
10 2017-03-28  40.5        38.0  8934400
# ... with 1,249 more rows
data.prep$Trend <- 1:nrow(data.prep)
data.prep$Day <- mday(data.prep$Date)
data.prep$Month <- month(data.prep$Date)


data.prep <- data.prep %>% add_column(jan=0,
                                       feb=0,
                                       mar=0,
                                       apr=0,
                                       may=0,
                                       jun=0,
                                       jul=0,
                                       aug=0,
                                       sep=0,
                                       oct=0,
                                       nov=0,
                                       dec=0)

head(data.prep)
# A tibble: 6 x 19
  Date       Close `Adj Close`  Volume Trend   Day Month   jan   feb   mar   apr
  <date>     <dbl>       <dbl>   <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017-03-15  42.7        40.1  8.17e6     1    15     3     0     0     0     0
2 2017-03-16  43.5        40.8  7.39e6     2    16     3     0     0     0     0
3 2017-03-17  42.9        40.2  8.91e6     3    17     3     0     0     0     0
4 2017-03-20  42.2        39.6  6.19e6     4    20     3     0     0     0     0
5 2017-03-21  40.4        37.9  1.04e7     5    21     3     0     0     0     0
6 2017-03-22  40.2        37.7  8.94e6     6    22     3     0     0     0     0
# ... with 8 more variables: may <dbl>, jun <dbl>, jul <dbl>, aug <dbl>,
#   sep <dbl>, oct <dbl>, nov <dbl>, dec <dbl>
for(i in 1:nrow(data.prep)){
  month.count = 1
  month = as.numeric(data.prep[i,7])
  
  for(r in 8:19){
    if(month == month.count){
      data.prep[i,r] = 1
    }
    month.count = month.count + 1
  }
}


data.filter <- data.prep[,c(5,6,8:19,3,4)] # dropping unneccesary variables and re-ordering
names(data.filter)[15] <- "Close" # renaming

head(data.filter)
# A tibble: 6 x 16
  Trend   Day   jan   feb   mar   apr   may   jun   jul   aug   sep   oct   nov
  <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1    15     0     0     1     0     0     0     0     0     0     0     0
2     2    16     0     0     1     0     0     0     0     0     0     0     0
3     3    17     0     0     1     0     0     0     0     0     0     0     0
4     4    20     0     0     1     0     0     0     0     0     0     0     0
5     5    21     0     0     1     0     0     0     0     0     0     0     0
6     6    22     0     0     1     0     0     0     0     0     0     0     0
# ... with 3 more variables: dec <dbl>, Close <dbl>, Volume <dbl>
###
# 80/20 split
e <- round(.8*nrow(data.filter)) # oldest eighty

data.train <- data.filter[1:e,]
data.test <- data.filter[(e+1):nrow(data.filter), c(2:14)]
test.actuals <- data.filter[(e+1):nrow(data.filter), c(1,15,16)]

4. Model 1

#============================
# Model 1
#

m1 <- lm(Close~ Day + jan + feb + mar + apr + may + jun + 
           jul + aug + sep + oct + nov + dec, data=data.train)

summary(m1)

Call:
lm(formula = Close ~ Day + jan + feb + mar + apr + may + jun + 
    jul + aug + sep + oct + nov + dec, data = data.train)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.6670  -4.2423  -0.8245   2.8935  22.6797 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 47.05103    0.76255  61.702  < 2e-16 ***
Day         -0.02572    0.02217  -1.160   0.2463    
jan          2.36330    0.95995   2.462   0.0140 *  
feb          2.33786    0.97884   2.388   0.0171 *  
mar         -2.40272    0.94326  -2.547   0.0110 *  
apr         -5.39618    0.95979  -5.622 2.45e-08 ***
may         -4.47244    0.94855  -4.715 2.76e-06 ***
jun         -4.88704    0.95126  -5.137 3.35e-07 ***
jul         -5.97836    0.95148  -6.283 4.95e-10 ***
aug         -7.20749    0.94076  -7.661 4.36e-14 ***
sep         -6.29789    0.96577  -6.521 1.11e-10 ***
oct         -5.94084    0.93821  -6.332 3.66e-10 ***
nov         -2.68790    0.95981  -2.800   0.0052 ** 
dec               NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.145 on 994 degrees of freedom
Multiple R-squared:  0.212, Adjusted R-squared:  0.2024 
F-statistic: 22.28 on 12 and 994 DF,  p-value: < 2.2e-16
pred1 <- predict(m1, data.test)
pred1.comp <- data.frame(test.actuals[,c(1,2)], pred1)


##==== Function 1 ====
#
# Preps dataset with predicted and actual values allowing easier plots.
#
plot.prep <- function(t,a,p){
  trend=as.integer(t)
  actual=as.integer(a)
  pred=as.integer(p)
  
  prep = as.data.frame(cbind(trend,actual,pred))
  
  data = prep %>% 
    select("trend", "actual", "pred") %>%
    pivot_longer(2:3, names_to="variable", values_to="value")
  
  return(data)
}
## ========

pred1.plot <- plot.prep(pred1.comp$Trend, pred1.comp$Close, pred1.comp$pred1)


ggplot(data=pred1.plot, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

#============================
# Evaluation
#

# Training ===
train.pred <- predict(m1, data.train)
train.comp <- data.frame(data.train[,c(1,15)], train.pred)

train.plot <- plot.prep(train.comp$Trend, train.comp$Close, train.comp$train.pred)

ggplot(data=train.plot, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

# All ===
all.pred <- rbind(train.plot, pred1.plot)

ggplot(data=all.pred, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

5. Model 2

#============================
# Model 2 - Auto-regressive model
#

data2 <- data.filter[c(1:15)]
data2 <- data2 %>% add_column(t_1=0, diff=0)


data2[1,16] = data2[1,15]
for(i in 2:nrow(data2)){
  num = i-1
  data2[i,16] = data2[num,15]
  data2[i,17] = data2[i,16] - data2[i,15]
}

###
# 80/20 split
e <- round(.8*nrow(data2)) # oldest eighty

data2.train <- data2[1:e,]
#data2.test <- data2[(e+1):nrow(data2), c(2:14, 16, 17)]
data2.test <- data2[(e+1):nrow(data2), c(1:14, 16)]

test2.actuals <- data2[(e+1):nrow(data2), c(1,15,16)]


#============================
# Model 2
#

#m2 <- lm(Close~ Day + jan + feb + mar + apr + may + jun + 
#           jul + aug + sep + oct + nov + dec + t_1 + diff, data=data2.train)

m2 <- lm(Close~ Day + jan + feb + mar + apr + may + jun + 
           jul + aug + sep + oct + nov + dec + t_1, data=data2.train)

summary(m2)

Call:
lm(formula = Close ~ Day + jan + feb + mar + apr + may + jun + 
    jul + aug + sep + oct + nov + dec + t_1, data = data2.train)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.0366 -0.4527  0.0156  0.4456  5.2798 

Coefficients: (1 not defined because of singularities)
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.361603   0.245549   1.473   0.1412    
Day         -0.005498   0.003242  -1.696   0.0902 .  
jan          0.039282   0.140730   0.279   0.7802    
feb          0.052076   0.143468   0.363   0.7167    
mar         -0.123237   0.138281  -0.891   0.3730    
apr          0.049778   0.142586   0.349   0.7271    
may         -0.108312   0.140142  -0.773   0.4398    
jun         -0.091871   0.140841  -0.652   0.5144    
jul         -0.021356   0.141844  -0.151   0.8804    
aug         -0.110872   0.141467  -0.784   0.4334    
sep          0.024313   0.144234   0.169   0.8662    
oct         -0.029263   0.139899  -0.209   0.8344    
nov          0.192977   0.140935   1.369   0.1712    
dec                NA         NA      NA       NA    
t_1          0.994523   0.004661 213.392   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.8982 on 993 degrees of freedom
Multiple R-squared:  0.9832,    Adjusted R-squared:  0.983 
F-statistic:  4465 on 13 and 993 DF,  p-value: < 2.2e-16
pred2 <- predict(m2, data2.test)
pred2.comp <- data.frame(test2.actuals[,c(1,2)], pred2)

pred2.plot <- plot.prep(pred2.comp$Trend, pred2.comp$Close, pred2.comp$pred2)

ggplot(data=pred2.plot, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

#============================
# Evaluation -- Model 2
#


# Training ===
train.pred2 <- predict(m2, data2.train)
train.comp2 <- data.frame(data2.train[,c(1,15)], train.pred2)

train2.plot <- plot.prep(train.comp2$Trend, train.comp2$Close, train.comp2$train.pred)

ggplot(data=train2.plot, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

# All ===
all.pred2 <- rbind(train2.plot, pred2.plot)

ggplot(data=all.pred2, mapping=aes(x=trend, y=value, color=variable))+
  geom_point() + 
  geom_smooth()

6. Model 3

#============================
# Model 3 - Next Month Predictions
#


m3 <- lm(Close~ jan + feb + mar + apr + may + jun + 
           jul + aug + sep + oct + nov + dec + t_1, data=data2.train)


data3 <- data2
same.dates.raw <- data3[503:533,]
same.dates <- same.dates.raw[,2:14]


today <- data3[nrow(data3),1:16]


days <- 1:31
days <- 1259:1289
next.month <- cbind(days, same.dates)
next.month$Pred <- 0
next.month$t_1 <- 0
next.month[1,16] <- today[1,15]


future.pred <- function(day){
  tomorrow.pred <- m3 %>% predict(day)
  
  return(tomorrow.pred)
}

for(i in 1:30){
  upd = i + 1
  next.month[i,15] = future.pred(today)
  next.month[upd,16] = next.month[i,15]
  
  
  today <- next.month[i,]
}

ggplot(data=next.month, mapping=aes(x=days, y=Pred))+
  geom_point() + 
  geom_smooth()