Getting started

+ the DS workflow
+ on visualization
+ our packages for today + reproducibility

Import our data

 + Get 5 years of price data on 5 ETFs
 + Convert to returns
# The symbols vector holds our tickers. 
symbols <- 
  sort(c("SPY","EFA", "IJS", "EEM","AGG"))

prices_tq <- 
  tq_get(symbols, 
         get = "stock.prices",
         from = "2013-01-01") %>% 
  select(symbol, date, adjusted) %>% 
  group_by(symbol)

returns_tidy_tibble <- 
  prices_tq %>%
  tq_transmute(select     = adjusted, 
               mutate_fun = periodReturn,
               period     = "daily",
               col_rename = "return")

returns_wide_tibble <- 
  returns_tidy_tibble %>% 
  spread(symbol, return)
returns_wide_tibble %>% 
  head()
returns_tidy_tibble %>% 
  head()

We have our daily returns, which we will treat as the dependent variables, the stuff we want to explain.

Let’s go get our independent variables. We are going to use the Fama French 5-factor data set, meaning we will have 5 independent variables to explain the return/risk of our ETF returns.

 + Fama French 5 factors
 + Import zip file, unzip, read in the csv
# Homepage URL: 
# http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html#Research

factors_data_address <- 
"http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Global_5_Factors_Daily_CSV.zip"

factors_csv_name <- "Global_5_Factors_Daily.csv"

temp <- tempfile()

download.file(
  # location of file to be downloaded
  factors_data_address,
  # where we want R to store that file
  temp)

Global_5_Factors <- 
  read_csv(unz(temp, factors_csv_name), skip = 6 ) %>%
  rename(date = X1, MKT = `Mkt-RF`) %>%
  mutate(date = ymd(parse_date_time(date, "%Y%m%d")))%>%
  mutate_if(is.numeric, funs(. / 100)) %>% 
  select(-RF)
head(Global_5_Factors)

Join our data

Before we visualize, we need to join the data and will use their common column, “date”.

 + join by "date"
 + note what happens when dates do not align
 + what if had different names
 + how is wide different from tidy tibble
data_joined_wide <-
  returns_wide_tibble %>% 
  left_join(Global_5_Factors, by = "date") %>% 
  na.omit()

data_joined_tidy <- 
  returns_tidy_tibble %>%
  left_join(Global_5_Factors, by = "date") %>% 
  na.omit()

data_joined_tidy %>% 
  slice(1)
# Let's save our data as an rds file.
# For use in Shiny, and as a good practice. 
# Why useful for Shiny? We can load it faster.
saveRDS(data_joined_tidy, file = "data-joined-tidy.rds")
saveRDS(data_joined_wide, file = "data-joined-wide.rds")

We have mashed together our data, we visualize….

ggplot2

A little background

+ part of the tidyverse and works well with tidy data
+ grammar of graphics
+ most popular data vis package
+ layers and geoms

Let’s start with a line chart of our wide data

data_joined_wide %>% 
  ggplot(aes(x = date, y = EEM)) + 
  # add a geom, see what happens if we don't
  geom_line(color = "cornflowerblue")

# what if want more than one line? 
# add them one at a time because in diff columns
# in general, we can keep adding layers to ggplot, even if they were to overlap

data_joined_wide %>% 
  ggplot(aes(x = date)) + 
  geom_line(aes(y = EEM), color = "cornflowerblue") +
  geom_line(aes(y = SPY), color = "pink")

Now with our tidy data, where we can efficiently plot all of our data.

data_joined_tidy %>% 
  # If want just one of our funds, use filter()
  # filter(symbol == "EEM") %>% 
  ggplot(aes(x = date, 
             y = return,
             color = symbol)) + 
  geom_line() +
  facet_wrap(~symbol)

Same code flows for scatter plots.

Wide data:

data_joined_wide  %>% 
  ggplot(aes(x = SMB, 
             y = EEM)) + 
  geom_point(color = "cornflowerblue")

# How to add more? Add them one at a time
# data_joined_wide %>% 
#   ggplot(aes(x = date)) + 
#   geom_point(aes(y = EEM), color = "cornflowerblue") +
#   geom_point(aes(y = SPY), color = "pink")

Tidy data:

data_joined_tidy %>% 
  # If want just one of our funds, use filter()
  # filter(symbol == "EEM") %>% 
  ggplot(aes(x = SMB, 
             y = return,
             color = symbol)) + 
  geom_point() +
  facet_wrap(~symbol)

Let’s add a regression line:

Wide data:

data_joined_wide %>% 
  ggplot(aes(x = SMB, y = EEM)) +
  geom_point(color = "pink") +
  geom_smooth(method = "lm", #try loess
              formula = y ~ x,
              se = TRUE,
              color = "cornflowerblue")

Tidy data:

data_joined_tidy %>% 
  # If want just one of our funds, use filter()
  # filter(symbol == "EEM") %>% 
  ggplot(aes(x = SMB, 
             y = return,
             color = symbol)) + 
  geom_point(shape = 17, # try 2, 3, 4...20
             alpha = .5) +
  geom_smooth(method = "lm", #try loess
              formula = y ~ x,
              se = TRUE,
              color = "cornflowerblue") +
  facet_wrap(~symbol)

Slightly different for a histogram and density plot, because we supply only an x-axis value:

Wide data:

data_joined_wide %>% 
  ggplot(aes(x = EEM)) +
  geom_histogram(color = "blue", fill = "pink", bins = 60)# +

  # geom_density()

Tidy data:

data_joined_tidy %>% 
  ggplot(aes(x = return, color = symbol, fill = symbol)) +
  geom_histogram(bins = 60) +
  facet_wrap(~symbol)  #+

  # labs(title = "Fund Returns Histogram",
  #      subtitle = "2013 - 2018",
  #      y = "fund freq",
  #      x = "",
  #      caption = "source: fama french website") +
  # theme(plot.title = element_text(hjust = 0.5, colour = "cornflowerblue"),
  #       plot.subtitle = element_text(hjust = 0.5, color = "cornflowerblue"),
  #       plot.caption = element_text(hjust = 0),
  #       strip.text.x = element_text(size = 8, colour = "white"),
  #       strip.background = element_rect(colour = "white", fill = "cornflowerblue"),
  #       axis.text.x = element_text(colour = "cornflowerblue"),
  #       axis.text = element_text(colour = "cornflowerblue"),
  #       axis.ticks.x = element_line(colour = "cornflowerblue"),
  #       axis.text.y = element_text(colour = "cornflowerblue"),
  #       axis.ticks.y = element_line(colour = "cornflowerblue"),
  #       axis.title = element_text(colour = "cornflowerblue"),
  #       legend.title = element_text(colour = "cornflowerblue"),
  #       legend.text = element_text(colour = "cornflowerblue"))

How about data summarised? Here tidy data really comes in handy:

  data_joined_tidy %>% 
  # calculate the mean return of each fund
  summarise(mean_return = mean(return)) %>%
  ggplot(aes(x = symbol, 
             y = mean_return, 
             fill = symbol, 
             color = symbol, 
             # add labels based on our original symbols vector
             label = symbols)) +
  # visualize as a column
  # geom_col(width = .4) +
  # visalize with a point
  geom_point(size = 3) +
  # Add text to the points
  geom_text(nudge_y = 0.00003,
            family = "Times New Roman")

plotly

Wrap our ggplot objects in ggplotly()

ggplotly(
  data_joined_wide %>% 
  ggplot(aes(x = date, y = EEM)) + 
  # add a geom, see what happens if we don't
  geom_line(color = "cornflowerblue")
)
ggplotly(
  data_joined_tidy %>% 
  # If want just one of our funds, use filter()
  # filter(symbol == "EEM") %>% 
  ggplot(aes(x = SMB, 
             y = return,
             color = symbol)) + 
  geom_point() +
  facet_wrap(~symbol)
)
ggplotly(
  data_joined_tidy %>% 
  ggplot(aes(x = return, color = symbol, fill = symbol)) +
  geom_histogram(bins = 60) +
  facet_wrap(~symbol) +
    labs(title = "Fund Returns Histogram",
       #subtitle = "2013 - 2018",
       y = "returns freq", 
       x = "",
       caption = "source: fama french website") +
  theme(plot.title = element_text(hjust = 0.5, colour = "purple"),
        plot.subtitle = element_text(hjust = 0.5, color = "green"),
        plot.caption = element_text(hjust=0), 
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.title.y = element_text(colour = "blue"),
        axis.text = element_text(colour = "green"))
)

Highcharter

Similar logic to ggplot - a good reason to learn it is other packages use it’s logic.

data_joined_tidy %>%
  filter(symbol == "EEM") %>% 
  hchart(., hcaes(x = SMB, 
                  y = return),
         type = "scatter",
         color = "cornflowerblue")
data_joined_tidy %>%
  filter(symbol == "IJS") %>% 
  hchart(., hcaes(x = SMB, 
                  y = return,
                  # add the date
                  date = date),
         type = "scatter",
         color = "cornflowerblue", 
         name = "our name") %>% 
  hc_title(text = "IJS v. SMB") %>% 
  # display it in the tooltip
  # could be other data besides date! 
  hc_tooltip(pointFormat = '{point.date} <br> 
                            SMB: {point.x: .4f}% <br> 
                            IJS: {point.y: .4f}%') %>% 
  hc_yAxis(title =  list(text="IJS returns"),
           labels = list(format = "{value}%")) %>% 
  hc_xAxis(labels = list(format = "{value}%"))
data_joined_tidy %>%
  hchart(., hcaes(x = SMB, 
                  y = return,
                  group = symbol,
                  date = date),
         type = "scatter") %>% 
  hc_title(text = "Funds v. SMB") %>% 
  # display it in the tooltip
  # could be other data besides date! 
  hc_tooltip(pointFormat = '{point.date} <br> 
                            factor: {point.x: .4f}% <br> 
                            fund: {point.y: .4f}%') %>% 
  hc_yAxis(title =  list(text="Fund returns"),
           labels = list(format = "{value}%")) %>% 
  hc_xAxis(labels = list(format = "{value}%"))
data_joined_tidy %>% 
  summarise(mean_return = mean(return)) %>% 
  hchart(., hcaes(x = symbol, 
                  y = mean_return, 
                  group = symbol),
         type = "column") %>% 
  hc_xAxis(categories = .$symbol) %>% 
  hc_tooltip(pointFormat = '{point.group} <br> 
                          {point.y: .4f}%') %>% 
  hc_yAxis(title =  list(text="mean returns"),
           labels = list(format = "{value}%"))

Modeling interlude

One model, one fund, one factor. We will use this for our Shiny example but better to preprocess.

data_joined_tidy %>% 
  filter(symbol == "SPY") %>% 
  do(model = lm(return ~ MKT, data = .)) %>% 
  glance(model)
  # augment(model)
  # tidy(model)
data_joined_tidy %>% 
  filter(symbol == "SPY") %>% 
  do(model = lm(return ~ MKT, data = .)) %>% 
  augment(model) %>% 
  ggplot(aes(y = .resid, x = .fitted)) +
  geom_point(color = "purple")

data_joined_tidy %>% 
  filter(symbol == "SPY") %>% 
  do(model = lm(return ~ MKT, data = .)) %>% 
  augment(model) %>% 
  hchart(., hcaes(y = .resid, x = .fitted), color = "pink", type = "scatter")

Explore many models on all our funds. What if we had 50? This would be important if we wished to pre run these models, every day and store the results. Then have Shiny pull those results.

model_one <- function(df) {
  lm(return ~ MKT, data = df)
}

model_two <- function(df) {
  lm(return ~ MKT + SMB + HML, data = df)
}

model_three <- function(df) {
  lm(return ~ MKT + SMB + HML + RMW + CMA, data = df)
}

data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest()
data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest() %>%
  # tidy, glance and augment help clean up
  # model results. broom package
  mutate(one_factor_model = map(data, model_one),
         tidied_one = map(one_factor_model, tidy),
         glanced_one = map(one_factor_model, glance),
         augmented_one = map(one_factor_model, augment)) #%>% 
  # unnest(tidied_one) 
  # unnest(glanced_one)
  # unnest(augmented_one)

Visualize model results

data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest() %>%
  # tidy, glance and augment help clean up
  # model results. broom package
  mutate(three_factor_model = map(data, model_three),
         tidied_one = map(three_factor_model, tidy, conf.int = T, conf.level = .95),
         glanced_one = map(three_factor_model, glance),
         augmented_one = map(three_factor_model, augment)) %>% 
  unnest(tidied_one) %>%
  mutate_if(is.numeric, funs(round(., 3))) %>%
  filter(term != "(Intercept)" &
           symbol == "SPY") %>% 
  ggplot(aes(x = term, y = estimate, color = term, shape = term)) + 
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
  labs(title = "Model results",
       subtitle = "",
       x = "",
       y = "coefficient",
       caption = "data source: Fama French website") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        plot.caption  = element_text(hjust = 0))

ggplotly(
data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest() %>%
  # tidy, glance and augment help clean up
  # model results. broom package
  mutate(three_factor_model = map(data, model_three),
         tidied_one = map(three_factor_model, tidy, conf.int = T, conf.level = .95),
         glanced_one = map(three_factor_model, glance),
         augmented_one = map(three_factor_model, augment)) %>% 
  unnest(tidied_one) %>%
  mutate_if(is.numeric, funs(round(., 3))) %>%
  filter(term != "(Intercept)" &
           symbol == "SPY") %>% 
  ggplot(aes(x = term, y = estimate, color = term, shape = term)) + 
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
  labs(title = "Model results",
       subtitle = "",
       x = "",
       y = "coefficient",
       caption = "data source: Fama French website") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        plot.caption  = element_text(hjust = 0))
)
data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest() %>%
  # tidy, glance and augment help clean up
  # model results. broom package
  mutate(one_factor_model = map(data, model_one),
         tidied_one = map(one_factor_model, tidy),
         glanced_one = map(one_factor_model, glance),
         augmented_one = map(one_factor_model, augment)) %>% 
  unnest(augmented_one) %>% 
  ggplot(aes(x = return, y = .fitted, color = symbol)) +
  geom_point() +
  facet_wrap(~symbol)

ggplotly(
data_joined_tidy %>% 
  group_by(symbol) %>% 
  nest() %>%
  # tidy, glance and augment help clean up
  # model results. broom package
  mutate(one_factor_model = map(data, model_one),
         tidied_one = map(one_factor_model, tidy),
         glanced_one = map(one_factor_model, glance),
         augmented_one = map(one_factor_model, augment)) %>% 
  unnest(augmented_one) %>% 
  ggplot(aes(x = return, y = .fitted, color = symbol)) +
  geom_point() +
  facet_wrap(~symbol)
)

We can use the fitted values in highcharter too.

augmented_results_spy <- 
  data_joined_tidy %>% 
  filter(symbol == "SPY") %>% 
  nest(-symbol) %>% 
  mutate(one_factor_model = map(data, model_one),
         augmented_one = map(one_factor_model, augment)) %>% 
  unnest(augmented_one)


  hchart(augmented_results_spy, 
         hcaes(x = MKT, 
               y = return), 
         type = "scatter") %>% 
  hc_add_series(augmented_results_spy, 
                type = "line", 
                hcaes(x = MKT, y = .fitted), 
                name = "Regression Line")