+ the DS workflow
+ on visualization
+ our packages for today + reproducibility
+ Get 5 years of price data on 5 ETFs
+ Convert to returns
# The symbols vector holds our tickers.
symbols <-
sort(c("SPY","EFA", "IJS", "EEM","AGG"))
prices_tq <-
tq_get(symbols,
get = "stock.prices",
from = "2013-01-01") %>%
select(symbol, date, adjusted) %>%
group_by(symbol)
returns_tidy_tibble <-
prices_tq %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "daily",
col_rename = "return")
returns_wide_tibble <-
returns_tidy_tibble %>%
spread(symbol, return)
returns_wide_tibble %>%
head()
returns_tidy_tibble %>%
head()
We have our daily returns, which we will treat as the dependent variables, the stuff we want to explain.
Let’s go get our independent variables. We are going to use the Fama French 5-factor data set, meaning we will have 5 independent variables to explain the return/risk of our ETF returns.
+ Fama French 5 factors
+ Import zip file, unzip, read in the csv
# Homepage URL:
# http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html#Research
factors_data_address <-
"http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Global_5_Factors_Daily_CSV.zip"
factors_csv_name <- "Global_5_Factors_Daily.csv"
temp <- tempfile()
download.file(
# location of file to be downloaded
factors_data_address,
# where we want R to store that file
temp)
Global_5_Factors <-
read_csv(unz(temp, factors_csv_name), skip = 6 ) %>%
rename(date = X1, MKT = `Mkt-RF`) %>%
mutate(date = ymd(parse_date_time(date, "%Y%m%d")))%>%
mutate_if(is.numeric, funs(. / 100)) %>%
select(-RF)
head(Global_5_Factors)
Before we visualize, we need to join the data and will use their common column, “date”.
+ join by "date"
+ note what happens when dates do not align
+ what if had different names
+ how is wide different from tidy tibble
data_joined_wide <-
returns_wide_tibble %>%
left_join(Global_5_Factors, by = "date") %>%
na.omit()
data_joined_tidy <-
returns_tidy_tibble %>%
left_join(Global_5_Factors, by = "date") %>%
na.omit()
data_joined_tidy %>%
slice(1)
# Let's save our data as an rds file.
# For use in Shiny, and as a good practice.
# Why useful for Shiny? We can load it faster.
saveRDS(data_joined_tidy, file = "data-joined-tidy.rds")
saveRDS(data_joined_wide, file = "data-joined-wide.rds")
We have mashed together our data, we visualize….
A little background
+ part of the tidyverse and works well with tidy data
+ grammar of graphics
+ most popular data vis package
+ layers and geoms
Let’s start with a line chart of our wide data
data_joined_wide %>%
ggplot(aes(x = date, y = EEM)) +
# add a geom, see what happens if we don't
geom_line(color = "cornflowerblue")
# what if want more than one line?
# add them one at a time because in diff columns
# in general, we can keep adding layers to ggplot, even if they were to overlap
data_joined_wide %>%
ggplot(aes(x = date)) +
geom_line(aes(y = EEM), color = "cornflowerblue") +
geom_line(aes(y = SPY), color = "pink")
Now with our tidy data, where we can efficiently plot all of our data.
data_joined_tidy %>%
# If want just one of our funds, use filter()
# filter(symbol == "EEM") %>%
ggplot(aes(x = date,
y = return,
color = symbol)) +
geom_line() +
facet_wrap(~symbol)
Same code flows for scatter plots.
Wide data:
data_joined_wide %>%
ggplot(aes(x = SMB,
y = EEM)) +
geom_point(color = "cornflowerblue")
# How to add more? Add them one at a time
# data_joined_wide %>%
# ggplot(aes(x = date)) +
# geom_point(aes(y = EEM), color = "cornflowerblue") +
# geom_point(aes(y = SPY), color = "pink")
Tidy data:
data_joined_tidy %>%
# If want just one of our funds, use filter()
# filter(symbol == "EEM") %>%
ggplot(aes(x = SMB,
y = return,
color = symbol)) +
geom_point() +
facet_wrap(~symbol)
Let’s add a regression line:
Wide data:
data_joined_wide %>%
ggplot(aes(x = SMB, y = EEM)) +
geom_point(color = "pink") +
geom_smooth(method = "lm", #try loess
formula = y ~ x,
se = TRUE,
color = "cornflowerblue")
Tidy data:
data_joined_tidy %>%
# If want just one of our funds, use filter()
# filter(symbol == "EEM") %>%
ggplot(aes(x = SMB,
y = return,
color = symbol)) +
geom_point(shape = 17, # try 2, 3, 4...20
alpha = .5) +
geom_smooth(method = "lm", #try loess
formula = y ~ x,
se = TRUE,
color = "cornflowerblue") +
facet_wrap(~symbol)
Slightly different for a histogram and density plot, because we supply only an x-axis value:
Wide data:
data_joined_wide %>%
ggplot(aes(x = EEM)) +
geom_histogram(color = "blue", fill = "pink", bins = 60)# +
# geom_density()
Tidy data:
data_joined_tidy %>%
ggplot(aes(x = return, color = symbol, fill = symbol)) +
geom_histogram(bins = 60) +
facet_wrap(~symbol) #+
# labs(title = "Fund Returns Histogram",
# subtitle = "2013 - 2018",
# y = "fund freq",
# x = "",
# caption = "source: fama french website") +
# theme(plot.title = element_text(hjust = 0.5, colour = "cornflowerblue"),
# plot.subtitle = element_text(hjust = 0.5, color = "cornflowerblue"),
# plot.caption = element_text(hjust = 0),
# strip.text.x = element_text(size = 8, colour = "white"),
# strip.background = element_rect(colour = "white", fill = "cornflowerblue"),
# axis.text.x = element_text(colour = "cornflowerblue"),
# axis.text = element_text(colour = "cornflowerblue"),
# axis.ticks.x = element_line(colour = "cornflowerblue"),
# axis.text.y = element_text(colour = "cornflowerblue"),
# axis.ticks.y = element_line(colour = "cornflowerblue"),
# axis.title = element_text(colour = "cornflowerblue"),
# legend.title = element_text(colour = "cornflowerblue"),
# legend.text = element_text(colour = "cornflowerblue"))
How about data summarised? Here tidy data really comes in handy:
data_joined_tidy %>%
# calculate the mean return of each fund
summarise(mean_return = mean(return)) %>%
ggplot(aes(x = symbol,
y = mean_return,
fill = symbol,
color = symbol,
# add labels based on our original symbols vector
label = symbols)) +
# visualize as a column
# geom_col(width = .4) +
# visalize with a point
geom_point(size = 3) +
# Add text to the points
geom_text(nudge_y = 0.00003,
family = "Times New Roman")
Wrap our ggplot objects in ggplotly()
ggplotly(
data_joined_wide %>%
ggplot(aes(x = date, y = EEM)) +
# add a geom, see what happens if we don't
geom_line(color = "cornflowerblue")
)
ggplotly(
data_joined_tidy %>%
# If want just one of our funds, use filter()
# filter(symbol == "EEM") %>%
ggplot(aes(x = SMB,
y = return,
color = symbol)) +
geom_point() +
facet_wrap(~symbol)
)
ggplotly(
data_joined_tidy %>%
ggplot(aes(x = return, color = symbol, fill = symbol)) +
geom_histogram(bins = 60) +
facet_wrap(~symbol) +
labs(title = "Fund Returns Histogram",
#subtitle = "2013 - 2018",
y = "returns freq",
x = "",
caption = "source: fama french website") +
theme(plot.title = element_text(hjust = 0.5, colour = "purple"),
plot.subtitle = element_text(hjust = 0.5, color = "green"),
plot.caption = element_text(hjust=0),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_text(colour = "blue"),
axis.text = element_text(colour = "green"))
)
Similar logic to ggplot - a good reason to learn it is other packages use it’s logic.
data_joined_tidy %>%
filter(symbol == "EEM") %>%
hchart(., hcaes(x = SMB,
y = return),
type = "scatter",
color = "cornflowerblue")
data_joined_tidy %>%
filter(symbol == "IJS") %>%
hchart(., hcaes(x = SMB,
y = return,
# add the date
date = date),
type = "scatter",
color = "cornflowerblue",
name = "our name") %>%
hc_title(text = "IJS v. SMB") %>%
# display it in the tooltip
# could be other data besides date!
hc_tooltip(pointFormat = '{point.date} <br>
SMB: {point.x: .4f}% <br>
IJS: {point.y: .4f}%') %>%
hc_yAxis(title = list(text="IJS returns"),
labels = list(format = "{value}%")) %>%
hc_xAxis(labels = list(format = "{value}%"))
data_joined_tidy %>%
hchart(., hcaes(x = SMB,
y = return,
group = symbol,
date = date),
type = "scatter") %>%
hc_title(text = "Funds v. SMB") %>%
# display it in the tooltip
# could be other data besides date!
hc_tooltip(pointFormat = '{point.date} <br>
factor: {point.x: .4f}% <br>
fund: {point.y: .4f}%') %>%
hc_yAxis(title = list(text="Fund returns"),
labels = list(format = "{value}%")) %>%
hc_xAxis(labels = list(format = "{value}%"))
data_joined_tidy %>%
summarise(mean_return = mean(return)) %>%
hchart(., hcaes(x = symbol,
y = mean_return,
group = symbol),
type = "column") %>%
hc_xAxis(categories = .$symbol) %>%
hc_tooltip(pointFormat = '{point.group} <br>
{point.y: .4f}%') %>%
hc_yAxis(title = list(text="mean returns"),
labels = list(format = "{value}%"))
One model, one fund, one factor. We will use this for our Shiny example but better to preprocess.
data_joined_tidy %>%
filter(symbol == "SPY") %>%
do(model = lm(return ~ MKT, data = .)) %>%
glance(model)
# augment(model)
# tidy(model)
data_joined_tidy %>%
filter(symbol == "SPY") %>%
do(model = lm(return ~ MKT, data = .)) %>%
augment(model) %>%
ggplot(aes(y = .resid, x = .fitted)) +
geom_point(color = "purple")
data_joined_tidy %>%
filter(symbol == "SPY") %>%
do(model = lm(return ~ MKT, data = .)) %>%
augment(model) %>%
hchart(., hcaes(y = .resid, x = .fitted), color = "pink", type = "scatter")
Explore many models on all our funds. What if we had 50? This would be important if we wished to pre run these models, every day and store the results. Then have Shiny pull those results.
model_one <- function(df) {
lm(return ~ MKT, data = df)
}
model_two <- function(df) {
lm(return ~ MKT + SMB + HML, data = df)
}
model_three <- function(df) {
lm(return ~ MKT + SMB + HML + RMW + CMA, data = df)
}
data_joined_tidy %>%
group_by(symbol) %>%
nest()
data_joined_tidy %>%
group_by(symbol) %>%
nest() %>%
# tidy, glance and augment help clean up
# model results. broom package
mutate(one_factor_model = map(data, model_one),
tidied_one = map(one_factor_model, tidy),
glanced_one = map(one_factor_model, glance),
augmented_one = map(one_factor_model, augment)) #%>%
# unnest(tidied_one)
# unnest(glanced_one)
# unnest(augmented_one)
Visualize model results
data_joined_tidy %>%
group_by(symbol) %>%
nest() %>%
# tidy, glance and augment help clean up
# model results. broom package
mutate(three_factor_model = map(data, model_three),
tidied_one = map(three_factor_model, tidy, conf.int = T, conf.level = .95),
glanced_one = map(three_factor_model, glance),
augmented_one = map(three_factor_model, augment)) %>%
unnest(tidied_one) %>%
mutate_if(is.numeric, funs(round(., 3))) %>%
filter(term != "(Intercept)" &
symbol == "SPY") %>%
ggplot(aes(x = term, y = estimate, color = term, shape = term)) +
geom_point() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
labs(title = "Model results",
subtitle = "",
x = "",
y = "coefficient",
caption = "data source: Fama French website") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0))
ggplotly(
data_joined_tidy %>%
group_by(symbol) %>%
nest() %>%
# tidy, glance and augment help clean up
# model results. broom package
mutate(three_factor_model = map(data, model_three),
tidied_one = map(three_factor_model, tidy, conf.int = T, conf.level = .95),
glanced_one = map(three_factor_model, glance),
augmented_one = map(three_factor_model, augment)) %>%
unnest(tidied_one) %>%
mutate_if(is.numeric, funs(round(., 3))) %>%
filter(term != "(Intercept)" &
symbol == "SPY") %>%
ggplot(aes(x = term, y = estimate, color = term, shape = term)) +
geom_point() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) +
labs(title = "Model results",
subtitle = "",
x = "",
y = "coefficient",
caption = "data source: Fama French website") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0))
)
data_joined_tidy %>%
group_by(symbol) %>%
nest() %>%
# tidy, glance and augment help clean up
# model results. broom package
mutate(one_factor_model = map(data, model_one),
tidied_one = map(one_factor_model, tidy),
glanced_one = map(one_factor_model, glance),
augmented_one = map(one_factor_model, augment)) %>%
unnest(augmented_one) %>%
ggplot(aes(x = return, y = .fitted, color = symbol)) +
geom_point() +
facet_wrap(~symbol)
ggplotly(
data_joined_tidy %>%
group_by(symbol) %>%
nest() %>%
# tidy, glance and augment help clean up
# model results. broom package
mutate(one_factor_model = map(data, model_one),
tidied_one = map(one_factor_model, tidy),
glanced_one = map(one_factor_model, glance),
augmented_one = map(one_factor_model, augment)) %>%
unnest(augmented_one) %>%
ggplot(aes(x = return, y = .fitted, color = symbol)) +
geom_point() +
facet_wrap(~symbol)
)
We can use the fitted values in highcharter too.
augmented_results_spy <-
data_joined_tidy %>%
filter(symbol == "SPY") %>%
nest(-symbol) %>%
mutate(one_factor_model = map(data, model_one),
augmented_one = map(one_factor_model, augment)) %>%
unnest(augmented_one)
hchart(augmented_results_spy,
hcaes(x = MKT,
y = return),
type = "scatter") %>%
hc_add_series(augmented_results_spy,
type = "line",
hcaes(x = MKT, y = .fitted),
name = "Regression Line")