# Core libraries
library(tidyverse)
library(timetk)
library(tidyquant)
# Explorations
library(DataExplorer)
# Visualization
library(plotly)
library(highcharter)
library(gt)
library(ggthemes)

complete_stock_tbl <- read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/all_stocks_2006-01-01_to_2018-01-01.csv")
complete_stock_tbl %>% head(10)

## # A tibble: 10 × 7
## Date Open High Low Close Volume Name
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2006-01-03 77.8 79.4 77.2 79.1 3117200 MMM
## 2 2006-01-04 79.5 79.5 78.2 78.7 2558000 MMM
## 3 2006-01-05 78.4 78.6 77.6 78.0 2529500 MMM
## 4 2006-01-06 78.6 78.9 77.6 78.6 2479500 MMM
## 5 2006-01-09 78.5 79.8 78.5 79.0 1845600 MMM
## 6 2006-01-10 79 79.0 78.1 78.5 1919900 MMM
## 7 2006-01-11 78.4 78.7 77.8 78.4 1911900 MMM
## 8 2006-01-12 78.2 78.2 77.2 77.7 2121100 MMM
## 9 2006-01-13 77.0 78.2 77.0 77.5 1925300 MMM
## 10 2006-01-17 77.1 77.6 77 77.1 2073400 MMM
complete_stock_tbl %>% glimpse()

## Rows: 93,612
## Columns: 7
## $ Date <date> 2006-01-03, 2006-01-04, 2006-01-05, 2006-01-06, 2006-01-09, 20…
## $ Open <dbl> 77.76, 79.49, 78.41, 78.64, 78.50, 79.00, 78.44, 78.20, 76.95, …
## $ High <dbl> 79.35, 79.49, 78.65, 78.90, 79.83, 79.01, 78.66, 78.23, 78.20, …
## $ Low <dbl> 77.24, 78.25, 77.56, 77.64, 78.46, 78.08, 77.84, 77.20, 76.95, …
## $ Close <dbl> 79.11, 78.71, 77.99, 78.63, 79.02, 78.53, 78.37, 77.70, 77.50, …
## $ Volume <dbl> 3117200, 2558000, 2529500, 2479500, 1845600, 1919900, 1911900, …
## $ Name <chr> "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", …
final_stock_tbl <- complete_stock_tbl %>%
set_names(names(.) %>% str_to_lower()) %>%
mutate(
sector = case_when(
name == "MMM" ~ "Business/Consumer Services",
name == "AXP" ~ "Financial Services",
name == "AAPL" ~ "Technology",
name == "BA" ~ "Industrial Goods",
name == "CAT" ~ "Industrial Goods",
name == "CVX" ~ "Companies on the Energy Service",
name == "CSCO" ~ "Technology",
name == "KO" ~ "Consumer Goods",
name == "DIS" ~ "Media/Entertainment",
name == "XOM" ~ "Companies on the Energy Service",
name == "GE" ~ "Business/Consumer Services",
name == "GS" ~ "Financial Services",
name == "HD" ~ "Retail/Wholesale",
name == "IBM" ~ "Business/Consumer Services",
name == "INTC" ~ "Technology",
name == "JNJ" ~ "Health Care/Life Sciences",
name == "JPM" ~ "Financial Services",
name == "MCD" ~ "Leisure/Arts/Hospitality",
name == "MRK" ~ "Health Care/Life Sciences",
name == "MSFT" ~ "Technology",
name == "NKE" ~ "Consumer Goods",
name == "PFE" ~ "Health Care/Life Sciences",
name == "PG" ~ "Consumer Goods",
name == "TRV" ~ "Financial Services",
name == "UTX" ~ "Technology",
name == "UNH" ~ "Financial Services",
name == "VZ" ~ "Telecommunication Services",
name == "WMT" ~ "Retail/Wholesale",
name == "GOOGL" ~ "Technology",
name == "AMZN" ~ "Retail/Wholesale",
TRUE ~ "Technology"
)
)

stocks_by_sector <- final_stock_tbl %>%
distinct(sector, name) %>%
group_by(sector) %>%
summarise(
count = n()
) %>%
ungroup() %>%
arrange(desc(count))
# Treemap with Tidyquant colors
treemap_by_sector <- stocks_by_sector %>%
hctreemap2(group_vars = "sector",
size_var = "count",
layoutAlgorithm = "squarified") %>%
hc_plotOptions(treemap = list(colorByPoint = TRUE)) %>%
hc_colors(c("#2C3E50", "#E31A1C", "#18BC9C", "#CCBE93",
"#A6CEE3", "#1F78B4", "#B2DF8A", "#FB9A99",
"#FDBF6F", "#FF7F00", "#6A3D9A" )) %>%
hc_colorAxis(dataClasses = color_classes(stocks_by_sector$sector)) %>%
hc_legend(enabled = FALSE) %>%
hc_title(text = "Number of Companies by Sector") %>%
hc_subtitle(text = "Dow Jones Index") %>%
hc_tooltip(pointFormat = "<b>{point.name}</b>:<br>
Count: {point.value:,.0f}")
# Return the plot
treemap_by_sector

The following companies will be analyzed: Apple, Caterpillar, Google, Exxon, Goldman Sachs, and Walmart.
As shown in the plots below, some companies are less volatile than others, and some have a more pronounced upward trend than others. LOESS stands for Locally Weighted Scatterplot Smoothing; think of it as a regression within a specific “window” (range of time) in which observations closest to a specific “focal point” are given more weight than farther ones, and a weighted least squares fit produces the best-fitting line within each window.

Nevertheless, I would say these are the most important points for understanding how LOESS works:

- Number of observations within a window: the more observations a window contains, the smoother the trend line will be.
- Focal point: the observation for which we are performing LOESS. The closer other observations are to this focal point, the higher the weights assigned to them, so values near the focal point have a larger impact on the trend line.
- Formula for assigning weights (tricube weight function), where k is the number of observations within each window, \(x_i\) is the focal point, \(x_k\) is a neighboring point, and \(d_i\) is the distance from the focal point to the farthest (k-th) neighboring point:

\(\large w(x_k) = \left(1 - \left|\frac{x_i - x_k}{d_i}\right|^3\right)^3\)

After the weights are assigned, a weighted linear regression is fitted to find the line that reduces the error the most. This is done for every point in the “actual” time series.
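To make the weighting concrete, here is a minimal sketch (toy values only, not part of the stock analysis) of the tricube function above together with R's built-in `loess()`, whose `span` argument controls how many observations fall into each window:

```r
# Tricube weight function from the formula above (illustration only)
tricube <- function(x_focal, x_neighbor, d_max) {
  (1 - pmin(abs((x_focal - x_neighbor) / d_max), 1)^3)^3
}

# Neighbors closer to the focal point (10) get larger weights; points outside
# the window (distance greater than d_max) get weight 0
tricube(x_focal = 10, x_neighbor = c(9, 10, 12, 14), d_max = 3)

# The same idea drives stats::loess(); a larger span means more observations
# per window and therefore a smoother trend line (toy data)
toy <- tibble(x = 1:100, y = cumsum(rnorm(100)))
loess(y ~ x, data = toy, span = 0.3)
```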
#Function to get average price by week, month and quarter
#If we use the mean(close) daily we get the actual values
#Since values are unique
convert_date_ts <- function(data, unit = "day") {
new_data <- data %>%
mutate(date = floor_date(date, unit = unit)) %>%
group_by(date, name) %>%
summarise(
close = mean(close)
) %>%
ungroup()
return(new_data)
}

# Pick by stock (6 stocks from different industries)
filtered_final_stock_tbl <- final_stock_tbl %>%
filter(name %in% c("AAPL", "GS", "GOOGL", "XOM", "CAT", "WMT")) %>%
mutate(
date = ymd(date)
)
# Daily
filtered_final_stock_tbl %>%
convert_date_ts() %>%
group_by(name) %>%
plot_time_series(
.date_var = date,
.value = close,
.facet_ncol = 2,
.smooth_color = "#18BC9C",
.smooth_size = 0.5
)

filtered_final_stock_tbl %>%
plot_missing(
ggtheme = theme_calc(),
title = "Percent Missing Values by Column"
)

Note: as expected, 2007 and 2008 show the highest volatility, due to the financial crisis that occurred during those years, caused mainly by uncertainty in the markets.
# Daily stock return by company
# Using Tidyquant package
daily_stock_return <- filtered_final_stock_tbl %>%
mutate(
name = name %>% as_factor()
) %>%
group_by(name) %>%
tq_transmute(
select = close,
mutate_fun = periodReturn,
period = "daily",
col_rename = "stock_return"
)
# Visualization with ggplot
daily_stock_return %>%
ggplot(aes(x = date, y = stock_return)) +
geom_line(color = palette_light()[[1]]) +
facet_wrap(~name) +
theme_calc() +
labs(
title = "Stock Returns",
subtitle = "by Company",
y= "returns"
) +
theme(plot.title = element_text(hjust = 0.5))

return_volatility_tbl <- function(data, unit = "daily", ...) {
vars_col <- quos(...)
tbl_return_volatility <- data %>%
mutate(
name = name %>% as_factor()
) %>%
group_by(name) %>%
tq_transmute(
select = close,
mutate_fun = periodReturn,
period = unit,
col_rename = "stock_return"
) %>%
mutate(
company = case_when(
name == "AAPL" ~ "Apple",
name == "GOOGL" ~ "Google",
name == "XOM" ~ "Exxon",
name == "GS" ~ "Goldman Sachs",
name == "WMT" ~ "Walmart",
TRUE ~ "Caterpillar"
)
) %>%
group_by(!!!vars_col) %>%
summarise(
`percent return` = mean(stock_return),
`standard deviation` = sd(stock_return)
) %>%
ungroup() %>%
arrange(desc(`percent return`)) %>% gt() %>%
data_color(
columns = vars(`standard deviation`),
colors = scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"),
domain = NULL
)) %>%
data_color(
columns = vars(`percent return`),
colors = scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
domain = NULL
)
) %>% fmt_percent(
columns = vars(`percent return`),
decimals = 4
) %>%
fmt_number(
columns = vars(`standard deviation`),
decimals = 4
) %>%
tab_header(
title = md("**2006 - 2017 Measure of Return and Volatility**"),
subtitle = "An overview of Risk and Return"
)
return(tbl_return_volatility)
}

| 2006 - 2017 Measure of Return and Volatility | |||
| An overview of Risk and Return | |||
| company | name | percent return | standard deviation |
|---|---|---|---|
| Apple | AAPL | 0.1124% | 0.0204 |
| Google | GOOGL | 0.0691% | 0.0184 |
| Caterpillar | CAT | 0.0535% | 0.0201 |
| Goldman Sachs | GS | 0.0509% | 0.0239 |
| Walmart | WMT | 0.0326% | 0.0122 |
| Exxon | XOM | 0.0233% | 0.0151 |
| 2006 - 2017 Measure of Return and Volatility | |||
| An overview of Risk and Return | |||
| company | name | percent return | standard deviation |
|---|---|---|---|
| Apple | AAPL | 0.5444% | 0.0451 |
| Google | GOOGL | 0.3329% | 0.0404 |
| Caterpillar | CAT | 0.2638% | 0.0455 |
| Goldman Sachs | GS | 0.2426% | 0.0525 |
| Walmart | WMT | 0.1545% | 0.0257 |
| Exxon | XOM | 0.0960% | 0.0277 |
| 2006 - 2017 Measure of Return and Volatility | |||
| An overview of Risk and Return | |||
| company | name | percent return | standard deviation |
|---|---|---|---|
| Apple | AAPL | 2.3579% | 0.0907 |
| Google | GOOGL | 1.4307% | 0.0824 |
| Caterpillar | CAT | 1.1438% | 0.0935 |
| Goldman Sachs | GS | 0.8721% | 0.0892 |
| Walmart | WMT | 0.6391% | 0.0473 |
| Exxon | XOM | 0.3549% | 0.0462 |
| 2006 - 2017 Measure of Return and Volatility | |||
| An overview of Risk and Return | |||
| company | name | percent return | standard deviation |
|---|---|---|---|
| Apple | AAPL | 36.8065% | 0.5619 |
| Google | GOOGL | 20.8735% | 0.4000 |
| Goldman Sachs | GS | 14.7381% | 0.4343 |
| Caterpillar | CAT | 13.0254% | 0.3249 |
| Walmart | WMT | 7.7882% | 0.1674 |
| Exxon | XOM | 4.1885% | 0.1624 |
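The four tables above correspond to daily, weekly, monthly, and yearly returns (the magnitudes scale accordingly). A sketch of how the helper defined above might be invoked to produce them, with the grouping columns `company` and `name` inferred from the table headers:

```r
# Assumed invocations, one per period (illustration of the helper's interface)
return_volatility_tbl(filtered_final_stock_tbl, unit = "daily",   company, name)
return_volatility_tbl(filtered_final_stock_tbl, unit = "weekly",  company, name)
return_volatility_tbl(filtered_final_stock_tbl, unit = "monthly", company, name)
return_volatility_tbl(filtered_final_stock_tbl, unit = "yearly",  company, name)
```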
To better understand seasonality patterns, it is critical to see how the target variable behaves at different periods of time. In this case we will analyze how the closing price of a company behaves across days of the week, months, quarters, and years. This can help us understand whether other external factors influence the price of the stock.

Summary:

- Different periods in time: we will analyze the distribution across weekdays, weeks, months, quarters, and years.
- Exxon (XOM): for this case we will look at how the Exxon stock behaves, examining the distribution of the closing price at different points in time.
- Weekdays: during the week we don't see an obvious difference in the distribution of the closing price.
- Months: June and December tend to have a slightly higher closing price.
- Quarters: the second and fourth quarters have a slightly higher median closing price.
- Year: 2010 had the worst distribution of the closing price for the company (probably an external factor impacted the sector), while 2014 was the year in which the distribution of the closing price was the highest.
# Let's pick Exxon for seasonality
filtered_final_stock_tbl %>%
filter(name == "XOM") %>%
plot_seasonal_diagnostics(
.date_var = date,
.value = close,
.interactive = FALSE
)

# Function to create seasonal decomposition with LOESS for the stocks
plot_seasonal_decomposition <- function(data, interactive=FALSE){
stl_plot <- data %>%
group_by(name) %>%
plot_stl_diagnostics(
.date_var = date,
.value = log1p(close),
.interactive = interactive
)
return(stl_plot)
}
# Visualization Not interactive
filtered_final_stock_tbl %>%
filter(name %in% c("AAPL", "XOM", "GS")) %>%
group_by(name) %>%
plot_seasonal_decomposition()

Anomaly detection is mainly used to find events that were not expected in our time series. It can also be that a mistake in the data caused an outlier. Detecting outliers gives us the possibility of investigating further whether unexpected events actually drove the time series to behave in an anomalous manner.
filtered_final_stock_tbl %>%
group_by(name) %>%
plot_anomaly_diagnostics(
.date_var = date,
.value = close,
.facet_ncol =2,
.interactive = FALSE,
.title = "Anomaly Diagnostics Dow Jones",
.anom_color = "#FB3029",
.max_anomalies = 0.07,
.alpha = 0.05
)

As we have seen previously, Exxon belongs to the energy industry, more specifically the oil industry. The question we will answer is: what events occurred that caused a large drop in the stock price? Below we attach a time series of West Texas Intermediate (WTI) to see whether it behaves similarly to the XOM stock. For those who don't know, WTI is a benchmark for oil pricing in the markets.
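The WTI series is not part of the stock data set; one way to pull it is sketched below, assuming the FRED series `DCOILWTICO` fetched with `tq_get()` (requires an internet connection), plotted next to the XOM closing price:

```r
# Sketch: WTI spot price from FRED (assumed symbol DCOILWTICO) next to the XOM close
wti_tbl <- tq_get("DCOILWTICO", get = "economic.data",
                  from = "2006-01-01", to = "2018-01-01")

bind_rows(
  wti_tbl %>% transmute(date, series = "WTI (USD per barrel)", value = price),
  filtered_final_stock_tbl %>%
    filter(name == "XOM") %>%
    transmute(date, series = "XOM close", value = close)
) %>%
  ggplot(aes(x = date, y = value)) +
  geom_line(color = palette_light()[[1]]) +
  facet_wrap(~ series, ncol = 1, scales = "free_y") +
  theme_calc() +
  labs(title = "WTI Crude Oil vs. XOM Closing Price", x = NULL, y = NULL)
```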
filtered_final_stock_tbl %>%
group_by(name) %>%
summarise_by_time(
.date_var = date,
.value = mean(close, na.rm = TRUE),
.by = "month"
) %>%
tk_acf_diagnostics(
.date_var = date,
.value = log1p(.value)
) %>%
ggplot(aes(x=lag, y=ACF, color=name, group=name)) +
# Add horizontal line a y=0
geom_hline(yintercept = 0) +
# Plot autocorrelation
geom_point(size=2) +
geom_segment(aes(xend=lag, yend =0), size =1) +
# Add cutoffs
geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) +
geom_line(aes(y =.white_noise_lower), color = "black", linetype =2 ) +
# Add facets
facet_wrap(~ name, ncol = 3) +
# Aesthetics
expand_limits(y = c(-1,1)) +
scale_color_stata() +
theme_calc() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
labs(
title = "AutoCorrelation (ACF)",
subtitle = "Dow Jones 30 Index"
)

filtered_final_stock_tbl %>%
group_by(name) %>%
summarise_by_time(
.date_var = date,
.value = mean(close, na.rm = TRUE),
.by="month"
) %>%
tk_acf_diagnostics(
.date_var = date,
.value = log1p(.value)
) %>%
ggplot(aes(x = lag, y = PACF, color = name, group = name)) +
# Add horizontal line a y=0
geom_hline(yintercept = 0) +
# Plot autocorrelations
geom_point(size = 2) +
geom_segment(aes(xend = lag, yend = 0), size = 1) +
# Add cutoffs
geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) +
geom_line(aes(y = .white_noise_lower), color = "black", linetype = 2) +
# Add facets
facet_wrap(~ name, ncol = 3) +
# Aesthetics
expand_limits(y = c(-1, 1)) +
scale_color_stata() +
theme_calc() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) + labs(
title = "Partial AutoCorrelation (PACF)",
subtitle = "Dow Jones 30 Index"
)

Cross-correlation measures how strongly two time series are related to each other at different lags. Below we compute the cross-correlation function (CCF) between the monthly average closing prices of AAPL and GOOGL.
tech_stocks_tbl <- filtered_final_stock_tbl %>%
filter(sector == "Technology") %>%
select(date, name, close) %>%
pivot_wider(names_from = name, values_from = close) %>%
summarise_by_time(
.date_var = date,
.by = "month",
across(AAPL:GOOGL, .fns=mean)
)
tech_stocks_tbl %>%
tk_acf_diagnostics(
date,
AAPL,
.ccf_vars = GOOGL
) %>%
select(
lag, CCF_GOOGL
) %>%
slice(1:10) %>%
gt() %>%
data_color(
columns = vars(CCF_GOOGL) ,
colors = scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"),
domain = NULL
))

| lag | CCF_GOOGL |
|---|---|
| 0 | 0.9137803 |
| 1 | 0.8816792 |
| 2 | 0.8501216 |
| 3 | 0.8231274 |
| 4 | 0.8001255 |
| 5 | 0.7770242 |
| 6 | 0.7513068 |
| 7 | 0.7243054 |
| 8 | 0.6970645 |
| 9 | 0.6747878 |
Summary:

- CCF on AAPL close price: the closing price of AAPL is the target variable and the closing price of GOOGL is the independent variable; the strong correlation at lag 0 decays only slowly as the lag increases.
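As a quick sanity check (a sketch, not part of the original analysis), the lag-0 value in the table should match the ordinary Pearson correlation between the two monthly series:

```r
# Lag-0 cross-correlation is just the ordinary Pearson correlation
cor(tech_stocks_tbl$AAPL, tech_stocks_tbl$GOOGL)
# roughly 0.91, in line with the lag-0 CCF shown above
```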
Moving averages help us smooth the time series and better understand where the trend is heading. Believe it or not, moving averages are used a lot for forecasting, although they are not the most optimal technique for it, especially since they do not handle the impact of outliers well. Moving averages also help us detect seasonality in our time series.

Most common moving averages in the stock market:

- 15-day MA
- 30-day MA
- 100-day MA
- 200-day MA

Note: notice how the longer the moving-average window, the smoother the line. Usually, we use moving averages to detect trends in our time series data.
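The code below computes the 15- and 30-day averages; for completeness, here is a sketch (not part of the original code) of the 200-day moving average from the list above, using the same `slidify_vec()` helper on the GS closing price:

```r
# Sketch: trailing 200-day moving average of the GS close with timetk::slidify_vec()
filtered_final_stock_tbl %>%
  filter(name == "GS") %>%
  select(date, close) %>%
  mutate(
    ma_200_close = slidify_vec(
      .x = close,
      .period = 200,
      .f = mean, na.rm = TRUE,
      .align = "right",     # trailing window, as commonly used for trading signals
      .partial = FALSE      # NA until a full 200-day window is available
    )
  ) %>%
  tail(5)
```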
gs_ma_sample <- filtered_final_stock_tbl %>%
select(date, close, name) %>%
filter(name == "GS") %>%
filter_by_time(.date_var = date, .start_date = "2015", .end_date = "2017") %>%
mutate(
adjusted_15_mm_close = slidify_vec(
.x = close,
.period = 15,
.f = mean,
na.rm = TRUE,
.align = "center",
.partial = TRUE
),
adjusted_30_mm_close = slidify_vec(
.x = close,
.period = 30,
.f = mean,
na.rm = TRUE,
.align = "center",
.partial = TRUE
)
) %>%
pivot_longer(
contains("close"), names_repair = "unique") %>%
rename(name = name...2,
metric = name...3
)
gs_ma_sample %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color = metric), size = 1) +
theme_calc() +
theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) +
guides(color = guide_legend(nrow = 2, byrow = TRUE)) +
labs(
title = "Moving Average (MA) as a Metric",
subtitle = str_glue("GS Stock 2015-2017"),
y = "Closing Price") +
scale_color_stata() +
scale_x_date(expand = c(0,0))

Note: in this case I used a rolling median because I want to avoid the effect of outliers; moving averages are more prone to being affected by them. Nevertheless, you can use a moving average by changing the function from median to mean.
mn_final_tbl <- filtered_final_stock_tbl %>%
select(date, close, name) %>%
mutate(
adjusted_15_mm_close = slidify_vec(
.x = close,
.period = 15,
.f = median,
na.rm = TRUE,
.align = "center",
.partial = TRUE),
adjusted_30_mm_close = slidify_vec(
.x = close,
.period = 30,
.f = median,
na.rm = TRUE,
.align = "center",
.partial = TRUE
),
adjusted_100_mm_close = slidify_vec(
.x = close,
.period = 100,
.f = median,
na.rm = TRUE,
.align = "center",
.partial = TRUE),
adjusted_300_mm_close = slidify_vec(
.x = close,
.period = 300,
.f = median,
na.rm = TRUE,
.align = "center",
.partial = TRUE
)
) %>%
pivot_longer(contains("close"), names_repair = "unique") %>%
rename(
name = name...2,
metric = name...3
)
mn_final_tbl

## # A tibble: 90,590 × 4
## date name metric value
## <date> <chr> <chr> <dbl>
## 1 2006-01-03 AAPL close 10.7
## 2 2006-01-03 AAPL adjusted_15_mm_close 10.9
## 3 2006-01-03 AAPL adjusted_30_mm_close 11
## 4 2006-01-03 AAPL adjusted_100_mm_close 10.2
## 5 2006-01-03 AAPL adjusted_300_mm_close 9.38
## 6 2006-01-04 AAPL close 10.7
## 7 2006-01-04 AAPL adjusted_15_mm_close 10.9
## 8 2006-01-04 AAPL adjusted_30_mm_close 10.9
## 9 2006-01-04 AAPL adjusted_100_mm_close 10.2
## 10 2006-01-04 AAPL adjusted_300_mm_close 9.38
## # ℹ 90,580 more rows
mn_final_tbl %>%
ggplot(aes(x=date, y =value, color = metric)) +
geom_line(size=1, alpha = 0.8) +
facet_wrap(~name, scales = "free_y") +
theme_calc() +
scale_color_stata() +
labs(
title = "Moving Medians",
subtitle = "Dow Jones 30",
y = "price"
) +
theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5),
legend.text = element_text(size = 10)) +
guides(color = guide_legend(nrow = 2, byrow = TRUE))

The goal of this section is to showcase the scenarios in which a moving average can be useful as a simple forecasting device and those in which it is not.

gs_filtered_data <- filtered_final_stock_tbl %>%
filter(name == "GS") %>%
filter_by_time(
.start_date = "2009",
.end_date = "2012"
) %>%
select(date, name, close)
ma_60day_forecasting <- gs_filtered_data %>%
mutate(
mavg_60_days = slidify_vec(close, .f = ~ mean(.x, na.rm = TRUE), .period = 60, .align = "right")) %>%
bind_rows(
future_frame(., .length_out = 60)
) %>%
# fill function: replace NA with our moving averages
fill(
mavg_60_days,
.direction = "down"
)
ma_60day_forecasting

## # A tibble: 1,066 × 4
## date name close mavg_60_days
## <date> <chr> <dbl> <dbl>
## 1 2009-01-02 GS 86.8 NA
## 2 2009-01-05 GS 88.8 NA
## 3 2009-01-06 GS 88.7 NA
## 4 2009-01-07 GS 84.5 NA
## 5 2009-01-08 GS 85.4 NA
## 6 2009-01-09 GS 83.9 NA
## 7 2009-01-12 GS 77.7 NA
## 8 2009-01-13 GS 77.9 NA
## 9 2009-01-14 GS 75.7 NA
## 10 2009-01-15 GS 73.8 NA
## # ℹ 1,056 more rows
ma_60day_forecasting %>% pivot_longer(close:mavg_60_days,names_repair="unique") %>%
rename(
"type" = "name...3"
) %>%
ggplot(aes(x=date, y=value)) + geom_line(aes(color=type), size=1) +
theme_calc() + scale_color_stata() +
labs(
title = "60 day Moving Average",
subtitle = "MA as a Forecasting Technique"
) + theme(plot.title = element_text(hjust = 0.5))

In this scenario we can compare the 60-day moving-average forecast against the actual closing prices that followed and see how large the residuals are.
actual_60days <- filtered_final_stock_tbl %>%
filter(name == "GS") %>%
filter_by_time(
.start_date = "2013",
.end_date = "2013-03-01"
) %>%
select(date, name, close)
plot_series <- ma_60day_forecasting %>%
filter_by_time(
.start_date = "2013-01-02", .end_date = "2013-03-01"
) %>%
left_join(actual_60days, by="date") %>%
select(date, mavg_60_days, close.y) %>%
rename(
"actual_close_price" = "close.y"
) %>%
pivot_longer(mavg_60_days:actual_close_price) %>%
# Remove weekends from moving average
drop_na() %>%
ggplot(aes(x=date, value)) + geom_line(aes(color=name), size=1) + geom_point(aes(color=name), size=2) + theme_calc() + scale_color_stata() +
labs(
title = "Residuals from 60 day Moving average",
subtitle = "GS stock Moving Average"
) + theme(legend.position = "bottom") + guides(color = guide_legend(nrow=1, byrow=TRUE))
plot_series
## Rolling Correlation
Another way of finding a relationship between two time series is through the use of rolling correlations. What we do here is compute a correlation at different points in time between the two series and determine whether the relationship between them is significant.
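As an illustration (a sketch assuming the `tech_stocks_tbl` built earlier and an arbitrarily chosen six-month window), a rolling correlation between the monthly AAPL and GOOGL closing prices can be computed with `timetk::slidify()`, which turns `cor()` into a rolling function:

```r
# Sketch: 6-month rolling correlation between AAPL and GOOGL monthly average closes
rolling_cor_6m <- slidify(
  .f       = ~ cor(.x, .y, use = "pairwise.complete.obs"),
  .period  = 6,
  .align   = "right",
  .partial = FALSE
)

tech_stocks_tbl %>%
  mutate(rolling_cor = rolling_cor_6m(AAPL, GOOGL)) %>%
  ggplot(aes(x = date, y = rolling_cor)) +
  geom_line(color = palette_light()[[1]], size = 1) +
  theme_calc() +
  labs(
    title    = "6-Month Rolling Correlation",
    subtitle = "AAPL vs. GOOGL, monthly average close",
    y        = "correlation"
  )
```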
The main reason we use log transformations in time series analysis is to reduce the impact of extreme outliers, thereby stabilizing the variance across the time series. Log transformations are especially useful in models like linear regression, where an outlier can have a significant impact on the squared errors. Although in this case we don't have “significant” outliers, log transformations can still help mitigate their impact.

Log transformation formula:

\(\large y = \log_b(x) \iff b^y = x\)
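A quick numerical illustration (not in the original) of the pairing used in the next chunks: `log1p()` compresses large values, and `expm1()` is its exact inverse, so predictions made on the log scale can be mapped back to prices:

```r
# log1p(x) = log(1 + x); expm1(y) = exp(y) - 1 is its exact inverse
price     <- c(79.11, 178.50, 245.30)   # example closing prices
log_price <- log1p(price)               # modeling scale: compresses large values
expm1(log_price)                        # back-transform recovers the prices
```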
# We will pick GS as it is one of the ones with the highest volatility
# GS linear regression model using log transformations
log_gs_lm_plot <- filtered_final_stock_tbl %>%
filter(name == "GS") %>%
filter_by_time(
.start_date = "2016-01-01",
.end_date = "2017-01-01"
) %>%
plot_time_series_regression(
.date_var = date,
.formula = log1p(close) ~ as.numeric(date) +
wday(date, label = TRUE) +
lubridate::month(date, label = TRUE),
.interactive=FALSE,
.show_summary = TRUE,
.title = "Log transformations in Linear Regression Models"
)

##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.114757 -0.018312 0.001485 0.020826 0.105742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.3685671 4.3585702 -2.149 0.03262
## as.numeric(date) 0.0008535 0.0002566 3.326 0.00102
## wday(date, label = TRUE).L -0.0018868 0.0050972 -0.370 0.71159
## wday(date, label = TRUE).Q 0.0004393 0.0050410 0.087 0.93063
## wday(date, label = TRUE).C -0.0015788 0.0049871 -0.317 0.75184
## wday(date, label = TRUE)^4 0.0031286 0.0049447 0.633 0.52753
## lubridate::month(date, label = TRUE).L 0.0212167 0.0939807 0.226 0.82159
## lubridate::month(date, label = TRUE).Q 0.2516840 0.0078410 32.098 < 2e-16
## lubridate::month(date, label = TRUE).C 0.1107287 0.0078093 14.179 < 2e-16
## lubridate::month(date, label = TRUE)^4 0.0581218 0.0078034 7.448 1.79e-12
## lubridate::month(date, label = TRUE)^5 -0.0174280 0.0077749 -2.242 0.02592
## lubridate::month(date, label = TRUE)^6 0.0410392 0.0078056 5.258 3.28e-07
## lubridate::month(date, label = TRUE)^7 -0.0437489 0.0077170 -5.669 4.19e-08
## lubridate::month(date, label = TRUE)^8 -0.0437221 0.0076994 -5.679 3.99e-08
## lubridate::month(date, label = TRUE)^9 0.0105762 0.0077319 1.368 0.17266
## lubridate::month(date, label = TRUE)^10 0.0131328 0.0076836 1.709 0.08874
## lubridate::month(date, label = TRUE)^11 -0.0166546 0.0077086 -2.161 0.03174
##
## (Intercept) *
## as.numeric(date) **
## wday(date, label = TRUE).L
## wday(date, label = TRUE).Q
## wday(date, label = TRUE).C
## wday(date, label = TRUE)^4
## lubridate::month(date, label = TRUE).L
## lubridate::month(date, label = TRUE).Q ***
## lubridate::month(date, label = TRUE).C ***
## lubridate::month(date, label = TRUE)^4 ***
## lubridate::month(date, label = TRUE)^5 *
## lubridate::month(date, label = TRUE)^6 ***
## lubridate::month(date, label = TRUE)^7 ***
## lubridate::month(date, label = TRUE)^8 ***
## lubridate::month(date, label = TRUE)^9
## lubridate::month(date, label = TRUE)^10 .
## lubridate::month(date, label = TRUE)^11 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03551 on 235 degrees of freedom
## Multiple R-squared: 0.933, Adjusted R-squared: 0.9285
## F-statistic: 204.6 on 16 and 235 DF, p-value: < 2.2e-16
Transforming back to the original values using the expm1() function:
log_transformed_back_plot <- filtered_final_stock_tbl %>%
filter(name == "GS") %>%
filter_by_time(
.start_date = "2016-01-01",
.end_date = "2017-01-01"
) %>%
plot_time_series_regression(
.date_var = date,
.formula = expm1(log1p(close)) ~ as.numeric(date) +
wday(date, label = TRUE) +
lubridate::month(date, label = TRUE),
.interactive = TRUE,
.show_summary = TRUE,
.title = "Transforming to actual close price using expm1 function"
)

##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.0393 -3.0182 0.3419 3.3421 18.1935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.804e+03 7.652e+02 -3.664 0.000306
## as.numeric(date) 1.751e-01 4.505e-02 3.885 0.000133
## wday(date, label = TRUE).L -3.287e-01 8.948e-01 -0.367 0.713718
## wday(date, label = TRUE).Q 3.947e-02 8.850e-01 0.045 0.964464
## wday(date, label = TRUE).C -2.123e-01 8.755e-01 -0.242 0.808638
## wday(date, label = TRUE)^4 5.540e-01 8.681e-01 0.638 0.523987
## lubridate::month(date, label = TRUE).L -2.077e+00 1.650e+01 -0.126 0.899952
## lubridate::month(date, label = TRUE).Q 4.869e+01 1.377e+00 35.375 < 2e-16
## lubridate::month(date, label = TRUE).C 2.412e+01 1.371e+00 17.596 < 2e-16
## lubridate::month(date, label = TRUE)^4 1.309e+01 1.370e+00 9.555 < 2e-16
## lubridate::month(date, label = TRUE)^5 -1.112e+00 1.365e+00 -0.814 0.416229
## lubridate::month(date, label = TRUE)^6 6.582e+00 1.370e+00 4.803 2.78e-06
## lubridate::month(date, label = TRUE)^7 -7.165e+00 1.355e+00 -5.289 2.82e-07
## lubridate::month(date, label = TRUE)^8 -7.312e+00 1.352e+00 -5.410 1.55e-07
## lubridate::month(date, label = TRUE)^9 1.411e+00 1.357e+00 1.040 0.299594
## lubridate::month(date, label = TRUE)^10 1.905e+00 1.349e+00 1.412 0.159213
## lubridate::month(date, label = TRUE)^11 -2.533e+00 1.353e+00 -1.872 0.062459
##
## (Intercept) ***
## as.numeric(date) ***
## wday(date, label = TRUE).L
## wday(date, label = TRUE).Q
## wday(date, label = TRUE).C
## wday(date, label = TRUE)^4
## lubridate::month(date, label = TRUE).L
## lubridate::month(date, label = TRUE).Q ***
## lubridate::month(date, label = TRUE).C ***
## lubridate::month(date, label = TRUE)^4 ***
## lubridate::month(date, label = TRUE)^5
## lubridate::month(date, label = TRUE)^6 ***
## lubridate::month(date, label = TRUE)^7 ***
## lubridate::month(date, label = TRUE)^8 ***
## lubridate::month(date, label = TRUE)^9
## lubridate::month(date, label = TRUE)^10
## lubridate::month(date, label = TRUE)^11 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.234 on 235 degrees of freedom
## Multiple R-squared: 0.9425, Adjusted R-squared: 0.9386
## F-statistic: 240.6 on 16 and 235 DF, p-value: < 2.2e-16
When we talk about implementing machine learning models, we usually talk about splitting the dataset into training and testing sets. With time series we cannot split at random: the split must respect the time order, so the most recent observations (here, the last three years) are held out as the testing set.
# Test is 25% of our data
split_df <- filtered_final_stock_tbl %>%
filter(name == "GS") %>%
time_series_split(
date_var = date,
assess = "3 year",
cumulative = TRUE
)
splitted_data <- split_df %>%
tk_time_series_cv_plan()
splitted_data %>%
ggplot() + geom_line(aes(x=date, y=close, color=.key), size=1) +
theme_calc() + scale_color_stata() +
geom_rect(data=data.frame(xmin=as.Date(c("2015-01-02")),
xmax=as.Date(c("2017-12-29")),
ymin=-Inf,
ymax=Inf),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill=palette_light()[2],alpha=0.3) +
geom_label(
label = "Testing Set",
vjust = 0.5,
aes(x=as.Date(c("2016-07-02")),
y = 100)) +
geom_rect(data=data.frame(xmin=as.Date(c("2006-01-03")),
xmax=as.Date(c("2014-12-31")),
ymin=-Inf,
ymax=Inf),
aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
fill=palette_light()[3],alpha=0.3) + scale_x_date(expand = c(0,0)) +
geom_label(
label = "Training Set",
vjust = 0.5,
aes(x=as.Date(c("2012-07-02")),
y = 220)) +
labs(
title = "Splitting Time Series into Training and Testing Sets",
subtitle = "Dow Jones 30"
) + theme(plot.title = element_text(hjust=0.5), legend.position = "bottom") +
guides(color = guide_legend(nrow=1, byrow=TRUE)) +
scale_y_continuous(labels = scales::dollar)
## Autoregressive “Integrated” Moving Average (ARIMA)
Most of us have heard of the Autoregressive Integrated Moving Average (ARIMA) model. Before understanding the concept of ARIMA, we should answer the following questions: