library(tidyverse)
library(tidyquant)
library(quantmod)
library(lubridate)
library(broom) # for tidying up linear models
library(factoextra)
FTSE 350 Industrial Engineering index. The index components are Bodycote, Hill&Smith, IMI, Rotork, Spirax-Sarco Engineering, and Weir Group (components).
index <- read_csv("FTSE 350 Industrial Engineering Historical Data.csv")
index %<>%
rename(date = Date) %>%
mutate(date = mdy(date),
adjusted = Price) %>%
select(-Vol., -`Change %`)
# gets data from Yahoo Finance
bodycote <- tq_get("BOY.L")
hill_smith <- tq_get("HILS.L")
imi <- tq_get("IMI.L")
rotork <- tq_get("ROR.L")
spirax_s <- tq_get("SPX.L")
weir <- tq_get("WEIR.L")
data <- bind_rows(index %>% mutate(type = "index", name = "FTSE 350 Industrial Engineering"),
bodycote %>% mutate(type = "stock", name = "Bodycote plc"),
hill_smith %>% mutate(type = "stock", name = "Hill & Smith Holdings PLC"),
imi %>% mutate(type = "stock", name = "IMI plc"),
rotork %>% mutate(type = "stock", name = "Rotork plc"),
spirax_s %>% mutate(type = "stock", name = "Spirax-Sarco Engineering plc"),
weir %>% mutate(type = "stock", name = "The Weir Group PLC"))
Compute the returns.
# compute returns (daily)
data_rets <- data %>%
select(date, name, type, adjusted) %>%
arrange(type, name, date) %>%
group_by(name, type) %>%
mutate(return = adjusted/lag(adjusted)-1,
return_ln = log(adjusted/lag(adjusted))) %>%
filter(!is.na(return_ln),
# cut-off date
date >= "2016-04-15") %>%
mutate(cumret_ln = cumsum(return_ln),
adjusted_ln = exp(cumret_ln))
data_rets %>%
ggplot(aes(x = date, y = adjusted_ln, color = name)) +
geom_line(aes(size = factor(desc(type)))) +
scale_size_discrete(range = c(0.4,1.4), guide = "none") +
theme_tq()
# compute returns (monthly)
data_rets_m <- data %>%
select(date, name, type, adjusted) %>%
arrange(type, name, date) %>%
group_by(name, type, date = floor_date(date, unit = "months") %>% date()) %>%
summarise(adjusted = first(adjusted)) %>%
mutate(return = adjusted/lag(adjusted)-1,
return_ln = log(adjusted/lag(adjusted))) %>%
filter(!is.na(return_ln),
# cut-off date
date >= "2016-04-01") %>%
mutate(cumret_ln = cumsum(return_ln),
adjusted_ln = exp(cumret_ln))
data_rets_m %>%
ggplot(aes(x = date, y = adjusted_ln, color = name)) +
geom_line(aes(size = factor(desc(type)))) +
scale_size_discrete(range = c(0.4,1.4), guide = "none") +
theme_tq()
Spread format for modelling.
data_rets %>%
ungroup() %>%
select(date, name, return_ln) %>%
filter(date >= "2016-04-15") %>%
spread(name, return_ln) -> returns_daily
data_rets_m %>%
ungroup() %>%
select(date, name, return_ln) %>%
filter(date >= "2016-04-01") %>%
spread(name, return_ln) -> returns_monthly
Linear model. The estimates can be seen as weights.
lm(`FTSE 350 Industrial Engineering` ~ . -1, data = returns_daily %>%
select(-date)) -> model
tidy(model)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 `Bodycote plc` 0.0925 0.00353 26.3 1.99e-108
## 2 `Hill & Smith Holdings PLC` 0.0525 0.00249 21.1 7.50e- 78
## 3 `IMI plc` 0.180 0.00416 43.1 6.99e-206
## 4 `Rotork plc` 0.150 0.00362 41.5 4.92e-197
## 5 `Spirax-Sarco Engineering plc` 0.254 0.00477 53.2 1.58e-257
## 6 `The Weir Group PLC` 0.246 0.00293 84.2 0.
Estimated/weights shoud approximately sum up to 1.
tidy(model) %>%
summarise(total = sum(estimate))
## # A tibble: 1 x 1
## total
## <dbl>
## 1 0.975
# note R-square of .99
glance(model)$r.squared
## [1] 0.9890439
Correlation plots, daily and monthly.
data_rets %>%
ungroup() %>%
select(date, name, return_ln) %>%
spread(name, return_ln) %>%
select(-date) %>%
chart.Correlation()
data_rets_m %>%
ungroup() %>%
select(date, name, return_ln) %>%
spread(name, return_ln) %>%
select(-date) %>%
chart.Correlation()
PCA on monthly log returns.
data_rets_m %>%
filter(type != "index") %>%
ungroup() %>%
select(date, name, return_ln) %>%
# fill to remove the misssing values
spread(name, return_ln, fill = 0) %>%
select(-date) %>%
as.matrix() %>%
prcomp(scale. = T, center = T) -> pca
pca %>% summary()
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.580 1.1160 0.9779 0.7829 0.65580 0.50892
## Proportion of Variance 0.416 0.2076 0.1594 0.1022 0.07168 0.04317
## Cumulative Proportion 0.416 0.6236 0.7830 0.8851 0.95683 1.00000
pca %>% fviz_eig()
pca %>% fviz_pca_biplot()
PCA loadings (eigenvectors) and eigenvalues.
pca$rotation
## PC1 PC2 PC3 PC4
## Bodycote plc -0.34603858 -0.4689933 -0.44006358 -0.4750017
## Hill & Smith Holdings PLC -0.04201706 -0.4723064 0.84557402 -0.2241275
## IMI plc -0.51451693 -0.1478469 -0.08864026 -0.2105137
## Rotork plc -0.46124076 0.4918181 0.12879647 -0.1995112
## Spirax-Sarco Engineering plc -0.40837673 -0.3801979 -0.04891896 0.7932683
## The Weir Group PLC -0.48399339 0.3855876 0.25398804 0.1036571
## PC5 PC6
## Bodycote plc 0.49093753 -0.000565362
## Hill & Smith Holdings PLC 0.06037864 0.079291141
## IMI plc -0.78726662 -0.203597110
## Rotork plc 0.06794391 0.695961409
## Spirax-Sarco Engineering plc 0.07288212 0.227373016
## The Weir Group PLC 0.35442608 -0.645135677
pca$sdev^2
## [1] 2.4961645 1.2454682 0.9563358 0.6129514 0.4300792 0.2590009
Varimax rotation.
varimaxrot <- pca$rotation %>% varimax()
varimaxrot
## $loadings
##
## Loadings:
## PC1 PC2 PC3 PC4 PC5 PC6
## Bodycote plc -1
## Hill & Smith Holdings PLC 1
## IMI plc -1
## Rotork plc 1
## Spirax-Sarco Engineering plc 1
## The Weir Group PLC -1
##
## PC1 PC2 PC3 PC4 PC5 PC6
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
##
## $rotmat
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.4839714 0.3460371235 -0.04201268 -0.40837888 0.51450979
## [2,] -0.3855569 0.4690016627 -0.47230551 -0.38019474 0.14785008
## [3,] -0.2539845 0.4400611233 0.84557494 -0.04891609 0.08864144
## [4,] -0.1036675 0.4749993490 -0.22412707 0.79326742 0.21051229
## [5,] -0.3544180 -0.4909350753 0.06037420 0.07287999 0.78727067
## [6,] 0.6451747 0.0005722849 0.07929358 0.22737892 0.20359811
## [,6]
## [1,] -0.4612714
## [2,] 0.4918365
## [3,] 0.1288061
## [4,] -0.1995172
## [5,] 0.0679630
## [6,] 0.6959228
For taking into account the maximum of the first three components, it seems that a combination of Bodycote, Hill&Smith, and Weir could be fine.
In a linear model.
lm(`FTSE 350 Industrial Engineering` ~ `Bodycote plc` + `Hill & Smith Holdings PLC` + `The Weir Group PLC` -1,
data = returns_daily %>% select(-date)) -> model_pca_daily
tidy(model_pca_daily)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 `Bodycote plc` 0.223 0.0139 16.0 4.44e- 50
## 2 `Hill & Smith Holdings PLC` 0.0858 0.0105 8.19 1.11e- 15
## 3 `The Weir Group PLC` 0.396 0.0109 36.4 1.82e-168
# R-square
glance(model_pca_daily)$r.squared
## [1] 0.8031523
lm(`FTSE 350 Industrial Engineering` ~ `Bodycote plc` + `Hill & Smith Holdings PLC` + `The Weir Group PLC` -1,
data = returns_monthly %>% select(-date)) -> model_pca_monthly
tidy(model_pca_monthly)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 `Bodycote plc` 0.251 0.0494 5.08 1.36e- 5
## 2 `Hill & Smith Holdings PLC` 0.0715 0.0354 2.02 5.10e- 2
## 3 `The Weir Group PLC` 0.428 0.0403 10.6 2.33e-12
# R-square
glance(model_pca_monthly)$r.squared
## [1] 0.8309826
Best fit portfolio return. Look at monthly here.
returns_monthly %>%
mutate(fitted = model_pca_monthly$fitted.values) %>%
select(date, fitted, everything()) -> return_fitted
return_fitted %>%
ggplot(aes(x = `FTSE 350 Industrial Engineering`, y = fitted)) +
geom_point(aes(color = fitted > `FTSE 350 Industrial Engineering`)) +
geom_abline(slope=1, intercept=0) +
geom_smooth(method = "lm") +
theme_tq()
Cumulative return for performance.
data_rets_m_fitted <- return_fitted %>%
gather(name, return_ln, -date) %>%
mutate(type = case_when(name == "fitted" ~ "fitted",
str_detect(name, "FTSE") ~ "index",
T ~ "stock")) %>%
arrange(type, name, date) %>%
group_by(name, type) %>%
mutate(cumret_ln = cumsum(return_ln),
adjusted_ln = exp(cumret_ln))
data_rets_m_fitted %>%
ggplot(aes(x = date, y = adjusted_ln, color = name)) +
geom_line(aes(size = factor(desc(type)), alpha = factor(desc(type)))) +
scale_size_discrete(range = c(0.4,1.4), guide = "none") +
scale_alpha_discrete(range = c(0.4, 1), guide = "none") +
theme_tq()
Tracking error etc. could be computed with tidyquant::tq_performance()
, see https://cran.r-project.org/web/packages/tidyquant/vignettes/TQ05-performance-analysis-with-tidyquant.html