library(tidyverse)
library(tidyquant)
library(quantmod)
library(lubridate)
library(broom)      # for tidying up linear models
library(factoextra)

Data

FTSE 350 Industrial Engineering index. The index components are Bodycote, Hill&Smith, IMI, Rotork, Spirax-Sarco Engineering, and Weir Group (components).

index <- read_csv("FTSE 350 Industrial Engineering Historical Data.csv")

index %<>%
  rename(date = Date) %>% 
  mutate(date = mdy(date),
         adjusted = Price) %>% 
  select(-Vol., -`Change %`)

# gets data from Yahoo Finance
bodycote   <- tq_get("BOY.L")
hill_smith <- tq_get("HILS.L")
imi        <- tq_get("IMI.L")
rotork     <- tq_get("ROR.L")
spirax_s   <- tq_get("SPX.L")
weir       <- tq_get("WEIR.L")

data <- bind_rows(index %>%      mutate(type = "index", name = "FTSE 350 Industrial Engineering"), 
                  bodycote %>%   mutate(type = "stock", name = "Bodycote plc"), 
                  hill_smith %>% mutate(type = "stock", name = "Hill & Smith Holdings PLC"), 
                  imi %>%        mutate(type = "stock", name = "IMI plc"), 
                  rotork %>%     mutate(type = "stock", name = "Rotork plc"), 
                  spirax_s %>%   mutate(type = "stock", name = "Spirax-Sarco Engineering plc"), 
                  weir %>%       mutate(type = "stock", name = "The Weir Group PLC"))

Compute the returns.

# compute returns (daily)
data_rets <- data %>% 
  select(date, name, type, adjusted) %>% 
  arrange(type, name, date) %>% 
  group_by(name, type) %>%
  mutate(return = adjusted/lag(adjusted)-1,
         return_ln = log(adjusted/lag(adjusted))) %>% 
  filter(!is.na(return_ln),
         # cut-off date
         date >= "2016-04-15") %>% 
  mutate(cumret_ln = cumsum(return_ln),
         adjusted_ln = exp(cumret_ln))

data_rets %>% 
  ggplot(aes(x = date, y = adjusted_ln, color = name)) +
  geom_line(aes(size = factor(desc(type)))) +
  scale_size_discrete(range = c(0.4,1.4), guide = "none") +
  theme_tq()

# compute returns (monthly)
data_rets_m <- data %>% 
  select(date, name, type, adjusted) %>% 
  arrange(type, name, date) %>% 
  group_by(name, type, date = floor_date(date, unit = "months") %>% date()) %>%
  summarise(adjusted = first(adjusted)) %>% 
  mutate(return = adjusted/lag(adjusted)-1,
         return_ln = log(adjusted/lag(adjusted))) %>% 
  filter(!is.na(return_ln),
         # cut-off date
         date >= "2016-04-01") %>% 
  mutate(cumret_ln = cumsum(return_ln),
         adjusted_ln = exp(cumret_ln))

data_rets_m %>% 
  ggplot(aes(x = date, y = adjusted_ln, color = name)) +
  geom_line(aes(size = factor(desc(type)))) +
  scale_size_discrete(range = c(0.4,1.4), guide = "none") +
  theme_tq()

Spread format for modelling.

data_rets %>%
  ungroup() %>% 
  select(date, name, return_ln) %>% 
  filter(date >= "2016-04-15") %>% 
  spread(name, return_ln) -> returns_daily

data_rets_m %>%
  ungroup() %>% 
  select(date, name, return_ln) %>% 
  filter(date >= "2016-04-01") %>% 
  spread(name, return_ln) -> returns_monthly

Linear model. The estimates can be seen as weights.

lm(`FTSE 350 Industrial Engineering` ~ . -1, data = returns_daily %>% 
     select(-date)) -> model

tidy(model)
## # A tibble: 6 x 5
##   term                           estimate std.error statistic   p.value
##   <chr>                             <dbl>     <dbl>     <dbl>     <dbl>
## 1 `Bodycote plc`                   0.0925   0.00353      26.3 1.99e-108
## 2 `Hill & Smith Holdings PLC`      0.0525   0.00249      21.1 7.50e- 78
## 3 `IMI plc`                        0.180    0.00416      43.1 6.99e-206
## 4 `Rotork plc`                     0.150    0.00362      41.5 4.92e-197
## 5 `Spirax-Sarco Engineering plc`   0.254    0.00477      53.2 1.58e-257
## 6 `The Weir Group PLC`             0.246    0.00293      84.2 0.

Estimated/weights shoud approximately sum up to 1.

tidy(model) %>% 
  summarise(total = sum(estimate))
## # A tibble: 1 x 1
##   total
##   <dbl>
## 1 0.975
# note R-square of .99
glance(model)$r.squared
## [1] 0.9890439

Correlation plots, daily and monthly.

data_rets %>% 
  ungroup() %>% 
  select(date, name, return_ln) %>% 
  spread(name, return_ln) %>% 
  select(-date) %>% 
  chart.Correlation()

data_rets_m %>% 
  ungroup() %>% 
  select(date, name, return_ln) %>% 
  spread(name, return_ln) %>% 
  select(-date) %>% 
  chart.Correlation()

PCA

PCA on monthly log returns.

data_rets_m %>%
  filter(type != "index") %>% 
  ungroup() %>% 
  select(date, name, return_ln) %>% 
  # fill to remove the misssing values
  spread(name, return_ln, fill = 0) %>% 
  select(-date) %>% 
  as.matrix() %>% 
  prcomp(scale. = T, center = T) -> pca

pca %>% summary()
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.580 1.1160 0.9779 0.7829 0.65580 0.50892
## Proportion of Variance 0.416 0.2076 0.1594 0.1022 0.07168 0.04317
## Cumulative Proportion  0.416 0.6236 0.7830 0.8851 0.95683 1.00000
pca %>% fviz_eig()

pca %>% fviz_pca_biplot()

PCA loadings (eigenvectors) and eigenvalues.

pca$rotation
##                                      PC1        PC2         PC3        PC4
## Bodycote plc                 -0.34603858 -0.4689933 -0.44006358 -0.4750017
## Hill & Smith Holdings PLC    -0.04201706 -0.4723064  0.84557402 -0.2241275
## IMI plc                      -0.51451693 -0.1478469 -0.08864026 -0.2105137
## Rotork plc                   -0.46124076  0.4918181  0.12879647 -0.1995112
## Spirax-Sarco Engineering plc -0.40837673 -0.3801979 -0.04891896  0.7932683
## The Weir Group PLC           -0.48399339  0.3855876  0.25398804  0.1036571
##                                      PC5          PC6
## Bodycote plc                  0.49093753 -0.000565362
## Hill & Smith Holdings PLC     0.06037864  0.079291141
## IMI plc                      -0.78726662 -0.203597110
## Rotork plc                    0.06794391  0.695961409
## Spirax-Sarco Engineering plc  0.07288212  0.227373016
## The Weir Group PLC            0.35442608 -0.645135677
pca$sdev^2
## [1] 2.4961645 1.2454682 0.9563358 0.6129514 0.4300792 0.2590009

Varimax

Varimax rotation.

varimaxrot <- pca$rotation %>% varimax()

varimaxrot
## $loadings
## 
## Loadings:
##                              PC1 PC2 PC3 PC4 PC5 PC6
## Bodycote plc                     -1                 
## Hill & Smith Holdings PLC             1             
## IMI plc                                      -1     
## Rotork plc                                        1 
## Spirax-Sarco Engineering plc              1         
## The Weir Group PLC           -1                     
## 
##                  PC1   PC2   PC3   PC4   PC5   PC6
## SS loadings    1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
## 
## $rotmat
##            [,1]          [,2]        [,3]        [,4]       [,5]
## [1,]  0.4839714  0.3460371235 -0.04201268 -0.40837888 0.51450979
## [2,] -0.3855569  0.4690016627 -0.47230551 -0.38019474 0.14785008
## [3,] -0.2539845  0.4400611233  0.84557494 -0.04891609 0.08864144
## [4,] -0.1036675  0.4749993490 -0.22412707  0.79326742 0.21051229
## [5,] -0.3544180 -0.4909350753  0.06037420  0.07287999 0.78727067
## [6,]  0.6451747  0.0005722849  0.07929358  0.22737892 0.20359811
##            [,6]
## [1,] -0.4612714
## [2,]  0.4918365
## [3,]  0.1288061
## [4,] -0.1995172
## [5,]  0.0679630
## [6,]  0.6959228

For taking into account the maximum of the first three components, it seems that a combination of Bodycote, Hill&Smith, and Weir could be fine.

In a linear model.

lm(`FTSE 350 Industrial Engineering` ~ `Bodycote plc` + `Hill & Smith Holdings PLC` + `The Weir Group PLC` -1, 
   data = returns_daily %>% select(-date)) -> model_pca_daily

tidy(model_pca_daily)
## # A tibble: 3 x 5
##   term                        estimate std.error statistic   p.value
##   <chr>                          <dbl>     <dbl>     <dbl>     <dbl>
## 1 `Bodycote plc`                0.223     0.0139     16.0  4.44e- 50
## 2 `Hill & Smith Holdings PLC`   0.0858    0.0105      8.19 1.11e- 15
## 3 `The Weir Group PLC`          0.396     0.0109     36.4  1.82e-168
# R-square
glance(model_pca_daily)$r.squared
## [1] 0.8031523
lm(`FTSE 350 Industrial Engineering` ~ `Bodycote plc` + `Hill & Smith Holdings PLC` + `The Weir Group PLC` -1, 
   data = returns_monthly %>% select(-date)) -> model_pca_monthly

tidy(model_pca_monthly)
## # A tibble: 3 x 5
##   term                        estimate std.error statistic  p.value
##   <chr>                          <dbl>     <dbl>     <dbl>    <dbl>
## 1 `Bodycote plc`                0.251     0.0494      5.08 1.36e- 5
## 2 `Hill & Smith Holdings PLC`   0.0715    0.0354      2.02 5.10e- 2
## 3 `The Weir Group PLC`          0.428     0.0403     10.6  2.33e-12
# R-square
glance(model_pca_monthly)$r.squared
## [1] 0.8309826

Best fit portfolio return. Look at monthly here.

returns_monthly %>% 
  mutate(fitted = model_pca_monthly$fitted.values) %>% 
  select(date, fitted, everything()) -> return_fitted

return_fitted %>% 
  ggplot(aes(x = `FTSE 350 Industrial Engineering`, y = fitted)) +
  geom_point(aes(color = fitted > `FTSE 350 Industrial Engineering`)) +
  geom_abline(slope=1, intercept=0) +
  geom_smooth(method = "lm") +
  theme_tq()

Cumulative return for performance.

data_rets_m_fitted <- return_fitted %>%
  gather(name, return_ln, -date) %>% 
  mutate(type = case_when(name == "fitted"        ~ "fitted",
                          str_detect(name, "FTSE") ~ "index",
                          T ~ "stock")) %>% 
  arrange(type, name, date) %>% 
  group_by(name, type) %>%
  mutate(cumret_ln = cumsum(return_ln),
         adjusted_ln = exp(cumret_ln))

data_rets_m_fitted %>% 
  ggplot(aes(x = date, y = adjusted_ln, color = name)) +
  geom_line(aes(size = factor(desc(type)), alpha = factor(desc(type)))) +
  scale_size_discrete(range = c(0.4,1.4), guide = "none") +
  scale_alpha_discrete(range = c(0.4, 1),  guide = "none") +
  theme_tq()

Tracking error etc. could be computed with tidyquant::tq_performance(), see https://cran.r-project.org/web/packages/tidyquant/vignettes/TQ05-performance-analysis-with-tidyquant.html