# Global knitr chunk option: echo the source of every code chunk in the
# rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Disabled alternative: tidy the echoed code with a width cutoff of 80.
#knitr::opts_chunk$set(tidy.opts=list(width.cutoff=80), tidy=TRUE)

Summary

In this report, we investigated historical daily closing prices from major stock indices of 24 countries/regions using Multidimensional Scaling (MDS), by introducing methods of measuring the dissimilarities between stock indices. The analysis showed clustering behaviors of the global stock indices. By grouping markets according to distances into Americas, Europe, EM (Emerging Markets) and Asia, we are helping investors better understand and compare different markets.

Reasons for Choice of Data

Investing in stock markets carries a certain degree of risk, so diversifying risk is a key part of portfolio construction. In practice, investors usually buy stocks from different markets to diversify their sources of risk; as a result, classifying markets is of great interest to investors. Institutional economic surveys (like MSCI) provide qualitatively identified market structures, e.g., emerging markets and developed markets. As a market enthusiast, I was particularly interested to know how dissimilar global stock markets are through quantitative techniques, which motivated me to perform MDS on global stock indices data.

Data cleaning and transformation

The original data was downloaded from link, which encompasses the historical data, reaching back to the early 1920s (where available) until February of 2024, of 35 major stock indices from around the world. To clean and transform the data into the format suitable for MDS, we performed the following steps:

  1. Import the data, filter out rows with dates before January 1st, 2023
  2. Keep only the relevant columns of dates, close prices and tickers.
  3. Rename the stock indices to their corresponding countries for clarity and remove indices that are not representative.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
# Load the raw index history, parse the date column into Date objects, and
# keep only observations dated 2023-01-01 or later.
stock_indices.df <- read.csv(file = "all_indices_data.csv") %>%
  mutate(date = as.Date(date)) %>%
  filter(date >= as.Date("2023-01-01"))

# Reshape to wide format: one row per date and one column of closing prices
# per ticker. Only date, close and ticker are carried over.
cleaned_stock_indices_wide <- stock_indices.df %>%
  select(date, close, ticker) %>%
  pivot_wider(names_from = ticker, values_from = close) %>%
  # Drop every date on which at least one index has no closing price, so
  # that all series share the same set of dates.
  na.omit()
library(dplyr)
# Relabel each ticker column with a short country/region code for clarity.
# In the lookup below, each name is the new label and each value is the
# ticker symbol currently used as the column header.
new_stock_indices <- cleaned_stock_indices_wide %>%
  rename(all_of(c(
    USA1 = "^GSPC",      # S&P 500
    USA2 = "^DJI",       # Dow Jones Industrial Average
    USA3 = "^IXIC",      # NASDAQ Composite
    USA4 = "^NYA",       # NYSE Composite
    USA5 = "^XAX",       # NYSE AMEX Composite Index
    USA6 = "^RUT",       # Russell 2000
    USA7 = "^VIX",       # CBOE Volatility Index
    UK   = "^FTSE",      # FTSE 100
    UK2  = "^BUK100P",   # Second UK index (presumably Cboe UK 100; verify)
    GER  = "^GDAXI",     # DAX
    FRA  = "^FCHI",      # CAC 40
    EU   = "^STOXX50E",  # EURO STOXX 50
    EU2  = "^N100",      # Euronext 100
    BEL  = "^BFX",       # BEL 20
    RUS  = "IMOEX.ME",   # MOEX Russia Index
    JPN  = "^N225",      # Nikkei 225
    HK   = "^HSI",       # Hang Seng Index
    CHN  = "000001.SS",  # Shanghai Composite Index
    CHN2 = "399001.SZ",  # Shenzhen index (presumably SZSE Component; verify)
    SGP  = "^STI",       # Straits Times Index
    AUS  = "^AXJO",      # ASX 200
    AUS2 = "^AORD",      # All Ordinaries
    IND  = "^BSESN",     # BSE SENSEX
    IDN  = "^JKSE",      # Jakarta Composite Index
    MYS  = "^KLSE",      # FTSE Bursa Malaysia KLCI
    NZ   = "^NZ50",      # NZX 50 Index
    KOR  = "^KS11",      # KOSPI
    TWN  = "^TWII",      # Taiwan Capitalization Weighted Stock Index
    CAN  = "^GSPTSE",    # S&P/TSX Composite Index
    BRA  = "^BVSP",      # Bovespa Index
    MEX  = "^MXX",       # IPC (Mexico)
    ARG  = "^MERV",      # MERVAL (Argentina)
    ISR  = "^TA125.TA",  # Tel Aviv 125
    SA   = "^JN0U.JO"    # Johannesburg Stock Exchange (JSE) Top 40
  )))

# Drop the series judged non-representative: the extra US, UK, China, EU and
# Australia indices, plus Russia.
reduced_stock_indices <- new_stock_indices %>%
  select(-c(USA5, USA6, USA7, UK2, CHN2, EU2, AUS2, RUS))

# head(reduced_stock_indices)

Exploratory Data Analysis

Then we performed EDA by plotting the performance of each stock index over time.

# Draw every index in its own panel of a single figure. Each panel gets its
# own y-axis because index levels differ by orders of magnitude.
library(ggplot2)

# Long format: one row per (date, index) pair, as ggplot2 expects.
reduced_stock_indices_long <- pivot_longer(
  reduced_stock_indices,
  cols = -date,
  names_to = "ticker",
  values_to = "close"
)

ggplot(reduced_stock_indices_long) +
  geom_line(aes(x = date, y = close, color = ticker)) +
  # Five panels per row; adjust `ncol` to change the grid layout.
  facet_wrap(vars(ticker), scales = "free_y", ncol = 5) +
  labs(title = "Stock Indices Over Time", x = "Date", y = "Index Value") +
  # The panel strips already name each index, so the colour legend is hidden.
  theme(legend.position = "none")

MDS

In order to do classical MDS, we need a measure of pairwise dissimilarities between stock indices (usually a metric). As we could not obtain meaningful observations from the plot [@Figure2] generated from the default Euclidean distances (\(E_{\bf A, \bf B} = \sqrt{\sum_{i = 1}^n (A_i - B_i)^2}\) for stocks \(\bf A\) and \(\bf B\)), we explored alternative measures of dissimilarity.

library(stats) # Provides dist() and cmdscale()

# Work on log closing prices; the first column (date) is not an index.
data_for_mds <- log(reduced_stock_indices[, -1])
#data_for_mds <- select(-ARG)

# dist() measures distances between rows, so transpose to put one index on
# each row (dates become the coordinates).
data_for_mds_transposed <- t(data_for_mds)

# Classical MDS on the Euclidean distances, projected onto two dimensions.
mds_result <- cmdscale(dist(data_for_mds_transposed), k = 2)

# The row names of the configuration are the index labels.
Ticker <- rownames(mds_result)

# Set up empty axes, then write each index label just above its coordinates.
plot(
  mds_result[, 1], mds_result[, 2],
  type = "n",
  xlab = "Dimension 1",
  ylab = "Dimension 2",
  main = "MDS Plot"
)
text(mds_result[, 1], mds_result[, 2], labels = Ticker, pos = 3, col = "blue")

In mathematical finance, the daily log returns of assets \(Y_i\) (the daily differences of log prices) are often of interest, as they display a high cross-dependence, even across industries and asset classes.

\[ Y_i = \log S_{i(t)} - \log S_{i(t-1)} \]

where \(S_{i(t)}\) is the price of the \(i\)-th stock at time \(t\).

Pairwise similarity of stocks is commonly measured by the correlation coefficients: \(\rho _{i,j} = Cor(Y_i, Y_j)\) [@figure3], which ranges from \(-1\) to \(1\).

The correlation coefficient cannot be used as a distance for @cmdscale because it is not a metric. @Mantengna proposed using an appropriate function of the correlation coefficient as the distance:

\[ d(i, j) = 1 - \rho ^2 _{i, j} \]

which was proved to be a valid distance function that fulfills the three axioms of a metric. Hence, we implemented the introduced distance to perform MDS in R:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.1     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stats)


# Log returns, correlation-based dissimilarities and classical MDS.
#
# Steps: log closing prices -> daily log returns -> correlation matrix ->
# dissimilarity d(i, j) = 1 - rho(i, j)^2 -> cmdscale() with eigenvalues.

# Log closing prices for every index (the date column is dropped).
log_stock_indices <- log(subset(reduced_stock_indices, select = -date))

# Daily log return = difference of consecutive log prices. The leading NA
# keeps each column at its original length inside mutate(); the resulting
# first row is then removed.
log_returns <- log_stock_indices %>%
  mutate(across(everything(), ~ c(NA, diff(.)))) %>%
  na.omit()

# Calculate the correlation matrix of the log returns
cor_matrix <- cor(log_returns, use = "pairwise.complete.obs")

# Transform the correlation matrix into a dissimilarity matrix
dissimilarity_matrix <- as.dist(1 - cor_matrix^2)
## corrplot 0.92 loaded

# Run classical MDS once, keeping the eigenvalues. `$points` is the same
# 10-dimensional configuration that cmdscale(dissimilarity_matrix, k = 10)
# returns, so the decomposition does not need to be computed twice.
mds_results_eig <- cmdscale(dissimilarity_matrix, k = 10, eig = TRUE)
mds_results <- mds_results_eig$points
#mds_results_square$eig
mds_results_eig$eig
##  [1]  1.676225e+00  1.043916e+00  9.444641e-01  6.620423e-01  6.206995e-01
##  [6]  5.641395e-01  5.109566e-01  4.882454e-01  3.951860e-01  3.596383e-01
## [11]  3.273543e-01  2.809498e-01  2.661121e-01  2.198200e-01  1.801876e-01
## [16]  1.513577e-01  1.168845e-01  9.694066e-02  5.027924e-02  1.938739e-02
## [21]  6.794573e-03  6.245005e-17 -8.151172e-03 -1.157695e-02 -2.284996e-02
## [26] -3.937078e-02

# Eigenvalue plot. The x-axis is derived from the eigenvalue vector itself,
# so the plot keeps working when the number of indices changes.
plot(seq_along(mds_results_eig$eig), mds_results_eig$eig,
     xlab = "Eigenvalue number",
     ylab = "Eigenvalue")
abline(h = 0, lty = 2)
# NOTE(review): 22 is the position of the (numerically) zero eigenvalue in the
# output above; revisit this marker if the set of indices changes.
abline(v = 22, lty = 2, col = 2)

# Two-dimensional MDS map of the indices, coloured by manually assigned
# regional cluster.

# Keep only the first two MDS coordinates. `mds_results` was computed with
# k = 10, so assigning two names to the full matrix would leave the remaining
# columns named NA.
mds_df <- as.data.frame(mds_results[, 1:2])
names(mds_df) <- c("Dim1", "Dim2")

# Move the index labels from the row names into a regular column.
mds_df <- cbind(Ticker = rownames(mds_df), mds_df)
rownames(mds_df) <- NULL

# Manually assigned cluster for each index, keyed by index label so that the
# assignment does not depend on the column order of the input data.
cluster_assignments <- c(
  USA1 = "Americas", USA2 = "Americas", USA3 = "Americas", USA4 = "Americas",
  UK = "Europe", GER = "Europe", FRA = "Europe", EU = "Europe",
  BEL = "Europe",
  JPN = "Asia", HK = "Asia", CHN = "Asia", SGP = "Asia", AUS = "Asia",
  IND = "EM",
  IDN = "Asia", MYS = "Asia", NZ = "Asia", KOR = "Asia", TWN = "Asia",
  CAN = "Americas",
  BRA = "EM", MEX = "EM", ARG = "EM", ISR = "EM", SA = "EM"
)

mds_df$Cluster <- unname(cluster_assignments[as.character(mds_df$Ticker)])

# Fail loudly rather than plot an index without a cluster.
if (anyNA(mds_df$Cluster)) {
  stop(
    "No cluster assigned for: ",
    paste(mds_df$Ticker[is.na(mds_df$Cluster)], collapse = ", "),
    call. = FALSE
  )
}

# Now plot the MDS results with manually assigned clusters
ggplot(mds_df, aes(x = Dim1, y = Dim2, label = Ticker, color = Cluster)) +
  # check_overlap = TRUE hides labels that would overlap an earlier label.
  geom_text(check_overlap = TRUE) +
  # Set colors for clusters
  scale_color_manual(
    values = c(
      "Americas" = "purple",
      "EM" = "orange",
      "Asia" = "red",
      "Europe" = "blue"
    )
  ) +
  theme_minimal() +
  labs(title = "Multidimensional Scaling of Stock Indices Based on Log Returns",
       x = "Dimension 1",
       y = "Dimension 2") +
  theme(legend.position = "right")