# Knitr setup: echo the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Disabled alternative: tidy echoed source with an 80-character width cutoff.
#knitr::opts_chunk$set(tidy.opts=list(width.cutoff=80), tidy=TRUE)
In this report, we investigated historical daily closing prices from major stock indices of 24 countries/regions using Multidimensional Scaling (MDS), by introducing methods of measuring the dissimilarities between stock indices. The analysis showed clustering behaviors of the global stock indices. By grouping markets according to distances into Americas, Europe, EM (Emerging Markets) and Asia, we are helping investors better understand and compare different markets.
Investing in stock markets carries a certain degree of risk, and diversifying that risk is a key part of portfolio construction. In practice, investors usually buy stocks from different markets to diversify the source of risk; as a result, classifying markets is of great interest to investors. Institutional economic surveys (like MSCI) provide qualitatively identified market structures, e.g., emerging markets and developed markets. As a market enthusiast, I was particularly interested to know how dissimilar global stock markets are through quantitative techniques, which motivated me to perform MDS on global stock indices data.
The original data was downloaded from link, which encompasses the historical data, reaching back to the early 1920s (where available) until February of 2024, of 35 major stock indices from around the world. To clean and transform the data into the format suitable for MDS, we performed the following steps:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
# Load the raw index data, parse the date column as Date, and keep only the
# observations dated 2023-01-01 or later.
stock_indices.df <- read.csv(file = "all_indices_data.csv") %>%
  mutate(date = as.Date(date)) %>%
  filter(date >= as.Date("2023-01-01"))

# Reshape to wide format: one row per date and one column of closing prices
# per ticker. Dates on which any ticker has a missing close are dropped.
cleaned_stock_indices_wide <- na.omit(
  pivot_wider(
    select(stock_indices.df, date, close, ticker),
    names_from = ticker,
    values_from = close
  )
)
library(dplyr)
# Lookup from short country/region label (new column name) to the raw ticker
# symbol (existing column name); used to rename the columns for clarity.
index_labels <- c(
  USA1 = "^GSPC",     # S&P 500
  USA2 = "^DJI",      # Dow Jones Industrial Average
  USA3 = "^IXIC",     # NASDAQ Composite
  USA4 = "^NYA",      # NYSE Composite
  USA5 = "^XAX",      # NYSE AMEX Composite Index
  USA6 = "^RUT",      # Russell 2000
  USA7 = "^VIX",      # CBOE Volatility Index
  UK   = "^FTSE",     # FTSE 100
  UK2  = "^BUK100P",  # Cboe UK 100 (second UK large-cap index)
  GER  = "^GDAXI",    # DAX
  FRA  = "^FCHI",     # CAC 40
  EU   = "^STOXX50E", # EURO STOXX 50
  EU2  = "^N100",     # Euronext 100
  BEL  = "^BFX",      # BEL 20
  RUS  = "IMOEX.ME",  # MOEX Russia Index
  JPN  = "^N225",     # Nikkei 225
  HK   = "^HSI",      # Hang Seng Index
  CHN  = "000001.SS", # Shanghai Composite Index
  CHN2 = "399001.SZ", # Shenzhen Component Index
  SGP  = "^STI",      # Straits Times Index
  AUS  = "^AXJO",     # ASX 200
  AUS2 = "^AORD",     # All Ordinaries
  IND  = "^BSESN",    # BSE SENSEX
  IDN  = "^JKSE",     # Jakarta Composite Index
  MYS  = "^KLSE",     # FTSE Bursa Malaysia KLCI
  NZ   = "^NZ50",     # NZX 50 Index
  KOR  = "^KS11",     # KOSPI
  TWN  = "^TWII",     # Taiwan Capitalization Weighted Stock Index
  CAN  = "^GSPTSE",   # S&P/TSX Composite Index
  BRA  = "^BVSP",     # Bovespa Index
  MEX  = "^MXX",      # IPC (Mexico)
  ARG  = "^MERV",     # MERVAL (Argentina)
  ISR  = "^TA125.TA", # Tel Aviv 125
  SA   = "^JN0U.JO"   # Johannesburg Stock Exchange (JSE) Top 40
)
new_stock_indices <- cleaned_stock_indices_wide %>%
  rename(all_of(index_labels))

# Drop the indices considered non-representative (extra indices for an
# already-covered market, the volatility index, and the Russian index).
non_representative <- c(
  "USA5", "USA6", "USA7", "UK2", "CHN2", "EU2", "AUS2", "RUS"
)
reduced_stock_indices <- select(new_stock_indices, -all_of(non_representative))
# head(reduced_stock_indices)
Then we performed EDA by plotting the performance of each stock index over time.
library(ggplot2)

# Long format for plotting: one row per (date, index) pair.
reduced_stock_indices_long <- pivot_longer(
  reduced_stock_indices,
  cols = -date,
  names_to = "ticker",
  values_to = "close"
)

# One panel per index, five panels per row. Each panel gets its own y scale
# because the index levels differ by orders of magnitude; the legend is
# hidden since the panel titles already identify the index.
ggplot(
  reduced_stock_indices_long,
  aes(x = date, y = close, color = ticker)
) +
  geom_line() +
  facet_wrap(~ticker, scales = "free_y", ncol = 5) +
  labs(
    title = "Stock Indices Over Time",
    x = "Date",
    y = "Index Value"
  ) +
  theme(legend.position = "none")
In order to do classical MDS, we need a measure of pairwise dissimilarities between stock indices (usually a metric). As we could not obtain meaningful observations from the plot [@Figure2] generated from the default Euclidean distances (\(E_{\bf A,B} = \sqrt{\sum_{i = 1}^n(A_i - B_i)^2}\) for stocks \(\bf A\) and \(\bf B\)), we explored alternative measures of dissimilarities.
library(stats) # For cmdscale function

# Log-transform the index levels (all columns except the leading date column)
# and transpose so that each row corresponds to one index.
data_for_mds <- log(reduced_stock_indices[, -1])
data_for_mds_transposed <- t(data_for_mds)

# Classical MDS in two dimensions on the Euclidean distances between the
# log-level series of the indices.
mds_result <- cmdscale(
  dist(data_for_mds_transposed, method = "euclidean"),
  k = 2
)

# Draw empty axes first, then place each index label at its MDS coordinates.
plot(
  mds_result,
  type = "n",
  main = "MDS Plot",
  xlab = "Dimension 1",
  ylab = "Dimension 2"
)
Ticker <- rownames(mds_result)
text(mds_result, labels = Ticker, col = "blue", pos = 3)
In mathematical finance, the daily log returns of assets \(Y_i\), i.e., the daily differences of log prices, are often of interest as they display a high cross-dependence, even across industries and asset classes.
\[ Y_i = \log S_{i(t)} - \log S_{i(t-1)} \]
where \(S_{i(t)}\) is the stock price of the \(i\)-th stock at time \(t\).
Pairwise similarity of stocks is commonly measured by the correlation coefficients: \(\rho _{i,j} = Cor(Y_i, Y_j)\) [@figure3], which ranges from \(-1\) to \(1\).
The correlation coefficient cannot be used directly as a distance for @cmdscale because it is not a metric. @Mantengna proposed using as a distance an appropriate function of the correlation coefficient:
\[ d(i, j) = 1 - \rho ^2 _{i, j} \]
which was proved to be a valid distance function that fulfills the three axioms of a metric. Hence, we implemented the introduced distance to perform MDS in R:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.1 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stats)
# Log index levels, with the date column removed.
log_stock_indices <- log(subset(reduced_stock_indices, select = -date))

# Daily log returns: first difference of each log series. The leading NA
# keeps the column length unchanged; that first row is then dropped.
log_returns <- na.omit(
  mutate(
    log_stock_indices,
    across(everything(), function(x) c(NA, diff(x)))
  )
)

# Pairwise correlations between the log-return series of the indices.
cor_matrix <- cor(log_returns, use = "pairwise.complete.obs")

# Dissimilarity d(i, j) = 1 - rho_ij^2, stored as a 'dist' object.
dissimilarity_matrix <- as.dist(1 - cor_matrix^2)
## corrplot 0.92 loaded
# Run classical MDS once, keeping 10 dimensions and asking for the
# eigenvalues. With eig = TRUE cmdscale() returns a list whose `points`
# element is the same coordinate matrix that a call without eig = TRUE
# returns, so a second cmdscale() call is not needed.
mds_results_eig <- cmdscale(dissimilarity_matrix, k = 10, eig = TRUE)
mds_results <- mds_results_eig$points
# Print the eigenvalues of the doubly centred dissimilarity matrix.
mds_results_eig$eig
## [1] 1.676225e+00 1.043916e+00 9.444641e-01 6.620423e-01 6.206995e-01
## [6] 5.641395e-01 5.109566e-01 4.882454e-01 3.951860e-01 3.596383e-01
## [11] 3.273543e-01 2.809498e-01 2.661121e-01 2.198200e-01 1.801876e-01
## [16] 1.513577e-01 1.168845e-01 9.694066e-02 5.027924e-02 1.938739e-02
## [21] 6.794573e-03 6.245005e-17 -8.151172e-03 -1.157695e-02 -2.284996e-02
## [26] -3.937078e-02
# Scree plot of the MDS eigenvalues. The x positions are derived from the
# number of eigenvalues rather than a hard-coded 1:26, so the plot still
# works if the set of indices changes.
plot(
  seq_along(mds_results_eig$eig), mds_results_eig$eig,
  xlab = "Eigenvalue number",
  ylab = "Eigenvalue"
)
# Horizontal reference line at zero.
abline(h = 0, lty = 2)
# Vertical marker at eigenvalue 22, which is numerically zero in the printed
# eigenvalues for this data set; later eigenvalues are negative.
# NOTE(review): this position is data-specific - re-check if the data change.
abline(v = 22, lty = 2, col = 2)
# Convert the MDS coordinates to a data frame for plotting.
mds_df <- as.data.frame(mds_results)
# Name every retained dimension. Assigning only c("Dim1", "Dim2") to a
# coordinate matrix with more than two columns leaves the rest named NA.
names(mds_df) <- paste0("Dim", seq_len(ncol(mds_df)))
# Carry the index labels (row names) along as an ordinary column.
mds_df <- cbind(Ticker = rownames(mds_df), mds_df)
rownames(mds_df) <- NULL

# Manually chosen region for each index, keyed by index label so that the
# assignment does not depend on the column order of the input data.
cluster_lookup <- c(
  USA1 = "Americas", USA2 = "Americas", USA3 = "Americas", USA4 = "Americas",
  UK = "Europe", GER = "Europe", FRA = "Europe", EU = "Europe", BEL = "Europe",
  JPN = "Asia", HK = "Asia", CHN = "Asia", SGP = "Asia", AUS = "Asia",
  IND = "EM",
  IDN = "Asia", MYS = "Asia", NZ = "Asia", KOR = "Asia", TWN = "Asia",
  CAN = "Americas",
  BRA = "EM", MEX = "EM", ARG = "EM", ISR = "EM", SA = "EM"
)
# Fail loudly instead of plotting an index with a missing cluster.
unassigned <- setdiff(mds_df$Ticker, names(cluster_lookup))
if (length(unassigned) > 0) {
  stop(
    "No cluster assigned for: ", paste(unassigned, collapse = ", "),
    call. = FALSE
  )
}
cluster_assignments <- unname(cluster_lookup[mds_df$Ticker])
mds_df$Cluster <- cluster_assignments

# Plot the first two MDS dimensions, labelling each point with its index
# name and colouring it by the manually assigned cluster.
ggplot(mds_df, aes(x = Dim1, y = Dim2, label = Ticker, color = Cluster)) +
  geom_text(aes(label = Ticker), check_overlap = TRUE) +
  scale_color_manual(
    values = c(
      "Americas" = "purple", "EM" = "orange", "Asia" = "red", "Europe" = "blue"
    )
  ) +
  theme_minimal() +
  labs(
    title = "Multidimensional Scaling of Stock Indices Based on Log Returns",
    x = "Dimension 1",
    y = "Dimension 2"
  ) +
  theme(legend.position = "right")