library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(quantmod)
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(xts)
library(quantmod)
# Set date range
begin_date <- "2013-01-01"
end_date <- "2015-12-31"
# Valid tickers only (AEZS removed)
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "A", "APD", "AA", "CF")
# Safely get adjusted close prices
data_set <- xts()
for (symbol in stock_namelist) {
tryCatch({
stock_data <- Ad(getSymbols(symbol, from = begin_date, to = end_date, auto.assign = FALSE))
data_set <- cbind(data_set, stock_data)
}, error = function(e) {
message(paste("Skipping:", symbol, "— not available."))
})
}
# Rename columns to match tickers
colnames(data_set) <- stock_namelist
# Calculate log returns
X <- diff(log(data_set), na.pad = FALSE)
# Output: basic structure of X
print("✅ First few rows of the log return matrix X:")
## [1] "✅ First few rows of the log return matrix X:"
print(head(X))
## AAPL AMD ADI ABBV A
## 2013-01-03 -0.012702130 -0.015936577 -0.016267183 -0.008291949 0.003575210
## 2013-01-04 -0.028249767 0.039375128 -0.017947752 -0.012713278 0.019555361
## 2013-01-07 -0.005899884 0.030420658 0.003052596 0.002033576 -0.007258952
## 2013-01-08 0.002687529 0.000000000 -0.010369664 -0.022004159 -0.008022975
## 2013-01-09 -0.015752018 -0.015094611 -0.002609632 0.005619964 0.026649658
## 2013-01-10 0.012319353 -0.003809616 0.012041190 0.002945463 0.007354546
## APD AA CF
## 2013-01-03 -0.0035001834 0.008859295 -0.004739890
## 2013-01-04 0.0133509236 0.020731705 0.022151869
## 2013-01-07 -0.0009232401 -0.017429588 -0.003753371
## 2013-01-08 0.0018456287 0.000000000 -0.014768825
## 2013-01-09 0.0133908804 -0.002199977 0.034376250
## 2013-01-10 -0.0009101770 -0.012188553 0.014917238
# Show dimensions
cat("\n✅ Dimensions of X: ", nrow(X), "rows ×", ncol(X), "columns\n")
##
## ✅ Dimensions of X: 754 rows × 8 columns
# Step 2: Read and clean Fama-French 3-Factor Data from uploaded CSV
library(readr)
library(dplyr)
library(lubridate)
library(xts)
# Read CSV skipping metadata rows
ff_data <- read_csv("F-F_Research_Data_Factors_daily.csv", skip = 4)
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 25962 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (4): Mkt-RF, SMB, HML, RF
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove bottom rows (footer), usually starts with "Annual Factors"
ff_data <- ff_data[!is.na(as.numeric(ff_data[[1]])), ]
## Warning in `[.tbl_df`(ff_data, !is.na(as.numeric(ff_data[[1]])), ): NAs
## introduced by coercion
# Rename columns
colnames(ff_data)[1:5] <- c("Date", "Mkt_RF", "SMB", "HML", "RF")
# Convert types
ff_data <- ff_data %>%
mutate(
Date = ymd(as.character(Date)),
Mkt_RF = as.numeric(Mkt_RF) / 100,
SMB = as.numeric(SMB) / 100,
HML = as.numeric(HML) / 100,
RF = as.numeric(RF) / 100
)
# Create xts object and align with stock return dates
fama_lib <- xts(ff_data[, c("Mkt_RF", "SMB", "HML", "RF")], order.by = ff_data$Date)
F_FamaFrench <- fama_lib[index(X)]
# Preview result
print("✅ First few rows of Fama-French factors aligned with X:")
## [1] "✅ First few rows of Fama-French factors aligned with X:"
print(head(F_FamaFrench))
## Mkt_RF SMB HML RF
## 2013-01-03 -0.0013 0.0010 0.0005 0
## 2013-01-04 0.0055 0.0012 0.0037 0
## 2013-01-07 -0.0031 -0.0009 -0.0036 0
## 2013-01-08 -0.0026 0.0005 -0.0001 0
## 2013-01-09 0.0035 0.0028 -0.0040 0
## 2013-01-10 0.0067 -0.0065 0.0051 0
# Step 3: Download S&P500 index (market proxy)
SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "SP500"
f_SP500 <- diff(log(SP500_index), na.pad = FALSE)
# Align with stock return matrix X
f_SP500 <- f_SP500[index(X)]
# Combine ones + market factor
F_capm <- cbind(1, f_SP500) # [T x 2] matrix
# OLS estimate: Gamma = (F'F)^(-1) F'X
Gamma <- t(solve(t(F_capm) %*% F_capm, t(F_capm) %*% X))
# Separate alpha and beta
alpha <- Gamma[, 1]
beta <- Gamma[, 2]
# Calculate residuals
E <- X - (F_capm %*% t(Gamma)) # residual matrix
# Residual covariance matrix (idiosyncratic risk)
Psi <- (1 / (nrow(X) - 2)) * t(E) %*% E
# CAPM covariance matrix estimate
Sigma_CAPM <- as.numeric(var(f_SP500)) * (beta %o% beta) + diag(diag(Psi))
# Preview
cat("✅ CAPM covariance matrix (first 5×5 block):\n")
## ✅ CAPM covariance matrix (first 5×5 block):
print(round(Sigma_CAPM[1:5, 1:5], 6))
## AAPL AMD ADI ABBV A
## AAPL 0.000266 0.000073 0.000073 0.000070 0.000072
## AMD 0.000073 0.000976 0.000096 0.000091 0.000095
## ADI 0.000073 0.000096 0.000236 0.000092 0.000095
## ABBV 0.000070 0.000091 0.000092 0.000298 0.000091
## A 0.000072 0.000095 0.000095 0.000091 0.000196
# Step 4: Fama-French 3-Factor Model
# Create design matrix F (T x 4) including intercept
F_ff3 <- cbind(1, F_FamaFrench[, 1:3]) # [1, Mkt_RF, SMB, HML]
# OLS estimate
Gamma_ff3 <- t(solve(t(F_ff3) %*% F_ff3, t(F_ff3) %*% X))
# Separate alpha and betas
alpha_ff3 <- Gamma_ff3[, 1]
B <- Gamma_ff3[, 2:4] # [N x 3] matrix of factor loadings
# Residuals
E_ff3 <- X - (F_ff3 %*% t(Gamma_ff3))
# Residual covariance
Psi_ff3 <- (1 / (nrow(X) - 4)) * t(E_ff3) %*% E_ff3
# Factor covariance matrix
F_cov <- cov(F_FamaFrench[, 1:3])
# Fama-French covariance matrix estimate
Sigma_FF3 <- B %*% F_cov %*% t(B) + diag(diag(Psi_ff3))
# Preview
cat("✅ Fama-French 3-Factor covariance matrix (first 5×5 block):\n")
## ✅ Fama-French 3-Factor covariance matrix (first 5×5 block):
print(round(Sigma_FF3[1:5, 1:5], 6))
## AAPL AMD ADI ABBV A
## AAPL 0.000267 0.000072 0.000073 0.000074 0.000072
## AMD 0.000072 0.000978 0.000101 0.000094 0.000102
## ADI 0.000073 0.000101 0.000237 0.000094 0.000098
## ABBV 0.000074 0.000094 0.000094 0.000299 0.000094
## A 0.000072 0.000102 0.000098 0.000094 0.000196
# Step 5: Compare Estimation Errors
# True covariance matrix (sample from actual data)
Sigma_true <- cov(X)
# Sample covariance matrix
Sigma_sample <- cov(X)
# Error for each model
error <- c(
Sample = norm(Sigma_sample - Sigma_true, "F"),
CAPM = norm(Sigma_CAPM - Sigma_true, "F"),
FF3 = norm(Sigma_FF3 - Sigma_true, "F")
)
# Plot estimation error
barplot(error,
main = "Estimation Error (Frobenius Norm)",
col = "skyblue",
ylab = "Error", ylim = c(0, max(error) * 1.1))

# Calculate PRIAL safely
ref <- error["Sample"]^2
PRIAL <- 100 * (ref - error^2) / ref
# Replace -Inf and NaN with 0
PRIAL[!is.finite(PRIAL)] <- 0
# Show values
print(round(PRIAL, 2))
## Sample CAPM FF3
## 0 0 0
# Plot safely
barplot(PRIAL,
main = "PRIAL for Factor Models",
col = "lightgreen",
ylab = "% Improvement",
ylim = c(0, max(PRIAL) * 1.1))

# Step 6: PCA-based Statistical Factor Models
estimate_pca_cov <- function(X, K) {
alpha <- colMeans(X)
Xc <- X - matrix(alpha, nrow(X), ncol(X), byrow = TRUE)
Sigma <- cov(Xc)
eig <- eigen(Sigma)
B <- eig$vectors[, 1:K] %*% diag(sqrt(eig$values[1:K]), K, K)
Psi <- diag(diag(Sigma - B %*% t(B)))
B %*% t(B) + Psi
}
Sigma_PCA1 <- estimate_pca_cov(X, 1)
Sigma_PCA3 <- estimate_pca_cov(X, 3)
Sigma_PCA5 <- estimate_pca_cov(X, 5)
# Step 7: Compare all covariance matrices against true sample
Sigma_true <- cov(X)
error <- c(
SCM = norm(Sigma_sample - Sigma_true, "F"),
SP500 = norm(Sigma_CAPM - Sigma_true, "F"),
FamaFrench = norm(Sigma_FF3 - Sigma_true, "F"),
PCA1 = norm(Sigma_PCA1 - Sigma_true, "F"),
PCA3 = norm(Sigma_PCA3 - Sigma_true, "F"),
PCA5 = norm(Sigma_PCA5 - Sigma_true, "F")
)
# Print errors
print(round(error, 6))
## SCM SP500 FamaFrench PCA1 PCA3 PCA5
## 0.000000 0.000124 0.000111 0.000446 0.000224 0.000119
# PRIAL calculation
ref <- error["SCM"]^2
PRIAL <- 100 * (ref - error^2) / ref
PRIAL[!is.finite(PRIAL)] <- 0 # Clean Inf/NaN
# Barplot: Error
barplot(error,
main = "Estimation Error (Frobenius Norm)",
col = "skyblue",
ylab = "Error",
cex.names = 0.75, las = 1)

# Barplot: PRIAL
barplot(PRIAL,
main = "PRIAL for estimation of covariance matrix",
col = "bisque3",
ylab = "% Improvement",
cex.names = 0.75, las = 1)

# Print best method
cat("✅ Best method based on PRIAL:", names(which.max(PRIAL)), "\n")
## ✅ Best method based on PRIAL: SCM
print(round(PRIAL, 2))
## SCM SP500 FamaFrench PCA1 PCA3 PCA5
## 0 0 0 0 0 0