library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(quantmod)
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xts)
library(quantmod)

begin_date <- "2013-01-01"
end_date <- "2015-12-31"

stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "A", "APD", "AA", "CF")

data_set <- xts()
for (symbol in stock_namelist) {
  tryCatch({
    stock_data <- Ad(getSymbols(symbol, from = begin_date, to = end_date, auto.assign = FALSE))
    data_set <- cbind(data_set, stock_data)
  }, error = function(e) {
    message(paste("Skipping:", symbol, "— not available."))
  })
}

colnames(data_set) <- stock_namelist

X <- diff(log(data_set), na.pad = FALSE)

print("✅ First few rows of the log return matrix X:")
## [1] "✅ First few rows of the log return matrix X:"
print(head(X))
##                    AAPL          AMD          ADI         ABBV            A
## 2013-01-03 -0.012702132 -0.015936577 -0.016268101 -0.008291316  0.003575351
## 2013-01-04 -0.028250188  0.039375128 -0.017946940 -0.012713271  0.019555083
## 2013-01-07 -0.005899827  0.030420658  0.003052478  0.002033117 -0.007258884
## 2013-01-08  0.002687890  0.000000000 -0.010370255 -0.022004719 -0.008022556
## 2013-01-09 -0.015752324 -0.015094611 -0.002609277  0.005620339  0.026649173
## 2013-01-10  0.012319959 -0.003809616  0.012041308  0.002946020  0.007354886
##                      APD           AA           CF
## 2013-01-03 -0.0035001190  0.008859390 -0.004739890
## 2013-01-04  0.0133510537  0.020731608  0.022151306
## 2013-01-07 -0.0009230474 -0.017429587 -0.003753436
## 2013-01-08  0.0018456925  0.000000000 -0.014768388
## 2013-01-09  0.0133903708 -0.002200358  0.034376564
## 2013-01-10 -0.0009094807 -0.012188170  0.014916993
cat("\n✅ Dimensions of X: ", nrow(X), "rows ×", ncol(X), "columns\n")
## 
## ✅ Dimensions of X:  754 rows × 8 columns
library(readr)
library(dplyr)
library(lubridate)
library(xts)

ff_data <- read_csv("F-F_Research_Data_Factors_daily.CSV", skip = 4)
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 25962 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (4): Mkt-RF, SMB, HML, RF
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ff_data <- ff_data[!is.na(as.numeric(ff_data[[1]])), ]
## Warning in `[.tbl_df`(ff_data, !is.na(as.numeric(ff_data[[1]])), ): NAs
## introduced by coercion
colnames(ff_data)[1:5] <- c("Date", "Mkt_RF", "SMB", "HML", "RF")

ff_data <- ff_data %>%
  mutate(
    Date = ymd(as.character(Date)),
    Mkt_RF = as.numeric(Mkt_RF) / 100,
    SMB = as.numeric(SMB) / 100,
    HML = as.numeric(HML) / 100,
    RF = as.numeric(RF) / 100
  )

fama_lib <- xts(ff_data[, c("Mkt_RF", "SMB", "HML", "RF")], order.by = ff_data$Date)
F_FamaFrench <- fama_lib[index(X)]

print("✅ First few rows of Fama-French factors aligned with X:")
## [1] "✅ First few rows of Fama-French factors aligned with X:"
print(head(F_FamaFrench))
##             Mkt_RF     SMB     HML RF
## 2013-01-03 -0.0013  0.0010  0.0005  0
## 2013-01-04  0.0055  0.0012  0.0037  0
## 2013-01-07 -0.0031 -0.0009 -0.0036  0
## 2013-01-08 -0.0026  0.0005 -0.0001  0
## 2013-01-09  0.0035  0.0028 -0.0040  0
## 2013-01-10  0.0067 -0.0065  0.0051  0
SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "SP500"
f_SP500 <- diff(log(SP500_index), na.pad = FALSE)

f_SP500 <- f_SP500[index(X)]

F_capm <- cbind(1, f_SP500)  # [T x 2] matrix

Gamma <- t(solve(t(F_capm) %*% F_capm, t(F_capm) %*% X))

alpha <- Gamma[, 1]
beta <- Gamma[, 2]

E <- X - (F_capm %*% t(Gamma))  # residual matrix

Psi <- (1 / (nrow(X) - 2)) * t(E) %*% E

Sigma_CAPM <- as.numeric(var(f_SP500)) * (beta %o% beta) + diag(diag(Psi))

cat("✅ CAPM covariance matrix (first 5×5 block):\n")
## ✅ CAPM covariance matrix (first 5×5 block):
print(round(Sigma_CAPM[1:5, 1:5], 6))
##          AAPL      AMD      ADI     ABBV        A
## AAPL 0.000266 0.000073 0.000073 0.000070 0.000072
## AMD  0.000073 0.000976 0.000096 0.000091 0.000095
## ADI  0.000073 0.000096 0.000236 0.000092 0.000095
## ABBV 0.000070 0.000091 0.000092 0.000298 0.000091
## A    0.000072 0.000095 0.000095 0.000091 0.000196
F_ff3 <- cbind(1, F_FamaFrench[, 1:3])  # [1, Mkt_RF, SMB, HML]

Gamma_ff3 <- t(solve(t(F_ff3) %*% F_ff3, t(F_ff3) %*% X))

alpha_ff3 <- Gamma_ff3[, 1]
B <- Gamma_ff3[, 2:4]  # [N x 3] matrix of factor loadings

E_ff3 <- X - (F_ff3 %*% t(Gamma_ff3))

Psi_ff3 <- (1 / (nrow(X) - 4)) * t(E_ff3) %*% E_ff3

F_cov <- cov(F_FamaFrench[, 1:3])

Sigma_FF3 <- B %*% F_cov %*% t(B) + diag(diag(Psi_ff3))

cat("✅ Fama-French 3-Factor covariance matrix (first 5×5 block):\n")
## ✅ Fama-French 3-Factor covariance matrix (first 5×5 block):
print(round(Sigma_FF3[1:5, 1:5], 6))
##          AAPL      AMD      ADI     ABBV        A
## AAPL 0.000267 0.000072 0.000073 0.000074 0.000072
## AMD  0.000072 0.000978 0.000101 0.000094 0.000102
## ADI  0.000073 0.000101 0.000237 0.000094 0.000098
## ABBV 0.000074 0.000094 0.000094 0.000299 0.000094
## A    0.000072 0.000102 0.000098 0.000094 0.000196
Sigma_true <- cov(X)

Sigma_sample <- cov(X)

error <- c(
  Sample = norm(Sigma_sample - Sigma_true, "F"),
  CAPM = norm(Sigma_CAPM - Sigma_true, "F"),
  FF3 = norm(Sigma_FF3 - Sigma_true, "F")
)

barplot(error,
        main = "Estimation Error (Frobenius Norm)",
        col = "skyblue",
        ylab = "Error", ylim = c(0, max(error) * 1.1))

ref <- error["Sample"]^2
PRIAL <- 100 * (ref - error^2) / ref

PRIAL[!is.finite(PRIAL)] <- 0

print(round(PRIAL, 2))
## Sample   CAPM    FF3 
##      0      0      0
barplot(PRIAL,
        main = "PRIAL for Factor Models",
        col = "lightgreen",
        ylab = "% Improvement",
        ylim = c(0, max(PRIAL) * 1.1))

estimate_pca_cov <- function(X, K) {
  alpha <- colMeans(X)
  Xc <- X - matrix(alpha, nrow(X), ncol(X), byrow = TRUE)
  Sigma <- cov(Xc)
  eig <- eigen(Sigma)
  B <- eig$vectors[, 1:K] %*% diag(sqrt(eig$values[1:K]), K, K)
  Psi <- diag(diag(Sigma - B %*% t(B)))
  B %*% t(B) + Psi
}

Sigma_PCA1 <- estimate_pca_cov(X, 1)
Sigma_PCA3 <- estimate_pca_cov(X, 3)
Sigma_PCA5 <- estimate_pca_cov(X, 5)

Sigma_true <- cov(X)

error <- c(
  SCM = norm(Sigma_sample - Sigma_true, "F"),
  SP500 = norm(Sigma_CAPM - Sigma_true, "F"),
  FamaFrench = norm(Sigma_FF3 - Sigma_true, "F"),
  PCA1 = norm(Sigma_PCA1 - Sigma_true, "F"),
  PCA3 = norm(Sigma_PCA3 - Sigma_true, "F"),
  PCA5 = norm(Sigma_PCA5 - Sigma_true, "F")
)

print(round(error, 6))
##        SCM      SP500 FamaFrench       PCA1       PCA3       PCA5 
##   0.000000   0.000124   0.000111   0.000446   0.000224   0.000119
ref <- error["SCM"]^2
PRIAL <- 100 * (ref - error^2) / ref
PRIAL[!is.finite(PRIAL)] <- 0  # Clean Inf/NaN

barplot(error,
        main = "Estimation Error (Frobenius Norm)",
        col = "skyblue",
        ylab = "Error",
        cex.names = 0.75, las = 1)

barplot(PRIAL,
        main = "PRIAL for estimation of covariance matrix",
        col = "bisque3",
        ylab = "% Improvement",
        cex.names = 0.75, las = 1)

cat("✅ Best method based on PRIAL:", names(which.max(PRIAL)), "\n")
## ✅ Best method based on PRIAL: SCM
print(round(PRIAL, 2))
##        SCM      SP500 FamaFrench       PCA1       PCA3       PCA5 
##          0          0          0          0          0          0