library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(quantmod)
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xts)
library(quantmod)

# Set date range
begin_date <- "2013-01-01"
end_date <- "2015-12-31"

# Valid tickers only (AEZS removed)
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "A", "APD", "AA", "CF")

# Safely get adjusted close prices
data_set <- xts()
for (symbol in stock_namelist) {
  tryCatch({
    stock_data <- Ad(getSymbols(symbol, from = begin_date, to = end_date, auto.assign = FALSE))
    data_set <- cbind(data_set, stock_data)
  }, error = function(e) {
    message(paste("Skipping:", symbol, "— not available."))
  })
}

# Rename columns to match tickers
colnames(data_set) <- stock_namelist

# Calculate log returns
X <- diff(log(data_set), na.pad = FALSE)

# Output: basic structure of X
print("✅ First few rows of the log return matrix X:")
## [1] "✅ First few rows of the log return matrix X:"
print(head(X))
##                    AAPL          AMD          ADI         ABBV            A
## 2013-01-03 -0.012702130 -0.015936577 -0.016267183 -0.008291949  0.003575210
## 2013-01-04 -0.028249767  0.039375128 -0.017947752 -0.012713278  0.019555361
## 2013-01-07 -0.005899884  0.030420658  0.003052596  0.002033576 -0.007258952
## 2013-01-08  0.002687529  0.000000000 -0.010369664 -0.022004159 -0.008022975
## 2013-01-09 -0.015752018 -0.015094611 -0.002609632  0.005619964  0.026649658
## 2013-01-10  0.012319353 -0.003809616  0.012041190  0.002945463  0.007354546
##                      APD           AA           CF
## 2013-01-03 -0.0035001834  0.008859295 -0.004739890
## 2013-01-04  0.0133509236  0.020731705  0.022151869
## 2013-01-07 -0.0009232401 -0.017429588 -0.003753371
## 2013-01-08  0.0018456287  0.000000000 -0.014768825
## 2013-01-09  0.0133908804 -0.002199977  0.034376250
## 2013-01-10 -0.0009101770 -0.012188553  0.014917238
# Show dimensions
cat("\n✅ Dimensions of X: ", nrow(X), "rows ×", ncol(X), "columns\n")
## 
## ✅ Dimensions of X:  754 rows × 8 columns
# Step 2: Read and clean Fama-French 3-Factor Data from uploaded CSV
library(readr)
library(dplyr)
library(lubridate)
library(xts)

# Read CSV skipping metadata rows
ff_data <- read_csv("F-F_Research_Data_Factors_daily.csv", skip = 4)
## New names:
## • `` -> `...1`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 25962 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (4): Mkt-RF, SMB, HML, RF
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove bottom rows (footer), usually starts with "Annual Factors"
ff_data <- ff_data[!is.na(as.numeric(ff_data[[1]])), ]
## Warning in `[.tbl_df`(ff_data, !is.na(as.numeric(ff_data[[1]])), ): NAs
## introduced by coercion
# Rename columns
colnames(ff_data)[1:5] <- c("Date", "Mkt_RF", "SMB", "HML", "RF")

# Convert types
ff_data <- ff_data %>%
  mutate(
    Date = ymd(as.character(Date)),
    Mkt_RF = as.numeric(Mkt_RF) / 100,
    SMB = as.numeric(SMB) / 100,
    HML = as.numeric(HML) / 100,
    RF = as.numeric(RF) / 100
  )

# Create xts object and align with stock return dates
fama_lib <- xts(ff_data[, c("Mkt_RF", "SMB", "HML", "RF")], order.by = ff_data$Date)
F_FamaFrench <- fama_lib[index(X)]

# Preview result
print("✅ First few rows of Fama-French factors aligned with X:")
## [1] "✅ First few rows of Fama-French factors aligned with X:"
print(head(F_FamaFrench))
##             Mkt_RF     SMB     HML RF
## 2013-01-03 -0.0013  0.0010  0.0005  0
## 2013-01-04  0.0055  0.0012  0.0037  0
## 2013-01-07 -0.0031 -0.0009 -0.0036  0
## 2013-01-08 -0.0026  0.0005 -0.0001  0
## 2013-01-09  0.0035  0.0028 -0.0040  0
## 2013-01-10  0.0067 -0.0065  0.0051  0
# Step 3: Download S&P500 index (market proxy)
SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "SP500"
f_SP500 <- diff(log(SP500_index), na.pad = FALSE)

# Align with stock return matrix X
f_SP500 <- f_SP500[index(X)]

# Combine ones + market factor
F_capm <- cbind(1, f_SP500)  # [T x 2] matrix

# OLS estimate: Gamma = (F'F)^(-1) F'X
Gamma <- t(solve(t(F_capm) %*% F_capm, t(F_capm) %*% X))

# Separate alpha and beta
alpha <- Gamma[, 1]
beta <- Gamma[, 2]

# Calculate residuals
E <- X - (F_capm %*% t(Gamma))  # residual matrix

# Residual covariance matrix (idiosyncratic risk)
Psi <- (1 / (nrow(X) - 2)) * t(E) %*% E

# CAPM covariance matrix estimate
Sigma_CAPM <- as.numeric(var(f_SP500)) * (beta %o% beta) + diag(diag(Psi))

# Preview
cat("✅ CAPM covariance matrix (first 5×5 block):\n")
## ✅ CAPM covariance matrix (first 5×5 block):
print(round(Sigma_CAPM[1:5, 1:5], 6))
##          AAPL      AMD      ADI     ABBV        A
## AAPL 0.000266 0.000073 0.000073 0.000070 0.000072
## AMD  0.000073 0.000976 0.000096 0.000091 0.000095
## ADI  0.000073 0.000096 0.000236 0.000092 0.000095
## ABBV 0.000070 0.000091 0.000092 0.000298 0.000091
## A    0.000072 0.000095 0.000095 0.000091 0.000196
# Step 4: Fama-French 3-Factor Model

# Create design matrix F (T x 4) including intercept
F_ff3 <- cbind(1, F_FamaFrench[, 1:3])  # [1, Mkt_RF, SMB, HML]

# OLS estimate
Gamma_ff3 <- t(solve(t(F_ff3) %*% F_ff3, t(F_ff3) %*% X))

# Separate alpha and betas
alpha_ff3 <- Gamma_ff3[, 1]
B <- Gamma_ff3[, 2:4]  # [N x 3] matrix of factor loadings

# Residuals
E_ff3 <- X - (F_ff3 %*% t(Gamma_ff3))

# Residual covariance
Psi_ff3 <- (1 / (nrow(X) - 4)) * t(E_ff3) %*% E_ff3

# Factor covariance matrix
F_cov <- cov(F_FamaFrench[, 1:3])

# Fama-French covariance matrix estimate
Sigma_FF3 <- B %*% F_cov %*% t(B) + diag(diag(Psi_ff3))

# Preview
cat("✅ Fama-French 3-Factor covariance matrix (first 5×5 block):\n")
## ✅ Fama-French 3-Factor covariance matrix (first 5×5 block):
print(round(Sigma_FF3[1:5, 1:5], 6))
##          AAPL      AMD      ADI     ABBV        A
## AAPL 0.000267 0.000072 0.000073 0.000074 0.000072
## AMD  0.000072 0.000978 0.000101 0.000094 0.000102
## ADI  0.000073 0.000101 0.000237 0.000094 0.000098
## ABBV 0.000074 0.000094 0.000094 0.000299 0.000094
## A    0.000072 0.000102 0.000098 0.000094 0.000196
# Step 5: Compare Estimation Errors

# True covariance matrix (sample from actual data)
Sigma_true <- cov(X)

# Sample covariance matrix
Sigma_sample <- cov(X)

# Error for each model
error <- c(
  Sample = norm(Sigma_sample - Sigma_true, "F"),
  CAPM = norm(Sigma_CAPM - Sigma_true, "F"),
  FF3 = norm(Sigma_FF3 - Sigma_true, "F")
)

# Plot estimation error
barplot(error,
        main = "Estimation Error (Frobenius Norm)",
        col = "skyblue",
        ylab = "Error", ylim = c(0, max(error) * 1.1))

# Calculate PRIAL safely
ref <- error["Sample"]^2
PRIAL <- 100 * (ref - error^2) / ref

# Replace -Inf and NaN with 0
PRIAL[!is.finite(PRIAL)] <- 0

# Show values
print(round(PRIAL, 2))
## Sample   CAPM    FF3 
##      0      0      0
# Plot safely
barplot(PRIAL,
        main = "PRIAL for Factor Models",
        col = "lightgreen",
        ylab = "% Improvement",
        ylim = c(0, max(PRIAL) * 1.1))

# Step 6: PCA-based Statistical Factor Models

estimate_pca_cov <- function(X, K) {
  alpha <- colMeans(X)
  Xc <- X - matrix(alpha, nrow(X), ncol(X), byrow = TRUE)
  Sigma <- cov(Xc)
  eig <- eigen(Sigma)
  B <- eig$vectors[, 1:K] %*% diag(sqrt(eig$values[1:K]), K, K)
  Psi <- diag(diag(Sigma - B %*% t(B)))
  B %*% t(B) + Psi
}

Sigma_PCA1 <- estimate_pca_cov(X, 1)
Sigma_PCA3 <- estimate_pca_cov(X, 3)
Sigma_PCA5 <- estimate_pca_cov(X, 5)
# Step 7: Compare all covariance matrices against true sample
Sigma_true <- cov(X)

error <- c(
  SCM = norm(Sigma_sample - Sigma_true, "F"),
  SP500 = norm(Sigma_CAPM - Sigma_true, "F"),
  FamaFrench = norm(Sigma_FF3 - Sigma_true, "F"),
  PCA1 = norm(Sigma_PCA1 - Sigma_true, "F"),
  PCA3 = norm(Sigma_PCA3 - Sigma_true, "F"),
  PCA5 = norm(Sigma_PCA5 - Sigma_true, "F")
)

# Print errors
print(round(error, 6))
##        SCM      SP500 FamaFrench       PCA1       PCA3       PCA5 
##   0.000000   0.000124   0.000111   0.000446   0.000224   0.000119
# PRIAL calculation
ref <- error["SCM"]^2
PRIAL <- 100 * (ref - error^2) / ref
PRIAL[!is.finite(PRIAL)] <- 0  # Clean Inf/NaN

# Barplot: Error
barplot(error,
        main = "Estimation Error (Frobenius Norm)",
        col = "skyblue",
        ylab = "Error",
        cex.names = 0.75, las = 1)

# Barplot: PRIAL
barplot(PRIAL,
        main = "PRIAL for estimation of covariance matrix",
        col = "bisque3",
        ylab = "% Improvement",
        cex.names = 0.75, las = 1)

# Print best method
cat("✅ Best method based on PRIAL:", names(which.max(PRIAL)), "\n")
## ✅ Best method based on PRIAL: SCM
print(round(PRIAL, 2))
##        SCM      SP500 FamaFrench       PCA1       PCA3       PCA5 
##          0          0          0          0          0          0