#1. Import ETF data

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyquant)
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(timetk)
## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:tidyquant':
## 
##     FANG
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:xts':
## 
##     first, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(xts)

tickers <- c("SPY", "QQQ", "EEM", "IWM", "EFA", "TLT", "IYR", "GLD")

etf_prices <- tq_get(tickers, from = "2010-01-01", to = Sys.Date(), get = "stock.prices")

#2. Calculate weekly and monthly simple returns

monthly_returns <- etf_prices %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               type = "arithmetic",
               col_rename = "monthly_return")

weekly_returns <- etf_prices %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "weekly",
               type = "arithmetic",
               col_rename = "weekly_return")

#3. Convert monthly returns into tibble format

monthly_wide <- monthly_returns %>%
  pivot_wider(names_from = symbol, values_from = monthly_return) %>%
  rename(date = date) %>%
  as_tibble()

#4. Download Fama-French 3-Factor data and convert to digits

ff_raw <- read.csv("F-F_Research_Data_Factors-3.csv", skip = 3)

ff_data <- ff_raw %>%
  rename(date = X) %>%
  filter(!is.na(RF)) %>%
  mutate(date = as.Date(paste0(substr(date, 1, 4), "-", substr(date, 5, 6), "-01")),
         MKT = as.numeric(Mkt.RF)/100,
         SMB = as.numeric(SMB)/100,
         HML = as.numeric(HML)/100,
         RF = as.numeric(RF)/100) %>%
  select(date, MKT, SMB, HML, RF)
## Warning: There were 4 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `MKT = as.numeric(Mkt.RF)/100`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

#5. Merge monthly returns with Fama-French data

merged_data <- left_join(monthly_wide, ff_data, by = "date")

#6. CAPM Covariance Matrix (2010/02–2015/01)

capm_data <- merged_data %>%
  filter(date >= as.Date("2010-02-01") & date <= as.Date("2015-01-01"))

# Excess returns = ETF returns - RF
excess_returns <- capm_data %>%
  mutate(across(SPY:GLD, ~ . - RF, .names = "excess_{col}")) %>%
  select(date, starts_with("excess_"))

# Remove NAs and convert to matrix
excess_mat <- excess_returns %>%
  select(-date) %>%
  na.omit() %>%
  as.matrix()

# Covariance matrix
cov_capm <- cov(excess_mat)

#7. FF3 Covariance Matrix (2010/02–2015/01)

library(broom)
library(dplyr)
library(tidyr)
library(purrr)

# Step 1: Subset only the needed columns
ff3_excess <- capm_data %>%
  mutate(across(SPY:GLD, ~ . - RF, .names = "excess_{.col}")) %>%
  select(date, starts_with("excess_"), MKT, SMB, HML)

# Step 2: Pivot longer for regression
long_data <- ff3_excess %>%
  pivot_longer(cols = starts_with("excess_"), 
               names_to = "asset", 
               values_to = "excess_return") %>%
  drop_na(excess_return, MKT, SMB, HML)  # Remove rows with any NA

# Step 3: Run FF3 regression per asset
residuals_df <- long_data %>%
  group_by(asset) %>%
  group_modify(~ {
    if (nrow(.x) >= 10) {  # Require minimum data to avoid empty lm
      fit <- lm(excess_return ~ MKT + SMB + HML, data = .x)
      tibble(date = .x$date, resid = resid(fit))
    } else {
      tibble(date = .x$date, resid = NA)  # Skip if not enough data
    }
  }) %>%
  pivot_wider(names_from = asset, values_from = resid)

# Step 4: Final residual matrix for covariance
resid_mat <- residuals_df %>%
  select(-date) %>%
  na.omit() %>%
  as.matrix()

cov_ff3 <- cov(resid_mat)

#8. Global Minimum Variance (GMV) Portfolio Weights

# Round monthly_wide$date to the first of the month
monthly_wide <- monthly_wide %>%
  mutate(date = as.Date(format(date, "%Y-%m-01")))

# Already done in ff_data:
# mutate(date = as.Date(paste0(substr(date, 1, 4), "-", substr(date, 5, 6), "-01")))
merged_data <- left_join(monthly_wide, ff_data, by = "date")
capm_data <- merged_data %>%
  filter(date >= as.Date("2010-02-01") & date <= as.Date("2015-01-01"))

summary(capm_data)
##       date                 SPY                QQQ                EEM           
##  Min.   :2010-02-01   Min.   :-0.07945   Min.   :-0.07392   Min.   :-0.178947  
##  1st Qu.:2011-04-23   1st Qu.:-0.01175   1st Qu.:-0.01182   1st Qu.:-0.026297  
##  Median :2012-07-16   Median : 0.01993   Median : 0.02390   Median :-0.001049  
##  Mean   :2012-07-16   Mean   : 0.01277   Mean   : 0.01629   Mean   : 0.003595  
##  3rd Qu.:2013-10-08   3rd Qu.: 0.03555   3rd Qu.: 0.04642   3rd Qu.: 0.031060  
##  Max.   :2015-01-01   Max.   : 0.10915   Max.   : 0.13173   Max.   : 0.162678  
##       IWM                EFA                 TLT                 IYR          
##  Min.   :-0.11150   Min.   :-0.111928   Min.   :-0.067609   Min.   :-0.10668  
##  1st Qu.:-0.01888   1st Qu.:-0.022490   1st Qu.:-0.021487   1st Qu.:-0.01100  
##  Median : 0.02452   Median : 0.004836   Median : 0.005879   Median : 0.02457  
##  Mean   : 0.01354   Mean   : 0.006348   Mean   : 0.010279   Mean   : 0.01468  
##  3rd Qu.: 0.04792   3rd Qu.: 0.037477   3rd Qu.: 0.033471   3rd Qu.: 0.04500  
##  Max.   : 0.15101   Max.   : 0.116104   Max.   : 0.132061   Max.   : 0.13190  
##       GLD                 MKT                SMB            
##  Min.   :-0.110623   Min.   :-0.07900   Min.   :-0.0418000  
##  1st Qu.:-0.030766   1st Qu.:-0.01218   1st Qu.:-0.0126750  
##  Median : 0.004372   Median : 0.01990   Median : 0.0025500  
##  Mean   : 0.003976   Mean   : 0.01298   Mean   : 0.0009483  
##  3rd Qu.: 0.046968   3rd Qu.: 0.03795   3rd Qu.: 0.0134750  
##  Max.   : 0.122749   Max.   : 0.11330   Max.   : 0.0484000  
##       HML                  RF           
##  Min.   :-0.047300   Min.   :0.000e+00  
##  1st Qu.:-0.014850   1st Qu.:0.000e+00  
##  Median :-0.001500   Median :0.000e+00  
##  Mean   :-0.001153   Mean   :3.333e-05  
##  3rd Qu.: 0.011825   3rd Qu.:1.000e-04  
##  Max.   : 0.049000   Max.   :1.000e-04
# 1. Ensure dates are aligned (if not already)
monthly_wide <- monthly_wide %>%
  mutate(date = as.Date(format(date, "%Y-%m-01")))

# 2. Merge again to be safe
merged_data <- left_join(monthly_wide, ff_data, by = "date")

# 3. Filter for the period 2010-02 to 2015-01
capm_data <- merged_data %>%
  filter(date >= as.Date("2010-02-01") & date <= as.Date("2015-01-01"))

# 4. Calculate excess returns: ETF returns - RF
excess_returns <- capm_data %>%
  mutate(across(SPY:GLD, ~ . - RF, .names = "excess_{col}")) %>%
  select(date, starts_with("excess_"))

# 5. Remove NAs and ensure all columns have variance
excess_df_raw <- excess_returns %>% select(-date)

# Remove columns with all NA or zero variance
non_empty_cols <- colnames(excess_df_raw)[
  colSums(is.na(excess_df_raw)) < nrow(excess_df_raw) & 
  apply(excess_df_raw, 2, function(x) sd(x, na.rm = TRUE) > 0)
]

# Final cleaned excess return matrix
excess_mat_clean <- excess_df_raw[, non_empty_cols] %>%
  na.omit() %>%
  as.matrix()

# 6. Compute and clean the covariance matrix
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
cov_capm <- cov(excess_mat_clean)
cov_capm_fixed <- as.matrix(nearPD(cov_capm)$mat)

# 7. GMV weight calculation function
library(quadprog)
get_gmv_weights <- function(cov_mat, ridge = 1e-8) {
  cov_mat <- (cov_mat + t(cov_mat)) / 2  # Ensure symmetry
  cov_mat <- cov_mat + diag(ncol(cov_mat)) * ridge
  n <- ncol(cov_mat)
  dvec <- rep(0, n)
  Amat <- matrix(1, nrow = n, ncol = 1)
  bvec <- 1
  sol <- solve.QP(cov_mat, dvec, Amat, bvec, meq = 1)
  w <- sol$solution
  names(w) <- colnames(cov_mat)
  return(w)
}

# 8. Run GMV optimization
gmv_capm_weights <- get_gmv_weights(cov_capm_fixed)

# 9. Print final weights
cat("✅ GMV Portfolio Weights (CAPM Model):\n")
## ✅ GMV Portfolio Weights (CAPM Model):
print(round(gmv_capm_weights, 4))
## excess_SPY excess_QQQ excess_EEM excess_IWM excess_EFA excess_TLT excess_IYR 
##     0.9681    -0.1908    -0.0148    -0.0663    -0.0149     0.4704    -0.2045 
## excess_GLD 
##     0.0528

#9. Backtest GMV Portfolio Performance

library(xts)
# Convert your monthly_wide data frame (with a date column and asset returns columns) into xts:
# Make sure your date column is a Date class (or convert it)
monthly_wide$date <- as.Date(monthly_wide$date)

# Select all asset columns except date
rets_mat <- monthly_wide %>% select(-date)

# Create xts object
returns_xts <- xts(rets_mat, order.by = monthly_wide$date)
library(dplyr)
library(tidyr)
library(broom)
library(xts)
library(PerformanceAnalytics)

# Your get_gmv_weights function assumed loaded here

backtest_gmv <- function(returns_xts, ff_data, merged_data, window = 60, model = c("capm", "ff3")) {
  model <- match.arg(model)
  dates <- index(returns_xts)
  weights_list <- list()
  port_returns <- c()
  
  for (i in seq(window + 1, nrow(returns_xts))) {
    rets_window <- returns_xts[(i - window):(i - 1), ]
    
    # Get risk-free rate window for excess returns
    rf_window <- ff_data %>%
      filter(date %in% dates[(i - window):(i - 1)]) %>%
      pull(RF)
    
    excess_window <- sweep(rets_window, 1, rf_window, "-")
    
    if (model == "capm") {
      cov_mat <- cov(na.omit(excess_window))
      
    } else if (model == "ff3") {
      date_slice <- dates[(i - window):(i - 1)]
      
      # Prepare merged data for FF3 residual calculation
      tmp_data <- merged_data %>%
        filter(date %in% date_slice) %>%
        mutate(across(SPY:GLD, ~ . - RF, .names = "excess_{col}"))
      
      # Convert to long format for each asset's excess returns
      long_df <- tmp_data %>%
        select(date, starts_with("excess_"), MKT, SMB, HML) %>%
        pivot_longer(cols = starts_with("excess_"), names_to = "asset", values_to = "excess_return") %>%
        mutate(asset = sub("excess_", "", asset))
      
      # Calculate residuals per asset, carefully handling missing data
      residuals_df <- long_df %>%
        group_by(asset) %>%
        group_modify(function(.x, .y) {
          data_complete <- na.omit(.x)
          fit <- lm(excess_return ~ MKT + SMB + HML, data = data_complete)
          # Use broom::augment on complete data and select residuals
          resids <- broom::augment(fit, data = data_complete) %>%
            select(.resid)
          # Bind residuals with original dates for that asset
          tibble(date = data_complete$date, resid = resids$.resid)
        }) %>%
        ungroup()
      
      # Pivot residuals to wide matrix for covariance calculation
      residuals_wide <- residuals_df %>%
        pivot_wider(names_from = asset, values_from = resid) %>%
        arrange(date)
      
      # Remove date column and convert to matrix (rows = time, cols = assets)
      residuals_mat <- residuals_wide %>%
        select(-date) %>%
        as.matrix()
      
      # Covariance matrix of residuals (drop rows with NA)
      cov_mat <- cov(na.omit(residuals_mat))
    }
    
    # Compute GMV weights
    w <- get_gmv_weights(cov_mat)
    
    # Calculate portfolio return at date i
    r <- as.numeric(returns_xts[i, ]) %*% w
    port_returns <- c(port_returns, r)
  }
  
  # Return xts object of portfolio returns
  port_xts <- xts(port_returns, order.by = dates[(window + 1):nrow(returns_xts)])
  return(port_xts)
}

# Assuming you have these objects loaded:
# returns_xts = your monthly returns xts object (assets as columns)
# ff_data = Fama-French factors data.frame with 'date' and 'RF' columns
# merged_data = data.frame including date, asset returns, RF, MKT, SMB, HML factors

# Run backtests
gmv_capm_backtest <- backtest_gmv(returns_xts, ff_data, merged_data, model = "capm")
## Warning in sweep(rets_window, 1, rf_window, "-"): STATS does not recycle
## exactly across MARGIN
gmv_ff3_backtest  <- backtest_gmv(returns_xts, ff_data, merged_data, model = "ff3")
## Warning in sweep(rets_window, 1, rf_window, "-"): STATS does not recycle
## exactly across MARGIN
# Plot cumulative returns
combined <- merge(gmv_capm_backtest, gmv_ff3_backtest)
colnames(combined) <- c("CAPM GMV", "FF3 GMV")
charts.PerformanceSummary(combined, main = "Backtest: GMV Portfolios")