library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.27     ✔ xts                  0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first()  masks xts::first()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::last()   masks xts::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(timetk)
## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:tidyquant':
## 
##     FANG
library(lubridate)
library(PerformanceAnalytics)

# ---- 1. Import ETF Data ----
etf_tickers <- c("SPY", "QQQ", "EEM", "IWM", "EFA", "TLT", "IYR", "GLD")
etf_prices <- tq_get(etf_tickers,
                     from = "2010-01-01",
                     get = "stock.prices") %>%
  select(symbol, date, adjusted)

# Print info about fetched data
cat("ETF data summary:\n")
## ETF data summary:
cat("Earliest date:", min(etf_prices$date), "\n")
## Earliest date: 14613
cat("Latest date:", max(etf_prices$date), "\n")
## Latest date: 20194
cat("Total observations:", nrow(etf_prices), "\n")
## Total observations: 30768
# Count observations by ticker
etf_counts <- etf_prices %>%
  group_by(symbol) %>%
  summarize(count = n(), 
            earliest = min(date),
            latest = max(date))
print(etf_counts)
## # A tibble: 8 × 4
##   symbol count earliest   latest    
##   <chr>  <int> <date>     <date>    
## 1 EEM     3846 2010-01-04 2025-04-16
## 2 EFA     3846 2010-01-04 2025-04-16
## 3 GLD     3846 2010-01-04 2025-04-16
## 4 IWM     3846 2010-01-04 2025-04-16
## 5 IYR     3846 2010-01-04 2025-04-16
## 6 QQQ     3846 2010-01-04 2025-04-16
## 7 SPY     3846 2010-01-04 2025-04-16
## 8 TLT     3846 2010-01-04 2025-04-16
# ---- 2. Compute Monthly Returns ----
etf_monthly_returns <- etf_prices %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_return",
               type = "arithmetic") %>%
  ungroup()

# Check completeness of monthly data
monthly_completeness <- etf_monthly_returns %>%
  group_by(date) %>%
  summarize(tickers = n(),
            complete = tickers == length(etf_tickers),
            .groups = "drop")

complete_months <- sum(monthly_completeness$complete)
cat("\nMonthly returns summary:\n")
## 
## Monthly returns summary:
cat("Earliest ETF month:", min(etf_monthly_returns$date), "\n")
## Earliest ETF month: 14638
cat("Latest ETF month:", max(etf_monthly_returns$date), "\n")
## Latest ETF month: 20194
cat("Total months with data:", length(unique(etf_monthly_returns$date)), "\n")
## Total months with data: 184
cat("Months with complete data for all tickers:", complete_months, "\n")
## Months with complete data for all tickers: 184
# ---- 3. Load and Process Fama-French 3 Factors ----
# Direct download method for FF data (more reliable than using packages)
ff_url <- "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip"
temp <- tempfile()
download.file(ff_url, temp)
ff_data <- read.csv(unz(temp, "F-F_Research_Data_Factors.CSV"), skip=3)
unlink(temp)

# Print raw data to diagnose issues
cat("\nRaw FF data head (first few lines):\n")
## 
## Raw FF data head (first few lines):
print(head(ff_data, 10))
##         X   Mkt.RF      SMB      HML       RF
## 1  192607     2.96    -2.56    -2.43     0.22
## 2  192608     2.64    -1.17     3.82     0.25
## 3  192609     0.36    -1.40     0.13     0.23
## 4  192610    -3.24    -0.09     0.70     0.32
## 5  192611     2.53    -0.10    -0.51     0.31
## 6  192612     2.62    -0.03    -0.05     0.28
## 7  192701    -0.06    -0.37     4.54     0.25
## 8  192702     4.18     0.04     2.94     0.26
## 9  192703     0.13    -1.65    -2.61     0.30
## 10 192704     0.46     0.30     0.81     0.25
# Clean up the data and carefully handle date formats
ff3_clean <- ff_data %>%
  filter(!is.na(X)) %>%
  filter(X != "") %>%
  rename(date_str = X, Mkt_RF = Mkt.RF) %>%
  mutate(
    # Convert percentages to decimals
    across(c(Mkt_RF, SMB, HML, RF), as.numeric),
    across(c(Mkt_RF, SMB, HML, RF), ~ . / 100)
  )

# Debugging: Print values to check
cat("\nFF data after numeric conversion (first few lines):\n")
## 
## FF data after numeric conversion (first few lines):
print(head(ff3_clean, 10))
##    date_str  Mkt_RF     SMB     HML     RF
## 1    192607  0.0296 -0.0256 -0.0243 0.0022
## 2    192608  0.0264 -0.0117  0.0382 0.0025
## 3    192609  0.0036 -0.0140  0.0013 0.0023
## 4    192610 -0.0324 -0.0009  0.0070 0.0032
## 5    192611  0.0253 -0.0010 -0.0051 0.0031
## 6    192612  0.0262 -0.0003 -0.0005 0.0028
## 7    192701 -0.0006 -0.0037  0.0454 0.0025
## 8    192702  0.0418  0.0004  0.0294 0.0026
## 9    192703  0.0013 -0.0165 -0.0261 0.0030
## 10   192704  0.0046  0.0030  0.0081 0.0025
# Handle date formats
ff3_clean <- ff3_clean %>%
  mutate(
    # First try to identify the date format
    date_str = as.character(date_str),
    date_len = nchar(date_str),
    date = case_when(
      # YYYYMM format (annual/monthly data)
      date_len == 6 ~ as.Date(paste0(substr(date_str, 1, 6), "01"), format="%Y%m%d"),
      # YYYYMMDD format (daily data)
      date_len == 8 ~ as.Date(date_str, format="%Y%m%d"),
      # Try parsing as YYYYMM without adding day
      TRUE ~ as.Date(paste0(date_str, "01"), format="%Y%m%d")
    )
  ) %>%
  filter(!is.na(date)) %>% # Remove rows with invalid dates
  select(-date_str, -date_len) # Clean up temporary columns

# Debugging: Print date conversions to check
cat("\nFF data after date conversion (first few lines):\n")
## 
## FF data after date conversion (first few lines):
print(head(ff3_clean, 10))
##     Mkt_RF     SMB     HML     RF       date
## 1   0.0296 -0.0256 -0.0243 0.0022 1926-07-01
## 2   0.0264 -0.0117  0.0382 0.0025 1926-08-01
## 3   0.0036 -0.0140  0.0013 0.0023 1926-09-01
## 4  -0.0324 -0.0009  0.0070 0.0032 1926-10-01
## 5   0.0253 -0.0010 -0.0051 0.0031 1926-11-01
## 6   0.0262 -0.0003 -0.0005 0.0028 1926-12-01
## 7  -0.0006 -0.0037  0.0454 0.0025 1927-01-01
## 8   0.0418  0.0004  0.0294 0.0026 1927-02-01
## 9   0.0013 -0.0165 -0.0261 0.0030 1927-03-01
## 10  0.0046  0.0030  0.0081 0.0025 1927-04-01
cat("\nFF data date range:", min(ff3_clean$date), "to", max(ff3_clean$date), "\n")
## 
## FF data date range: -15890 to 20058
# Handle ETF dates properly - convert to month-end dates
etf_wide <- etf_monthly_returns %>%
  # Add a column with just year and month for matching
  mutate(year_month = floor_date(date, "month")) %>%
  # We need this to match the FF data format
  pivot_wider(names_from = symbol, values_from = monthly_return)

# Print to verify ETF dates
cat("\nETF data date format sample:\n")
## 
## ETF data date format sample:
print(head(etf_wide[, c("date", "year_month")], 10))
## # A tibble: 10 × 2
##    date       year_month
##    <date>     <date>    
##  1 2010-01-29 2010-01-01
##  2 2010-02-26 2010-02-01
##  3 2010-03-31 2010-03-01
##  4 2010-04-30 2010-04-01
##  5 2010-05-28 2010-05-01
##  6 2010-06-30 2010-06-01
##  7 2010-07-30 2010-07-01
##  8 2010-08-31 2010-08-01
##  9 2010-09-30 2010-09-01
## 10 2010-10-29 2010-10-01
# Try to match by year and month instead of exact date
# Add year_month column to FF data too
ff3_clean <- ff3_clean %>%
  mutate(year_month = floor_date(date, "month"))

# Print to verify FF dates
cat("\nFF data date format sample:\n")
## 
## FF data date format sample:
print(head(ff3_clean[, c("date", "year_month")], 10))
##          date year_month
## 1  1926-07-01 1926-07-01
## 2  1926-08-01 1926-08-01
## 3  1926-09-01 1926-09-01
## 4  1926-10-01 1926-10-01
## 5  1926-11-01 1926-11-01
## 6  1926-12-01 1926-12-01
## 7  1927-01-01 1927-01-01
## 8  1927-02-01 1927-02-01
## 9  1927-03-01 1927-03-01
## 10 1927-04-01 1927-04-01
# Use year_month for merging
merged_data <- inner_join(etf_wide, ff3_clean, by = "year_month")

# Keep only the needed columns after merging
merged_data <- merged_data %>%
  select(date.x, all_of(etf_tickers), Mkt_RF, SMB, HML, RF) %>%
  rename(date = date.x)

# Check merged data
cat("\nMerged data summary:\n")
## 
## Merged data summary:
cat("Rows in merged data:", nrow(merged_data), "\n")
## Rows in merged data: 180
if(nrow(merged_data) > 0) {
  cat("Date range:", min(merged_data$date), "to", max(merged_data$date), "\n")
} else {
  # Display diagnostic information
  cat("No matching dates found. Here's what we have:\n")
  cat("ETF dates (sample):", paste(head(sort(unique(etf_wide$year_month))), collapse=", "), "...\n")
  cat("FF dates (sample):", paste(head(sort(unique(ff3_clean$year_month))), collapse=", "), "...\n")
  
  # Try a last-resort approach: simplified monthly format for comparison
  cat("\nAttempting date format comparison...\n")
  etf_months <- format(etf_wide$year_month, "%Y-%m")
  ff_months <- format(ff3_clean$year_month, "%Y-%m")
  
  cat("ETF months format:", paste(head(unique(etf_months)), collapse=", "), "...\n")
  cat("FF months format:", paste(head(unique(ff_months)), collapse=", "), "...\n")
  
  common_months <- intersect(etf_months, ff_months)
  cat("Common months count:", length(common_months), "\n")
  if(length(common_months) > 0) {
    cat("Common months (sample):", paste(head(common_months), collapse=", "), "...\n")
    
    # Try one more merging approach based on string formatted year-month
    etf_wide$month_str <- format(etf_wide$date, "%Y-%m")
    ff3_clean$month_str <- format(ff3_clean$date, "%Y-%m")
    
    merged_data <- inner_join(etf_wide, ff3_clean, by = "month_str")
    
    # Keep only the needed columns after merging
    if(nrow(merged_data) > 0) {
      merged_data <- merged_data %>%
        select(date.x, all_of(etf_tickers), Mkt_RF, SMB, HML, RF) %>%
        rename(date = date.x)
      
      cat("\nSuccessfully merged using string month format! Rows:", nrow(merged_data), "\n")
    } else {
      stop("All merge attempts failed. Please check data formats and date ranges.")
    }
  } else {
    stop("No overlapping months between ETF and FF data. Cannot proceed.")
  }
}# Remove rows with NA values
## Date range: 14638 to 20088
merged_clean <- merged_data %>% drop_na()
cat("Rows after removing NAs:", nrow(merged_clean), "\n")
## Rows after removing NAs: 180
# If we have at least 12 months of data, we'll work with it
min_months <- 12  # Lower threshold to 12 months

if (nrow(merged_clean) < min_months) {
  stop(paste("Not enough valid monthly data. Have", nrow(merged_clean), "months, need at least", min_months))
} else {
  cat("Proceeding with", nrow(merged_clean), "months of data\n")
  capm_window <- merged_clean
}
## Proceeding with 180 months of data
# ---- 5. CAPM MVP ----
returns_matrix <- capm_window %>%
  select(all_of(etf_tickers)) %>%
  as.matrix()

# Print a sample of returns to verify data
cat("\nSample of monthly returns (first 5 rows):\n")
## 
## Sample of monthly returns (first 5 rows):
print(round(returns_matrix[1:min(5,nrow(returns_matrix)),], 4))
##          SPY     QQQ     EEM     IWM     EFA     TLT     IYR     GLD
## [1,] -0.0524 -0.0782 -0.1037 -0.0605 -0.0749  0.0278 -0.0520 -0.0350
## [2,]  0.0312  0.0460  0.0178  0.0448  0.0027 -0.0034  0.0546  0.0327
## [3,]  0.0609  0.0771  0.0811  0.0823  0.0639 -0.0206  0.0975 -0.0044
## [4,]  0.0155  0.0224 -0.0017  0.0568 -0.0280  0.0332  0.0639  0.0588
## [5,] -0.0795 -0.0739 -0.0939 -0.0754 -0.1119  0.0511 -0.0568  0.0305
cov_matrix <- cov(returns_matrix)
cat("\nCovariance matrix summary:\n")
## 
## Covariance matrix summary:
cat("Dimension:", nrow(cov_matrix), "x", ncol(cov_matrix), "\n")
## Dimension: 8 x 8
cat("Determinant:", det(cov_matrix), "\n")
## Determinant: 3.241609e-25
# Check if covariance matrix is invertible
if (det(cov_matrix) < 1e-10) {
  cat("WARNING: Covariance matrix is nearly singular, using a more robust approach\n")
  # Add a small constant to the diagonal
  diag(cov_matrix) <- diag(cov_matrix) + 1e-5
}
## WARNING: Covariance matrix is nearly singular, using a more robust approach
ones <- rep(1, ncol(cov_matrix))
inv_cov <- solve(cov_matrix)
capm_weights <- inv_cov %*% ones / as.numeric(t(ones) %*% inv_cov %*% ones)
rownames(capm_weights) <- etf_tickers

# ---- 6. FF MVP using residuals from regression ----
# Create an empty matrix of the right dimensions
residual_matrix <- matrix(0, nrow = nrow(capm_window), ncol = length(etf_tickers))
colnames(residual_matrix) <- etf_tickers

# Fill in the matrix column by column
cat("\nProcessing FF3 regressions for each ETF:\n")
## 
## Processing FF3 regressions for each ETF:
for (i in 1:length(etf_tickers)) {
  asset <- etf_tickers[i]
  cat("Processing", asset, "... ")
  
  tryCatch({
    # Prepare data for regression
    lm_data <- capm_window %>%
      select(date, all_of(asset), Mkt_RF, SMB, HML) %>%
      rename(R = all_of(asset))
    
    # Check for NAs
    if (any(is.na(lm_data))) {
      cat("NA values found, skipping\n")
      next
    }
    
    # Fit model
    model <- lm(R ~ Mkt_RF + SMB + HML, data = lm_data)
    
    # Store residuals
    residual_matrix[, i] <- residuals(model)
    cat("done\n")
  }, error = function(e) {
    cat("ERROR:", e$message, "\n")
  })
}
## Processing SPY ... done
## Processing QQQ ... done
## Processing EEM ... done
## Processing IWM ... done
## Processing EFA ... done
## Processing TLT ... done
## Processing IYR ... done
## Processing GLD ... done
# Check if we have valid residuals
if (all(residual_matrix == 0)) {
  cat("\nWARNING: Could not calculate valid residuals. Using equal weights for FF3 portfolio.\n")
  ff_weights <- matrix(1/length(etf_tickers), nrow = length(etf_tickers), ncol = 1)
  rownames(ff_weights) <- etf_tickers
} else {
  # Calculate residual covariance matrix
  ff_cov_matrix <- cov(residual_matrix)
  
  # Check if residual covariance matrix is invertible
  if (det(ff_cov_matrix) < 1e-10) {
    cat("WARNING: Residual covariance matrix is nearly singular, using a more robust approach\n")
    # Add a small constant to the diagonal
    diag(ff_cov_matrix) <- diag(ff_cov_matrix) + 1e-5
  }
  
  inv_cov_ff <- solve(ff_cov_matrix)
  ff_weights <- inv_cov_ff %*% ones / as.numeric(t(ones) %*% inv_cov_ff %*% ones)
  rownames(ff_weights) <- etf_tickers
}
## WARNING: Residual covariance matrix is nearly singular, using a more robust approach
# ---- 7. Calculate portfolio statistics ----
# Calculate portfolio expected return and risk
capm_portfolio_return <- mean(returns_matrix %*% capm_weights)
capm_portfolio_risk <- sqrt(t(capm_weights) %*% cov_matrix %*% capm_weights)
capm_sharpe <- capm_portfolio_return / capm_portfolio_risk

# Calculate portfolio metrics for FF
ff_portfolio_return <- mean(returns_matrix %*% ff_weights)
ff_portfolio_risk <- sqrt(t(ff_weights) %*% cov_matrix %*% ff_weights)
ff_sharpe <- ff_portfolio_return / ff_portfolio_risk

# ---- 8. Get latest return data for out of sample test ----
# Find the latest date we have ETF data for
latest_etf_date <- max(etf_monthly_returns$date)

# Get returns for that date
latest_returns <- etf_monthly_returns %>%
  filter(date == latest_etf_date) %>%
  pivot_wider(names_from = symbol, values_from = monthly_return)

# Check if we have all tickers
missing_tickers <- setdiff(etf_tickers, names(latest_returns))

if (length(missing_tickers) > 0) {
  cat("\nWARNING: Missing data for these tickers in latest month:", paste(missing_tickers, collapse = ", "), "\n")
  # Fill with zeros as fallback
  for (ticker in missing_tickers) {
    latest_returns[[ticker]] <- 0
  }
}

latest_return_matrix <- latest_returns %>%
  select(all_of(etf_tickers)) %>%
  as.matrix()

# Calculate realized returns
realized_capm <- as.numeric(latest_return_matrix %*% capm_weights)
realized_ff <- as.numeric(latest_return_matrix %*% ff_weights)

# ---- 9. Print Final Results ----
cat("\n=== Minimum Variance Portfolio Analysis ===\n")
## 
## === Minimum Variance Portfolio Analysis ===
cat("Analysis period:", min(capm_window$date), "to", max(capm_window$date), "\n")
## Analysis period: 14638 to 20088
cat("Number of months:", nrow(capm_window), "\n\n")
## Number of months: 180
cat("CAPM Weights:\n")
## CAPM Weights:
print(round(capm_weights, 4))
##        [,1]
## SPY  1.0358
## QQQ -0.4280
## EEM -0.0092
## IWM -0.0137
## EFA  0.0905
## TLT  0.4839
## IYR -0.3321
## GLD  0.1727
cat("\nPortfolio Statistics (CAPM):\n")
## 
## Portfolio Statistics (CAPM):
cat("Expected Monthly Return:", round(capm_portfolio_return * 100, 4), "%\n")
## Expected Monthly Return: 0.5406 %
cat("Expected Monthly Risk:", round(capm_portfolio_risk * 100, 4), "%\n")
## Expected Monthly Risk: 2.3253 %
cat("Sharpe Ratio:", round(capm_sharpe, 4), "\n")
## Sharpe Ratio: 0.2325
cat("Realized Return (", format(latest_etf_date, "%B %Y"), "):", round(realized_capm * 100, 4), "%\n\n")
## Realized Return ( April 2025 ): -2.3393 %
cat("FF3 Weights:\n")
## FF3 Weights:
print(round(ff_weights, 4))
##        [,1]
## SPY  0.7141
## QQQ  0.0498
## EEM  0.0085
## IWM  0.2533
## EFA -0.0027
## TLT  0.0122
## IYR -0.0331
## GLD -0.0022
cat("\nPortfolio Statistics (FF3):\n")
## 
## Portfolio Statistics (FF3):
cat("Expected Monthly Return:", round(ff_portfolio_return * 100, 4), "%\n")
## Expected Monthly Return: 1.1285 %
cat("Expected Monthly Risk:", round(ff_portfolio_risk * 100, 4), "%\n")
## Expected Monthly Risk: 4.463 %
cat("Sharpe Ratio:", round(ff_sharpe, 4), "\n")
## Sharpe Ratio: 0.2529
cat("Realized Return (", format(latest_etf_date, "%B %Y"), "):", round(realized_ff * 100, 4), "%\n")
## Realized Return ( April 2025 ): -6.3171 %
# ---- 10. Create weight comparison chart ----
# Combine weights into a data frame
weight_comparison <- data.frame(
  ETF = etf_tickers,
  CAPM = as.numeric(capm_weights),
  FF3 = as.numeric(ff_weights)
)

# Convert to long format for plotting
weight_long <- weight_comparison %>%
  pivot_longer(cols = c(CAPM, FF3), names_to = "Model", values_to = "Weight")

# Plot
ggplot(weight_long, aes(x = ETF, y = Weight, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Portfolio Weights Comparison",
       subtitle = paste("Analysis period:", format(min(capm_window$date), "%b %Y"), "to", format(max(capm_window$date), "%b %Y")),
       x = "ETF Ticker",
       y = "Weight") +
  scale_fill_manual(values = c("CAPM" = "steelblue", "FF3" = "darkred")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save results to CSV
weight_comparison %>%
  write_csv("portfolio_weights.csv")