library(tidyquant)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.27 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(timetk)
##
## Attaching package: 'timetk'
##
## The following object is masked from 'package:tidyquant':
##
## FANG
library(lubridate)
library(PerformanceAnalytics)
# ---- 1. Import ETF Data ----
etf_tickers <- c("SPY", "QQQ", "EEM", "IWM", "EFA", "TLT", "IYR", "GLD")
etf_prices <- tq_get(etf_tickers,
from = "2010-01-01",
get = "stock.prices") %>%
select(symbol, date, adjusted)
# Print info about fetched data
cat("ETF data summary:\n")
## ETF data summary:
cat("Earliest date:", min(etf_prices$date), "\n")
## Earliest date: 14613
cat("Latest date:", max(etf_prices$date), "\n")
## Latest date: 20194
cat("Total observations:", nrow(etf_prices), "\n")
## Total observations: 30768
# Count observations by ticker
etf_counts <- etf_prices %>%
group_by(symbol) %>%
summarize(count = n(),
earliest = min(date),
latest = max(date))
print(etf_counts)
## # A tibble: 8 × 4
## symbol count earliest latest
## <chr> <int> <date> <date>
## 1 EEM 3846 2010-01-04 2025-04-16
## 2 EFA 3846 2010-01-04 2025-04-16
## 3 GLD 3846 2010-01-04 2025-04-16
## 4 IWM 3846 2010-01-04 2025-04-16
## 5 IYR 3846 2010-01-04 2025-04-16
## 6 QQQ 3846 2010-01-04 2025-04-16
## 7 SPY 3846 2010-01-04 2025-04-16
## 8 TLT 3846 2010-01-04 2025-04-16
# ---- 2. Compute Monthly Returns ----
etf_monthly_returns <- etf_prices %>%
group_by(symbol) %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "monthly",
col_rename = "monthly_return",
type = "arithmetic") %>%
ungroup()
# Check completeness of monthly data
monthly_completeness <- etf_monthly_returns %>%
group_by(date) %>%
summarize(tickers = n(),
complete = tickers == length(etf_tickers),
.groups = "drop")
complete_months <- sum(monthly_completeness$complete)
cat("\nMonthly returns summary:\n")
##
## Monthly returns summary:
cat("Earliest ETF month:", min(etf_monthly_returns$date), "\n")
## Earliest ETF month: 14638
cat("Latest ETF month:", max(etf_monthly_returns$date), "\n")
## Latest ETF month: 20194
cat("Total months with data:", length(unique(etf_monthly_returns$date)), "\n")
## Total months with data: 184
cat("Months with complete data for all tickers:", complete_months, "\n")
## Months with complete data for all tickers: 184
# ---- 3. Load and Process Fama-French 3 Factors ----
# Direct download method for FF data (more reliable than using packages)
ff_url <- "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip"
temp <- tempfile()
download.file(ff_url, temp)
ff_data <- read.csv(unz(temp, "F-F_Research_Data_Factors.CSV"), skip=3)
unlink(temp)
# Print raw data to diagnose issues
cat("\nRaw FF data head (first few lines):\n")
##
## Raw FF data head (first few lines):
print(head(ff_data, 10))
## X Mkt.RF SMB HML RF
## 1 192607 2.96 -2.56 -2.43 0.22
## 2 192608 2.64 -1.17 3.82 0.25
## 3 192609 0.36 -1.40 0.13 0.23
## 4 192610 -3.24 -0.09 0.70 0.32
## 5 192611 2.53 -0.10 -0.51 0.31
## 6 192612 2.62 -0.03 -0.05 0.28
## 7 192701 -0.06 -0.37 4.54 0.25
## 8 192702 4.18 0.04 2.94 0.26
## 9 192703 0.13 -1.65 -2.61 0.30
## 10 192704 0.46 0.30 0.81 0.25
# Clean up the data and carefully handle date formats
ff3_clean <- ff_data %>%
filter(!is.na(X)) %>%
filter(X != "") %>%
rename(date_str = X, Mkt_RF = Mkt.RF) %>%
mutate(
# Convert percentages to decimals
across(c(Mkt_RF, SMB, HML, RF), as.numeric),
across(c(Mkt_RF, SMB, HML, RF), ~ . / 100)
)
# Debugging: Print values to check
cat("\nFF data after numeric conversion (first few lines):\n")
##
## FF data after numeric conversion (first few lines):
print(head(ff3_clean, 10))
## date_str Mkt_RF SMB HML RF
## 1 192607 0.0296 -0.0256 -0.0243 0.0022
## 2 192608 0.0264 -0.0117 0.0382 0.0025
## 3 192609 0.0036 -0.0140 0.0013 0.0023
## 4 192610 -0.0324 -0.0009 0.0070 0.0032
## 5 192611 0.0253 -0.0010 -0.0051 0.0031
## 6 192612 0.0262 -0.0003 -0.0005 0.0028
## 7 192701 -0.0006 -0.0037 0.0454 0.0025
## 8 192702 0.0418 0.0004 0.0294 0.0026
## 9 192703 0.0013 -0.0165 -0.0261 0.0030
## 10 192704 0.0046 0.0030 0.0081 0.0025
# Handle date formats
ff3_clean <- ff3_clean %>%
mutate(
# First try to identify the date format
date_str = as.character(date_str),
date_len = nchar(date_str),
date = case_when(
# YYYYMM format (annual/monthly data)
date_len == 6 ~ as.Date(paste0(substr(date_str, 1, 6), "01"), format="%Y%m%d"),
# YYYYMMDD format (daily data)
date_len == 8 ~ as.Date(date_str, format="%Y%m%d"),
# Try parsing as YYYYMM without adding day
TRUE ~ as.Date(paste0(date_str, "01"), format="%Y%m%d")
)
) %>%
filter(!is.na(date)) %>% # Remove rows with invalid dates
select(-date_str, -date_len) # Clean up temporary columns
# Debugging: Print date conversions to check
cat("\nFF data after date conversion (first few lines):\n")
##
## FF data after date conversion (first few lines):
print(head(ff3_clean, 10))
## Mkt_RF SMB HML RF date
## 1 0.0296 -0.0256 -0.0243 0.0022 1926-07-01
## 2 0.0264 -0.0117 0.0382 0.0025 1926-08-01
## 3 0.0036 -0.0140 0.0013 0.0023 1926-09-01
## 4 -0.0324 -0.0009 0.0070 0.0032 1926-10-01
## 5 0.0253 -0.0010 -0.0051 0.0031 1926-11-01
## 6 0.0262 -0.0003 -0.0005 0.0028 1926-12-01
## 7 -0.0006 -0.0037 0.0454 0.0025 1927-01-01
## 8 0.0418 0.0004 0.0294 0.0026 1927-02-01
## 9 0.0013 -0.0165 -0.0261 0.0030 1927-03-01
## 10 0.0046 0.0030 0.0081 0.0025 1927-04-01
cat("\nFF data date range:", min(ff3_clean$date), "to", max(ff3_clean$date), "\n")
##
## FF data date range: -15890 to 20058
# Handle ETF dates properly - convert to month-end dates
etf_wide <- etf_monthly_returns %>%
# Add a column with just year and month for matching
mutate(year_month = floor_date(date, "month")) %>%
# We need this to match the FF data format
pivot_wider(names_from = symbol, values_from = monthly_return)
# Print to verify ETF dates
cat("\nETF data date format sample:\n")
##
## ETF data date format sample:
print(head(etf_wide[, c("date", "year_month")], 10))
## # A tibble: 10 × 2
## date year_month
## <date> <date>
## 1 2010-01-29 2010-01-01
## 2 2010-02-26 2010-02-01
## 3 2010-03-31 2010-03-01
## 4 2010-04-30 2010-04-01
## 5 2010-05-28 2010-05-01
## 6 2010-06-30 2010-06-01
## 7 2010-07-30 2010-07-01
## 8 2010-08-31 2010-08-01
## 9 2010-09-30 2010-09-01
## 10 2010-10-29 2010-10-01
# Try to match by year and month instead of exact date
# Add year_month column to FF data too
ff3_clean <- ff3_clean %>%
mutate(year_month = floor_date(date, "month"))
# Print to verify FF dates
cat("\nFF data date format sample:\n")
##
## FF data date format sample:
print(head(ff3_clean[, c("date", "year_month")], 10))
## date year_month
## 1 1926-07-01 1926-07-01
## 2 1926-08-01 1926-08-01
## 3 1926-09-01 1926-09-01
## 4 1926-10-01 1926-10-01
## 5 1926-11-01 1926-11-01
## 6 1926-12-01 1926-12-01
## 7 1927-01-01 1927-01-01
## 8 1927-02-01 1927-02-01
## 9 1927-03-01 1927-03-01
## 10 1927-04-01 1927-04-01
# Use year_month for merging
merged_data <- inner_join(etf_wide, ff3_clean, by = "year_month")
# Keep only the needed columns after merging
merged_data <- merged_data %>%
select(date.x, all_of(etf_tickers), Mkt_RF, SMB, HML, RF) %>%
rename(date = date.x)
# Check merged data
cat("\nMerged data summary:\n")
##
## Merged data summary:
cat("Rows in merged data:", nrow(merged_data), "\n")
## Rows in merged data: 180
if(nrow(merged_data) > 0) {
cat("Date range:", min(merged_data$date), "to", max(merged_data$date), "\n")
} else {
# Display diagnostic information
cat("No matching dates found. Here's what we have:\n")
cat("ETF dates (sample):", paste(head(sort(unique(etf_wide$year_month))), collapse=", "), "...\n")
cat("FF dates (sample):", paste(head(sort(unique(ff3_clean$year_month))), collapse=", "), "...\n")
# Try a last-resort approach: simplified monthly format for comparison
cat("\nAttempting date format comparison...\n")
etf_months <- format(etf_wide$year_month, "%Y-%m")
ff_months <- format(ff3_clean$year_month, "%Y-%m")
cat("ETF months format:", paste(head(unique(etf_months)), collapse=", "), "...\n")
cat("FF months format:", paste(head(unique(ff_months)), collapse=", "), "...\n")
common_months <- intersect(etf_months, ff_months)
cat("Common months count:", length(common_months), "\n")
if(length(common_months) > 0) {
cat("Common months (sample):", paste(head(common_months), collapse=", "), "...\n")
# Try one more merging approach based on string formatted year-month
etf_wide$month_str <- format(etf_wide$date, "%Y-%m")
ff3_clean$month_str <- format(ff3_clean$date, "%Y-%m")
merged_data <- inner_join(etf_wide, ff3_clean, by = "month_str")
# Keep only the needed columns after merging
if(nrow(merged_data) > 0) {
merged_data <- merged_data %>%
select(date.x, all_of(etf_tickers), Mkt_RF, SMB, HML, RF) %>%
rename(date = date.x)
cat("\nSuccessfully merged using string month format! Rows:", nrow(merged_data), "\n")
} else {
stop("All merge attempts failed. Please check data formats and date ranges.")
}
} else {
stop("No overlapping months between ETF and FF data. Cannot proceed.")
}
}# Remove rows with NA values
## Date range: 14638 to 20088
merged_clean <- merged_data %>% drop_na()
cat("Rows after removing NAs:", nrow(merged_clean), "\n")
## Rows after removing NAs: 180
# If we have at least 12 months of data, we'll work with it
min_months <- 12 # Lower threshold to 12 months
if (nrow(merged_clean) < min_months) {
stop(paste("Not enough valid monthly data. Have", nrow(merged_clean), "months, need at least", min_months))
} else {
cat("Proceeding with", nrow(merged_clean), "months of data\n")
capm_window <- merged_clean
}
## Proceeding with 180 months of data
# ---- 5. CAPM MVP ----
returns_matrix <- capm_window %>%
select(all_of(etf_tickers)) %>%
as.matrix()
# Print a sample of returns to verify data
cat("\nSample of monthly returns (first 5 rows):\n")
##
## Sample of monthly returns (first 5 rows):
print(round(returns_matrix[1:min(5,nrow(returns_matrix)),], 4))
## SPY QQQ EEM IWM EFA TLT IYR GLD
## [1,] -0.0524 -0.0782 -0.1037 -0.0605 -0.0749 0.0278 -0.0520 -0.0350
## [2,] 0.0312 0.0460 0.0178 0.0448 0.0027 -0.0034 0.0546 0.0327
## [3,] 0.0609 0.0771 0.0811 0.0823 0.0639 -0.0206 0.0975 -0.0044
## [4,] 0.0155 0.0224 -0.0017 0.0568 -0.0280 0.0332 0.0639 0.0588
## [5,] -0.0795 -0.0739 -0.0939 -0.0754 -0.1119 0.0511 -0.0568 0.0305
cov_matrix <- cov(returns_matrix)
cat("\nCovariance matrix summary:\n")
##
## Covariance matrix summary:
cat("Dimension:", nrow(cov_matrix), "x", ncol(cov_matrix), "\n")
## Dimension: 8 x 8
cat("Determinant:", det(cov_matrix), "\n")
## Determinant: 3.241609e-25
# Check if covariance matrix is invertible
if (det(cov_matrix) < 1e-10) {
cat("WARNING: Covariance matrix is nearly singular, using a more robust approach\n")
# Add a small constant to the diagonal
diag(cov_matrix) <- diag(cov_matrix) + 1e-5
}
## WARNING: Covariance matrix is nearly singular, using a more robust approach
ones <- rep(1, ncol(cov_matrix))
inv_cov <- solve(cov_matrix)
capm_weights <- inv_cov %*% ones / as.numeric(t(ones) %*% inv_cov %*% ones)
rownames(capm_weights) <- etf_tickers
# ---- 6. FF MVP using residuals from regression ----
# Create an empty matrix of the right dimensions
residual_matrix <- matrix(0, nrow = nrow(capm_window), ncol = length(etf_tickers))
colnames(residual_matrix) <- etf_tickers
# Fill in the matrix column by column
cat("\nProcessing FF3 regressions for each ETF:\n")
##
## Processing FF3 regressions for each ETF:
for (i in 1:length(etf_tickers)) {
asset <- etf_tickers[i]
cat("Processing", asset, "... ")
tryCatch({
# Prepare data for regression
lm_data <- capm_window %>%
select(date, all_of(asset), Mkt_RF, SMB, HML) %>%
rename(R = all_of(asset))
# Check for NAs
if (any(is.na(lm_data))) {
cat("NA values found, skipping\n")
next
}
# Fit model
model <- lm(R ~ Mkt_RF + SMB + HML, data = lm_data)
# Store residuals
residual_matrix[, i] <- residuals(model)
cat("done\n")
}, error = function(e) {
cat("ERROR:", e$message, "\n")
})
}
## Processing SPY ... done
## Processing QQQ ... done
## Processing EEM ... done
## Processing IWM ... done
## Processing EFA ... done
## Processing TLT ... done
## Processing IYR ... done
## Processing GLD ... done
# Check if we have valid residuals
if (all(residual_matrix == 0)) {
cat("\nWARNING: Could not calculate valid residuals. Using equal weights for FF3 portfolio.\n")
ff_weights <- matrix(1/length(etf_tickers), nrow = length(etf_tickers), ncol = 1)
rownames(ff_weights) <- etf_tickers
} else {
# Calculate residual covariance matrix
ff_cov_matrix <- cov(residual_matrix)
# Check if residual covariance matrix is invertible
if (det(ff_cov_matrix) < 1e-10) {
cat("WARNING: Residual covariance matrix is nearly singular, using a more robust approach\n")
# Add a small constant to the diagonal
diag(ff_cov_matrix) <- diag(ff_cov_matrix) + 1e-5
}
inv_cov_ff <- solve(ff_cov_matrix)
ff_weights <- inv_cov_ff %*% ones / as.numeric(t(ones) %*% inv_cov_ff %*% ones)
rownames(ff_weights) <- etf_tickers
}
## WARNING: Residual covariance matrix is nearly singular, using a more robust approach
# ---- 7. Calculate portfolio statistics ----
# Calculate portfolio expected return and risk
capm_portfolio_return <- mean(returns_matrix %*% capm_weights)
capm_portfolio_risk <- sqrt(t(capm_weights) %*% cov_matrix %*% capm_weights)
capm_sharpe <- capm_portfolio_return / capm_portfolio_risk
# Calculate portfolio metrics for FF
ff_portfolio_return <- mean(returns_matrix %*% ff_weights)
ff_portfolio_risk <- sqrt(t(ff_weights) %*% cov_matrix %*% ff_weights)
ff_sharpe <- ff_portfolio_return / ff_portfolio_risk
# ---- 8. Get latest return data for out of sample test ----
# Find the latest date we have ETF data for
latest_etf_date <- max(etf_monthly_returns$date)
# Get returns for that date
latest_returns <- etf_monthly_returns %>%
filter(date == latest_etf_date) %>%
pivot_wider(names_from = symbol, values_from = monthly_return)
# Check if we have all tickers
missing_tickers <- setdiff(etf_tickers, names(latest_returns))
if (length(missing_tickers) > 0) {
cat("\nWARNING: Missing data for these tickers in latest month:", paste(missing_tickers, collapse = ", "), "\n")
# Fill with zeros as fallback
for (ticker in missing_tickers) {
latest_returns[[ticker]] <- 0
}
}
latest_return_matrix <- latest_returns %>%
select(all_of(etf_tickers)) %>%
as.matrix()
# Calculate realized returns
realized_capm <- as.numeric(latest_return_matrix %*% capm_weights)
realized_ff <- as.numeric(latest_return_matrix %*% ff_weights)
# ---- 9. Print Final Results ----
cat("\n=== Minimum Variance Portfolio Analysis ===\n")
##
## === Minimum Variance Portfolio Analysis ===
cat("Analysis period:", min(capm_window$date), "to", max(capm_window$date), "\n")
## Analysis period: 14638 to 20088
cat("Number of months:", nrow(capm_window), "\n\n")
## Number of months: 180
cat("CAPM Weights:\n")
## CAPM Weights:
print(round(capm_weights, 4))
## [,1]
## SPY 1.0358
## QQQ -0.4280
## EEM -0.0092
## IWM -0.0137
## EFA 0.0905
## TLT 0.4839
## IYR -0.3321
## GLD 0.1727
cat("\nPortfolio Statistics (CAPM):\n")
##
## Portfolio Statistics (CAPM):
cat("Expected Monthly Return:", round(capm_portfolio_return * 100, 4), "%\n")
## Expected Monthly Return: 0.5406 %
cat("Expected Monthly Risk:", round(capm_portfolio_risk * 100, 4), "%\n")
## Expected Monthly Risk: 2.3253 %
cat("Sharpe Ratio:", round(capm_sharpe, 4), "\n")
## Sharpe Ratio: 0.2325
cat("Realized Return (", format(latest_etf_date, "%B %Y"), "):", round(realized_capm * 100, 4), "%\n\n")
## Realized Return ( April 2025 ): -2.3393 %
cat("FF3 Weights:\n")
## FF3 Weights:
print(round(ff_weights, 4))
## [,1]
## SPY 0.7141
## QQQ 0.0498
## EEM 0.0085
## IWM 0.2533
## EFA -0.0027
## TLT 0.0122
## IYR -0.0331
## GLD -0.0022
cat("\nPortfolio Statistics (FF3):\n")
##
## Portfolio Statistics (FF3):
cat("Expected Monthly Return:", round(ff_portfolio_return * 100, 4), "%\n")
## Expected Monthly Return: 1.1285 %
cat("Expected Monthly Risk:", round(ff_portfolio_risk * 100, 4), "%\n")
## Expected Monthly Risk: 4.463 %
cat("Sharpe Ratio:", round(ff_sharpe, 4), "\n")
## Sharpe Ratio: 0.2529
cat("Realized Return (", format(latest_etf_date, "%B %Y"), "):", round(realized_ff * 100, 4), "%\n")
## Realized Return ( April 2025 ): -6.3171 %
# ---- 10. Create weight comparison chart ----
# Combine weights into a data frame
weight_comparison <- data.frame(
ETF = etf_tickers,
CAPM = as.numeric(capm_weights),
FF3 = as.numeric(ff_weights)
)
# Convert to long format for plotting
weight_long <- weight_comparison %>%
pivot_longer(cols = c(CAPM, FF3), names_to = "Model", values_to = "Weight")
# Plot
ggplot(weight_long, aes(x = ETF, y = Weight, fill = Model)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Portfolio Weights Comparison",
subtitle = paste("Analysis period:", format(min(capm_window$date), "%b %Y"), "to", format(max(capm_window$date), "%b %Y")),
x = "ETF Ticker",
y = "Weight") +
scale_fill_manual(values = c("CAPM" = "steelblue", "FF3" = "darkred")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save results to CSV
weight_comparison %>%
write_csv("portfolio_weights.csv")