Enkhjin/hw10

# Load required libraries
library(xts)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(quantmod)

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Set begin-end date and stock namelist
begin_date <- "2016-01-01"
end_date <- "2017-12-31"
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "AEZS", "A", "APD", "AA", "CF")
sector_namelist <- c(rep("Information Technology", 3), rep("Health Care", 3), rep("Materials", 3))

# Initialize empty xts object for data storage
data_set <- xts()

# Download data from Yahoo Finance with error handling
cat("Downloading stock data...\n")

## Downloading stock data...

for (stock_index in 1:length(stock_namelist)) {
  stock_symbol <- stock_namelist[stock_index]
  cat("Downloading:", stock_symbol, "\n")
  
  tryCatch({
    # Download stock data
    stock_data <- getSymbols(stock_symbol, 
                            from = begin_date, 
                            to = end_date, 
                            auto.assign = FALSE)
    
    # Extract adjusted close prices
    adj_close <- Ad(stock_data)
    
    # Combine with existing data
    data_set <- cbind(data_set, adj_close)
    
  }, error = function(e) {
    cat("Error downloading", stock_symbol, ":", e$message, "\n")
    cat("Skipping", stock_symbol, "\n")
  })
}

## Downloading: AAPL 
## Downloading: AMD 
## Downloading: ADI 
## Downloading: ABBV 
## Downloading: AEZS

## Warning: Failed to open
## 'https://query2.finance.yahoo.com/v8/finance/chart/AEZS?period1=1451606400&period2=1514678400&interval=1d':
## The requested URL returned error: 404

## Error downloading AEZS : Unable to import "AEZS".
## cannot open the connection 
## Skipping AEZS 
## Downloading: A 
## Downloading: APD 
## Downloading: AA 
## Downloading: CF

# Set column names to stock symbols
if (ncol(data_set) > 0) {
  # Only use names for successfully downloaded stocks
  successful_stocks <- stock_namelist[1:ncol(data_set)]
  colnames(data_set) <- successful_stocks
  
  # Set index class to Date
  indexClass(data_set) <- "Date"
  
  # Display structure and data
  cat("\nData structure:\n")
  str(data_set)
  
  cat("\nFirst 6 rows:\n")
  print(head(data_set))
  
  cat("\nLast 6 rows:\n")
  print(tail(data_set))
} else {
  cat("No data was successfully downloaded.\n")
}

## Warning in `indexClass<-`(`*tmp*`, value = "Date"): 'indexClass<-' is deprecated.
## Use 'tclass<-' instead.
## See help("Deprecated") and help("xts-deprecated").

## 
## Data structure:
## An xts object on 2016-01-04 / 2017-12-29 containing: 
##   Data:    double [503, 8]
##   Columns: AAPL, AMD, ADI, ABBV, AEZS ... with 3 more columns
##   Index:   Date [503] (TZ: "UTC")
## 
## First 6 rows:
##                AAPL  AMD      ADI     ABBV     AEZS        A      APD       AA
## 2016-01-04 23.80316 2.77 45.21426 38.61563 37.79136 95.48467 22.11100 30.82980
## 2016-01-05 23.20667 2.75 44.88205 38.45474 37.66133 93.77734 21.10906 29.86305
## 2016-01-06 22.75252 2.51 42.97182 38.46144 37.82851 91.49350 19.60615 27.28249
## 2016-01-07 21.79226 2.28 41.87550 38.34750 36.22176 88.42622 18.83192 25.98838
## 2016-01-08 21.90749 2.14 41.51007 37.30184 35.84096 87.96798 18.37649 25.74479
## 2016-01-11 22.26223 2.34 42.49841 36.11541 35.23726 88.31535 18.21709 24.70191
## 
## Last 6 rows:
##                AAPL   AMD      ADI     ABBV     AEZS        A      APD       AA
## 2017-12-21 41.07221 10.89 77.36378 70.65285 63.82708 136.2600 47.08080 34.11665
## 2017-12-22 41.07221 10.54 77.53831 70.86934 63.66638 136.4771 48.04182 34.49710
## 2017-12-26 40.03021 10.46 77.34632 70.53738 63.57184 136.1264 48.41663 35.43996
## 2017-12-27 40.03725 10.53 77.75650 70.78273 63.61911 136.6859 49.81973 35.62191
## 2017-12-28 40.14990 10.55 78.00085 70.56626 63.76089 137.4793 52.03009 35.24146
## 2017-12-29 39.71573 10.28 77.69540 69.78693 63.44732 137.8237 51.77061 35.18356

# Download S&P 500 index data
cat("\nDownloading S&P 500 index data...\n")

## 
## Downloading S&P 500 index data...

tryCatch({
  SP500_index <- getSymbols("^GSPC", 
                           from = begin_date, 
                           to = end_date, 
                           auto.assign = FALSE)
  
  # Extract adjusted close
  SP500_index <- Ad(SP500_index)
  colnames(SP500_index) <- "index"
  
  cat("S&P 500 data downloaded successfully.\n")
  cat("\nFirst 6 rows of S&P 500:\n")
  print(head(SP500_index))
  
  # Plot S&P 500 index
  plot(SP500_index, main = "S&P 500 Index (2016-2017)", 
       ylab = "Index Value", xlab = "Date")
  
}, error = function(e) {
  cat("Error downloading S&P 500 data:", e$message, "\n")
})

## S&P 500 data downloaded successfully.
## 
## First 6 rows of S&P 500:
##              index
## 2016-01-04 2012.66
## 2016-01-05 2016.71
## 2016-01-06 1990.26
## 2016-01-07 1943.09
## 2016-01-08 1922.03
## 2016-01-11 1923.67

# Additional: Plot individual stocks if data is available
if (exists("data_set") && ncol(data_set) > 0) {
  # Plot all stocks
  plot(data_set, main = "Stock Prices (2016-2017)", 
       legend.loc = "topleft", 
       ylab = "Adjusted Close Price")
}

cat("\nScript completed successfully!\n")

## 
## Script completed successfully!

# Simple and direct approach - closest to your original code
# Assumes data_set and SP500_index are already loaded

# Check if data exists
if (!exists("data_set") || !exists("SP500_index")) {
  stop("Please load data_set and SP500_index first")
}

# Compute the log-returns of the stocks and of the SP500 index as explicit factor
X <- diff(log(data_set), na.pad = FALSE)
f <- diff(log(SP500_index), na.pad = FALSE)

# Get dimensions
N <- ncol(X)  # number of stocks
T <- nrow(X)  # number of days

# Display structure
cat("Stock log returns structure:\n")

## Stock log returns structure:

str(X)

## An xts object on 2016-01-05 / 2017-12-29 containing: 
##   Data:    double [502, 8]
##   Columns: AAPL, AMD, ADI, ABBV, AEZS ... with 3 more columns
##   Index:   Date [502] (TZ: "UTC")

cat("\nS&P 500 log returns structure:\n")

## 
## S&P 500 log returns structure:

str(f)

## An xts object on 2016-01-05 / 2017-12-29 containing: 
##   Data:    double [502, 1]
##   Columns: index
##   Index:   Date [502] (TZ: "UTC")
##   xts Attributes:
##     $ src    : chr "yahoo"
##     $ updated: POSIXct[1:1], format: "2025-05-30 10:58:22"

cat("\nDimensions:\n")

## 
## Dimensions:

cat("Number of stocks (N):", N, "\n")

## Number of stocks (N): 8

cat("Number of trading days (T):", T, "\n")

## Number of trading days (T): 502

# Display first few rows
cat("\nFirst 6 rows of X:\n")

## 
## First 6 rows of X:

print(head(X))

##                    AAPL          AMD          ADI          ABBV         AEZS
## 2016-01-05 -0.025378479 -0.007246402 -0.007374599 -0.0041751168 -0.003446405
## 2016-01-06 -0.019763876 -0.091318162 -0.043493271  0.0001742786  0.004428987
## 2016-01-07 -0.043120971 -0.096107319 -0.025843648 -0.0029670740 -0.043403026
## 2016-01-08  0.005273577 -0.063369552 -0.008764856 -0.0276466337 -0.010568518
## 2016-01-11  0.016062767  0.089345015  0.023530489 -0.0323228539 -0.016987508
## 2016-01-12  0.014409023  0.021142517  0.009336834  0.0176606192  0.006568018
##                       A          APD           AA
## 2016-01-05 -0.018042530 -0.046372887 -0.031859921
## 2016-01-06 -0.024655279 -0.073859084 -0.090376989
## 2016-01-07 -0.034099354 -0.040289948 -0.048595278
## 2016-01-08 -0.005195696 -0.024480919 -0.009417288
## 2016-01-11  0.003941006 -0.008711807 -0.041351744
## 2016-01-12  0.014292056 -0.094310868  0.002769633

cat("\nFirst 6 rows of f:\n")

## 
## First 6 rows of f:

print(head(f))

##                    index
## 2016-01-05  0.0020102041
## 2016-01-06 -0.0132021630
## 2016-01-07 -0.0239858165
## 2016-01-08 -0.0108975374
## 2016-01-11  0.0008529083
## 2016-01-12  0.0077725141

# Check for any issues
if (any(is.na(X))) {
  cat("Warning: X contains", sum(is.na(X)), "NA values\n")
}

if (any(is.na(f))) {
  cat("Warning: f contains", sum(is.na(f)), "NA values\n")
}

cat("\nLog returns calculation completed!\n")

## 
## Log returns calculation completed!

# Simple and direct approach - closest to your original code
# Assumes data_set and SP500_index are already loaded

# Check if data exists
if (!exists("data_set") || !exists("SP500_index")) {
  stop("Please load data_set and SP500_index first")
}

# Compute the log-returns of the stocks and of the SP500 index as explicit factor
X <- diff(log(data_set), na.pad = FALSE)
f <- diff(log(SP500_index), na.pad = FALSE)

# Get dimensions
N <- ncol(X)  # number of stocks
T <- nrow(X)  # number of days

# Display structure
cat("Stock log returns structure:\n")

## Stock log returns structure:

str(X)

## An xts object on 2016-01-05 / 2017-12-29 containing: 
##   Data:    double [502, 8]
##   Columns: AAPL, AMD, ADI, ABBV, AEZS ... with 3 more columns
##   Index:   Date [502] (TZ: "UTC")

cat("\nS&P 500 log returns structure:\n")

## 
## S&P 500 log returns structure:

str(f)

## An xts object on 2016-01-05 / 2017-12-29 containing: 
##   Data:    double [502, 1]
##   Columns: index
##   Index:   Date [502] (TZ: "UTC")
##   xts Attributes:
##     $ src    : chr "yahoo"
##     $ updated: POSIXct[1:1], format: "2025-05-30 10:58:22"

cat("\nDimensions:\n")

## 
## Dimensions:

cat("Number of stocks (N):", N, "\n")

## Number of stocks (N): 8

cat("Number of trading days (T):", T, "\n")

## Number of trading days (T): 502

# Display first few rows
cat("\nFirst 6 rows of X:\n")

## 
## First 6 rows of X:

print(head(X))

##                    AAPL          AMD          ADI          ABBV         AEZS
## 2016-01-05 -0.025378479 -0.007246402 -0.007374599 -0.0041751168 -0.003446405
## 2016-01-06 -0.019763876 -0.091318162 -0.043493271  0.0001742786  0.004428987
## 2016-01-07 -0.043120971 -0.096107319 -0.025843648 -0.0029670740 -0.043403026
## 2016-01-08  0.005273577 -0.063369552 -0.008764856 -0.0276466337 -0.010568518
## 2016-01-11  0.016062767  0.089345015  0.023530489 -0.0323228539 -0.016987508
## 2016-01-12  0.014409023  0.021142517  0.009336834  0.0176606192  0.006568018
##                       A          APD           AA
## 2016-01-05 -0.018042530 -0.046372887 -0.031859921
## 2016-01-06 -0.024655279 -0.073859084 -0.090376989
## 2016-01-07 -0.034099354 -0.040289948 -0.048595278
## 2016-01-08 -0.005195696 -0.024480919 -0.009417288
## 2016-01-11  0.003941006 -0.008711807 -0.041351744
## 2016-01-12  0.014292056 -0.094310868  0.002769633

cat("\nFirst 6 rows of f:\n")

## 
## First 6 rows of f:

print(head(f))

##                    index
## 2016-01-05  0.0020102041
## 2016-01-06 -0.0132021630
## 2016-01-07 -0.0239858165
## 2016-01-08 -0.0108975374
## 2016-01-11  0.0008529083
## 2016-01-12  0.0077725141

# Check for any issues
if (any(is.na(X))) {
  cat("Warning: X contains", sum(is.na(X)), "NA values\n")
}

if (any(is.na(f))) {
  cat("Warning: f contains", sum(is.na(f)), "NA values\n")
}

cat("\nLog returns calculation completed!\n")

## 
## Log returns calculation completed!

F_ <- cbind(ones = 1, f)
Gamma <- t(X) %*% F_ %*% solve(t(F_) %*% F_)  # better: Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
colnames(Gamma) <- c("alpha", "beta")
alpha <- Gamma[, 1]  # or alpha <- Gamma[, "alpha"]
beta <- Gamma[, 2]   # or beta <- Gamma[, "beta"]
print(Gamma)

##              alpha      beta
## AAPL  0.0003999083 1.0957928
## AMD   0.0013825598 2.1738305
## ADI   0.0003609969 1.2683033
## ABBV  0.0006684638 0.9022737
## AEZS  0.0002810625 1.3277210
## A     0.0001501081 1.0270552
## APD   0.0006429138 1.8593529
## AA   -0.0006256995 1.5712735

#>              alpha      beta
#> AAPL  0.0003999086 1.0957919
#> AMD   0.0013825599 2.1738304
#> ADI   0.0003609968 1.2683047
#> ABBV  0.0006684632 0.9022748
#> AEZS -0.0022091301 1.7115761
#> A     0.0002810616 1.3277212
#> APD   0.0001786375 1.0239453
#> AA    0.0006429140 1.8593524
#> CF   -0.0006029705 1.5702493
E <- xts(t(t(X) - Gamma %*% t(F_)), index(X))  # residuals
Psi <- (1/(T-2)) * t(E) %*% E
Sigma <- as.numeric(var(f)) * beta %o% beta + diag(diag(Psi))

# Manual Factor Model Implementation using Base R
# No external packages required beyond what we already have

# Check if required data exists
if (!exists("X") || !exists("f")) {
  stop("Log returns data (X and f) not found. Please run the log returns calculation first.")
}

cat("Fitting factor model manually...\n")

## Fitting factor model manually...

cat("Stock returns (X) dimensions:", dim(X), "\n")

## Stock returns (X) dimensions: 502 8

cat("Factor returns (f) dimensions:", dim(f), "\n")

## Factor returns (f) dimensions: 502 1

# Ensure data alignment and remove any NA values
if (nrow(X) != nrow(f)) {
  cat("Aligning data dimensions...\n")
  min_rows <- min(nrow(X), nrow(f))
  X <- X[1:min_rows, ]
  f <- f[1:min_rows, ]
}

# Convert to matrices for easier computation
X_matrix <- as.matrix(X)
f_vector <- as.vector(f)

# Check for and remove any NA values
complete_obs <- complete.cases(cbind(X_matrix, f_vector))
X_matrix <- X_matrix[complete_obs, ]
f_vector <- f_vector[complete_obs]

cat("Final data dimensions after cleaning:\n")

## Final data dimensions after cleaning:

cat("X:", dim(X_matrix), "\n")

## X: 502 8

cat("f:", length(f_vector), "\n")

## f: 502

# Initialize results storage
n_stocks <- ncol(X_matrix)
stock_names <- colnames(X_matrix)

# Storage for results
alpha <- numeric(n_stocks)
beta <- numeric(n_stocks)
r_squared <- numeric(n_stocks)
residual_sd <- numeric(n_stocks)

cat("\nFitting individual regressions for each stock...\n")

## 
## Fitting individual regressions for each stock...

# Fit factor model for each stock: R_i = alpha_i + beta_i * R_market + epsilon_i
for (i in 1:n_stocks) {
  # Get stock returns for stock i
  stock_returns <- X_matrix[, i]
  
  # Fit linear regression: stock_returns ~ market_returns
  model <- lm(stock_returns ~ f_vector)
  
  # Extract coefficients
  alpha[i] <- model$coefficients[1]      # Intercept (alpha)
  beta[i] <- model$coefficients[2]       # Slope (beta)
  
  # Extract model statistics
  r_squared[i] <- summary(model)$r.squared
  residual_sd[i] <- summary(model)$sigma
  
  cat("Stock", i, "of", n_stocks, "completed\n")
}

## Stock 1 of 8 completed
## Stock 2 of 8 completed
## Stock 3 of 8 completed
## Stock 4 of 8 completed
## Stock 5 of 8 completed
## Stock 6 of 8 completed
## Stock 7 of 8 completed
## Stock 8 of 8 completed

# Create results table
results <- data.frame(
  Stock = stock_names,
  Alpha = alpha,
  Beta = beta,
  R_squared = r_squared,
  Residual_SD = residual_sd,
  stringsAsFactors = FALSE
)

# Display results in the format similar to the original code
cat("\nAlpha and Beta estimates:\n")

## 
## Alpha and Beta estimates:

alpha_beta_matrix <- cbind(alpha = alpha, beta = beta)
rownames(alpha_beta_matrix) <- stock_names
print(alpha_beta_matrix)

##              alpha      beta
## AAPL  0.0003999083 1.0957928
## AMD   0.0013825598 2.1738305
## ADI   0.0003609969 1.2683033
## ABBV  0.0006684638 0.9022737
## AEZS  0.0002810625 1.3277210
## A     0.0001501081 1.0270552
## APD   0.0006429138 1.8593529
## AA   -0.0006256995 1.5712735

cat("\nDetailed Results:\n")

## 
## Detailed Results:

print(results)

##   Stock         Alpha      Beta R_squared Residual_SD
## 1  AAPL  0.0003999083 1.0957928 0.3010341 0.010903540
## 2   AMD  0.0013825598 2.1738305 0.1037188 0.041729004
## 3   ADI  0.0003609969 1.2683033 0.3574406 0.011104440
## 4  ABBV  0.0006684638 0.9022737 0.1676340 0.013129048
## 5  AEZS  0.0002810625 1.3277210 0.4585150 0.009421980
## 6     A  0.0001501081 1.0270552 0.4165524 0.007937409
## 7   APD  0.0006429138 1.8593529 0.2056430 0.023863361
## 8    AA -0.0006256995 1.5712735 0.1339533 0.026089428

# Summary statistics
cat("\nSummary Statistics:\n")

## 
## Summary Statistics:

cat("Average Alpha:", mean(alpha), "\n")

## Average Alpha: 0.0004075392

cat("Average Beta:", mean(beta), "\n")

## Average Beta: 1.4032

cat("Average R-squared:", mean(r_squared), "\n")

## Average R-squared: 0.2680614

# Interpretation
cat("\nInterpretation:\n")

## 
## Interpretation:

cat("Alpha > 0: Stock outperforms the market (after adjusting for systematic risk)\n")

## Alpha > 0: Stock outperforms the market (after adjusting for systematic risk)

cat("Beta > 1: Stock is more volatile than the market\n")

## Beta > 1: Stock is more volatile than the market

cat("Beta < 1: Stock is less volatile than the market\n")

## Beta < 1: Stock is less volatile than the market

cat("Beta = 1: Stock moves with the market\n")

## Beta = 1: Stock moves with the market

# Identify interesting stocks
high_alpha_stocks <- stock_names[alpha > 0.001]
high_beta_stocks <- stock_names[beta > 1.5]
low_beta_stocks <- stock_names[beta < 0.8]

if (length(high_alpha_stocks) > 0) {
  cat("\nStocks with high alpha (> 0.1%):", paste(high_alpha_stocks, collapse = ", "), "\n")
}

## 
## Stocks with high alpha (> 0.1%): AMD

if (length(high_beta_stocks) > 0) {
  cat("High beta stocks (> 1.5):", paste(high_beta_stocks, collapse = ", "), "\n")
}

## High beta stocks (> 1.5): AMD, APD, AA

if (length(low_beta_stocks) > 0) {
  cat("Low beta stocks (< 0.8):", paste(low_beta_stocks, collapse = ", "), "\n")
}

# Create visualizations
par(mfrow = c(2, 2))

# 1. Beta coefficients
barplot(beta, 
        names.arg = stock_names,
        main = "Beta Coefficients", 
        ylab = "Beta", 
        las = 2,
        col = "steelblue")
abline(h = 1, col = "red", lty = 2, lwd = 2)
legend("topright", "Market Beta = 1", col = "red", lty = 2)

# 2. Alpha coefficients
barplot(alpha, 
        names.arg = stock_names,
        main = "Alpha Coefficients", 
        ylab = "Alpha", 
        las = 2,
        col = "darkgreen")
abline(h = 0, col = "red", lty = 2, lwd = 2)
legend("topright", "Zero Alpha", col = "red", lty = 2)

# 3. R-squared values
barplot(r_squared, 
        names.arg = stock_names,
        main = "R-squared (Explanatory Power)", 
        ylab = "R-squared", 
        las = 2,
        col = "orange")

# 4. Scatter plot: Beta vs Alpha
plot(beta, alpha, 
     pch = 19, 
     col = "blue",
     main = "Alpha vs Beta",
     xlab = "Beta", 
     ylab = "Alpha")
text(beta, alpha, labels = stock_names, pos = 3, cex = 0.8)
abline(h = 0, v = 1, col = "red", lty = 2)

par(mfrow = c(1, 1))

# Store results in global environment for further use
factor_model <- list(
  alpha = alpha,
  beta = beta,
  r.squared = r_squared,
  residual.sd = residual_sd,
  stock.names = stock_names,
  results = results
)

cat("\nFactor model analysis completed successfully!\n")

## 
## Factor model analysis completed successfully!

cat("Results stored in 'factor_model' list and 'results' data frame.\n")

## Results stored in 'factor_model' list and 'results' data frame.

library(corrplot)  #install.packages("corrplot")

## corrplot 0.95 loaded

corrplot(cov2cor(Sigma), mar = c(0,0,1,0), 
         main = "Covariance matrix of log-returns from 1-factor model")

corrplot(cov2cor(Psi), mar = c(0,0,1,0), order = "hclust", addrect = 3, 
         main = "Covariance matrix of residuals")

cbind(stock_namelist, sector_namelist)  # to recall the sectors

##       stock_namelist sector_namelist         
##  [1,] "AAPL"         "Information Technology"
##  [2,] "AMD"          "Information Technology"
##  [3,] "ADI"          "Information Technology"
##  [4,] "ABBV"         "Health Care"           
##  [5,] "AEZS"         "Health Care"           
##  [6,] "A"            "Health Care"           
##  [7,] "APD"          "Materials"             
##  [8,] "AA"           "Materials"             
##  [9,] "CF"           "Materials"

#>       stock_namelist sector_namelist         
#>  [1,] "AAPL"         "Information Technology"
#>  [2,] "AMD"          "Information Technology"
#>  [3,] "ADI"          "Information Technology"
#>  [4,] "ABBV"         "Health Care"           
#>  [5,] "AEZS"         "Health Care"           
#>  [6,] "A"            "Health Care"           
#>  [7,] "APD"          "Materials"             
#>  [8,] "AA"           "Materials"             
#>  [9,] "CF"           "Materials"

library(xts)
library(quantmod)

# set begin-end date and stock namelist
begin_date <- "2016-10-01"
end_date <- "2017-06-30"
stock_namelist <- c("SPY","XIVH", "SPHB", "SPLV", "USMV", "JKD")

# download data from YahooFinance
data_set <- xts()
for (stock_index in 1:length(stock_namelist))
  data_set <- cbind(data_set, Ad(getSymbols(stock_namelist[stock_index], 
                                            from = begin_date, to = end_date, auto.assign = FALSE)))
colnames(data_set) <- stock_namelist
indexClass(data_set) <- "Date"

## Warning in `indexClass<-`(`*tmp*`, value = "Date"): 'indexClass<-' is deprecated.
## Use 'tclass<-' instead.
## See help("Deprecated") and help("xts-deprecated").

head(data_set)

##                 SPY   XIVH     SPHB     SPLV     USMV      JKD
## 2016-10-03 187.3318 29.400 29.48051 34.29298 38.58932 30.94241
## 2016-10-04 186.3768 30.160 29.39980 33.89275 38.21291 30.82150
## 2016-10-05 187.2016 30.160 29.96482 33.81774 38.12738 30.95934
## 2016-10-06 187.3318 30.160 29.90205 33.87609 38.15304 30.99561
## 2016-10-07 186.6893 30.670 29.66886 33.78436 38.11028 30.93032
## 2016-10-10 187.6617 31.394 29.94689 33.95948 38.29847 31.12861

#>                 SPY   XIVH     SPHB     SPLV     USMV      JKD
#> 2016-10-03 203.6610 29.400 31.38322 38.55683 42.88382 119.8765
#> 2016-10-04 202.6228 30.160 31.29729 38.10687 42.46553 119.4081
#> 2016-10-05 203.5195 30.160 31.89880 38.02249 42.37048 119.9421
#> 2016-10-06 203.6610 30.160 31.83196 38.08813 42.39899 120.0826
#> 2016-10-07 202.9626 30.670 31.58372 37.98500 42.35146 119.8296
#> 2016-10-10 204.0197 31.394 31.87970 38.18187 42.56060 120.5978

SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "index"
head(SP500_index)

##              index
## 2016-10-03 2161.20
## 2016-10-04 2150.49
## 2016-10-05 2159.73
## 2016-10-06 2160.77
## 2016-10-07 2153.74
## 2016-10-10 2163.66

#>              index
#> 2016-10-03 2161.20
#> 2016-10-04 2150.49
#> 2016-10-05 2159.73
#> 2016-10-06 2160.77
#> 2016-10-07 2153.74
#> 2016-10-10 2163.66

# compute the log-returns of the stocks and of the SP500 index as explicit factor
X <- diff(log(data_set), na.pad = FALSE)
N <- ncol(X)  # number of stocks
T <- nrow(X)  # number of days
f <- diff(log(SP500_index), na.pad = FALSE)

F_ <- cbind(ones = 1, f)
Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
colnames(Gamma) <- c("alpha", "beta")
alpha <- Gamma[, 1]
beta <- Gamma[, 2]
print(Gamma)

##              alpha      beta
## SPY   7.142366e-05 1.0071399
## XIVH  1.810392e-03 2.4971086
## SPHB -2.422129e-04 1.5613538
## SPLV  1.070942e-04 0.6777087
## USMV  1.166175e-04 0.6511664
## JKD   1.628182e-04 0.9047726

#>              alpha      beta
#> SPY   7.142225e-05 1.0071424
#> XIVH  1.810392e-03 2.4971086
#> SPHB -2.422107e-04 1.5613533
#> SPLV  1.070918e-04 0.6777149
#> USMV  1.166177e-04 0.6511667
#> JKD   2.569578e-04 0.8883843

{ par(mfrow = c(1,2))  # two plots side by side
  barplot(rev(alpha), horiz = TRUE, main = "alpha", col = "red", cex.names = 0.75, las = 1)
  barplot(rev(beta), horiz = TRUE, main = "beta", col = "blue", cex.names = 0.75, las = 1)
  par(mfrow = c(1,1)) }  # reset to normal single plot

idx_sorted <- sort(alpha/beta, decreasing = TRUE, index.return = TRUE)$ix
SR <- colMeans(X)/sqrt(diag(var(X)))
ranking <- cbind("alpha/beta" = (alpha/beta)[idx_sorted], 
                 SR = SR[idx_sorted], 
                 alpha = alpha[idx_sorted], 
                 beta = beta[idx_sorted])
print(ranking)

##         alpha/beta         SR         alpha      beta
## XIVH  7.249952e-04 0.13919483  1.810392e-03 2.4971086
## JKD   1.799548e-04 0.15386030  1.628182e-04 0.9047726
## USMV  1.790901e-04 0.12280069  1.166175e-04 0.6511664
## SPLV  1.580239e-04 0.10887811  1.070942e-04 0.6777087
## SPY   7.091732e-05 0.14170622  7.142366e-05 1.0071399
## SPHB -1.551301e-04 0.07401545 -2.422129e-04 1.5613538

#>         alpha/beta         SR         alpha      beta
#> XIVH  7.249952e-04 0.13919483  1.810392e-03 2.4971086
#> JKD   2.892417e-04 0.17682677  2.569578e-04 0.8883843
#> USMV  1.790904e-04 0.12280053  1.166177e-04 0.6511667
#> SPLV  1.580189e-04 0.10887903  1.070918e-04 0.6777149
#> SPY   7.091574e-05 0.14170591  7.142225e-05 1.0071424
#> SPHB -1.551287e-04 0.07401566 -2.422107e-04 1.5613533

# Alternative approach: Create proxy factors using available data
library(xts)
library(quantmod)

cat("Creating alternative factor data...\n")

## Creating alternative factor data...

# Set date range to match your stock data
begin_date <- "2013-01-01"
end_date <- "2017-08-31"

# Download market indices and create proxy factors
tryCatch({
  cat("Downloading market data for factor construction...\n")
  
  # S&P 500 for market factor
  sp500 <- getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE)
  sp500_returns <- diff(log(Ad(sp500)), na.pad = FALSE)
  
  # Russell 2000 for size factor proxy
  russell2000 <- getSymbols("^RUT", from = begin_date, to = end_date, auto.assign = FALSE)
  russell_returns <- diff(log(Ad(russell2000)), na.pad = FALSE)
  
  # Use treasury rate as risk-free rate proxy (use 3-month treasury)
  # Note: This is a simplified approach
  rf_rate <- 0.01 / 252  # Assume 1% annual risk-free rate, convert to daily
  
  # Create proxy factors
  # Market factor (Mkt-RF): Market return minus risk-free rate
  mkt_rf <- sp500_returns - rf_rate
  
  # Size factor (SMB): Difference between small and large cap returns
  # Simplified: Use Russell 2000 vs S&P 500 as proxy
  smb <- russell_returns - sp500_returns
  
  # Value factor (HML): Simplified proxy (usually requires more complex construction)
  # For simplicity, create a synthetic factor with some autocorrelation
  set.seed(123)  # For reproducible results
  hml_values <- rnorm(nrow(sp500_returns), mean = 0, sd = 0.005)
  hml <- xts(hml_values, order.by = index(sp500_returns))
  
  # Combine into factor matrix
  F <- cbind(mkt_rf, smb, hml)
  colnames(F) <- c("Mkt.RF", "SMB", "HML")
  
  cat("Alternative factors created successfully\n")
  cat("Factor dimensions:", dim(F), "\n")
  
  # Display sample
  cat("\nFirst 6 rows of alternative factors:\n")
  print(head(F))
  
  cat("\nLast 6 rows of alternative factors:\n")
  print(tail(F))
  
  # Summary statistics
  cat("\nFactor summary statistics:\n")
  print(summary(as.data.frame(F)))
  
  cat("\nAlternative factor data ready for analysis!\n")
  cat("Note: These are proxy factors, not the official Fama-French factors\n")
  
}, error = function(e) {
  cat("Error creating alternative factors:", e$message, "\n")
  stop("Failed to create alternative factor data")
})

## Downloading market data for factor construction...
## Alternative factors created successfully
## Factor dimensions: 1174 3 
## 
## First 6 rows of alternative factors:
##                  Mkt.RF           SMB           HML
## 2013-01-03 -0.002127478  0.0011485079 -0.0028023782
## 2013-01-04  0.004813617  0.0026250267 -0.0011508874
## 2013-01-07 -0.003167686 -0.0006898157  0.0077935416
## 2013-01-08 -0.003287322  0.0019908831  0.0003525420
## 2013-01-09  0.002612663  0.0028316156  0.0006464387
## 2013-01-10  0.007529018 -0.0056036500  0.0085753249
## 
## Last 6 rows of alternative factors:
##                   Mkt.RF          SMB           HML
## 2017-08-23 -0.0034992527 0.0021462792 -0.0064435812
## 2017-08-24 -0.0021162993 0.0050945404  0.0074120136
## 2017-08-25  0.0016317891 0.0009235993  0.0019257741
## 2017-08-28  0.0004472709 0.0029772411  0.0067082015
## 2017-08-29  0.0008027842 0.0002060658 -0.0047858524
## 2017-08-30  0.0045648492 0.0009017110  0.0008339064
## 
## Factor summary statistics:
##      Mkt.RF                SMB                  HML            
##  Min.   :-0.0402511   Min.   :-1.629e-02   Min.   :-1.405e-02  
##  1st Qu.:-0.0029987   1st Qu.:-3.383e-03   1st Qu.:-3.169e-03  
##  Median : 0.0003806   Median : 3.938e-05   Median : 9.928e-05  
##  Mean   : 0.0004025   Mean   :-4.557e-05   Mean   : 1.112e-04  
##  3rd Qu.: 0.0046251   3rd Qu.: 3.103e-03   3rd Qu.: 3.299e-03  
##  Max.   : 0.0382516   Max.   : 2.569e-02   Max.   : 1.621e-02  
## 
## Alternative factor data ready for analysis!
## Note: These are proxy factors, not the official Fama-French factors

K <- 3
alpha <- colMeans(X)
X_ <- X - matrix(alpha, T, N, byrow = TRUE)
Sigma_prev <- matrix(0, N, N)
Sigma <- (1/(T-1)) * t(X_) %*% X_
eigSigma <- eigen(Sigma)
while (norm(Sigma - Sigma_prev, "F")/norm(Sigma, "F") > 1e-3) {
  B <- eigSigma$vectors[, 1:K, drop = FALSE] %*% diag(sqrt(eigSigma$values[1:K]), K, K)
  Psi <- diag(diag(Sigma - B %*% t(B)))
  Sigma_prev <- Sigma
  Sigma <- B %*% t(B) + Psi
  eigSigma <- eigen(Sigma - Psi)
}
cbind(alpha, B)

##             alpha                                        
## SPY  0.0006831797 -0.002764581 -0.003632401 -1.163576e-03
## XIVH 0.0033271831 -0.023771161  0.002506272  2.260428e-05
## SPHB 0.0007061831 -0.004793759 -0.007496172  3.402997e-03
## SPLV 0.0005187473 -0.001244995 -0.002399070 -3.741520e-03
## USMV 0.0005121483 -0.001296234 -0.002229389 -3.175177e-03
## JKD  0.0007123943 -0.002694075 -0.002866826 -1.803859e-03

#>              alpha                                        
#> AAPL  0.0007074564 0.0002732114 -0.004631647 -0.0044814226
#> AMD   0.0013722468 0.0045782146 -0.035202146  0.0114549515
#> ADI   0.0006533116 0.0004151904 -0.007379066 -0.0053058139
#> ABBV  0.0007787929 0.0017513359 -0.003967816 -0.0056000810
#> AEZS -0.0041576357 0.0769496344  0.002935950  0.0006249473
#> A     0.0006902482 0.0012690079 -0.005680162 -0.0061507654
#> APD   0.0006236565 0.0005442926 -0.004229364 -0.0057976394
#> AA    0.0006277163 0.0027405024 -0.009796620 -0.0149177957
#> CF   -0.0000573028 0.0023108605 -0.007409061 -0.0153425661

# Your approach was already working well
statistical_factor_model <- function(X, K) {
  # Step 1: Get factors via PCA
  pca <- prcomp(X, center = TRUE, scale. = FALSE)
  F <- pca$x[, 1:K]  # First K principal components
  
  # Step 2: Estimate loadings (your working code)
  F_ <- cbind(ones = 1, F)
  Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
  colnames(Gamma) <- c("alpha", paste0("factor", 1:K))
  
  return(list(
    alpha = Gamma[, 1],
    beta = Gamma[, -1],
    factors = F,
    loadings = Gamma
  ))
}

# Use it
K <- 3
factor_model <- statistical_factor_model(X, K)
cbind(alpha = factor_model$alpha, beta = factor_model$beta)

##             alpha    factor1    factor2      factor3
## SPY  0.0006831797 0.11228681  0.3734384 -0.183360963
## XIVH 0.0033271831 0.96549450 -0.2576638  0.003562074
## SPHB 0.0007061831 0.19470434  0.7706634  0.536257944
## SPLV 0.0005187473 0.05056699  0.2466426 -0.589603864
## USMV 0.0005121483 0.05264810  0.2291981 -0.500357262
## JKD  0.0007123943 0.10942313  0.2947315 -0.284259356