knitr::opts_chunk$set(echo = TRUE)

Problem 1

Repeat the cross-sectional stock return predictability discussed in the class with the following

rm(list=ls())
rm(list = ls())
library(data.table)
source("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/Database/some_useful_functions.R")
ff5f <- fread("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/Database/ff5f_monthly.csv")
dat = setDT( readRDS("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/HW/HW2/DATA_for_Monthly_rebalancing.RDS") )
  1. At the end of each portfolio formation month, exclude the stocks with market capitalization less than the 20% quantile.
d = sort( unique(dat$eom) )  # take out the each value of eom and sort
result_dat <- dat[,.(mktcap_lag20 = quantile(mktcap_lag, 0.2)), keyby = eom]  # calculate the 20th percentile for each market_lag
dat <- merge(dat, result_dat, by = "eom")

Exclude the stocks with mktcap_lag less than the 20% quantile

dat80 <- dat[dat$mktcap_lag >= dat$mktcap_lag20,]  
rm(dat)
  1. Transform each predictor into ranked variable (at the end of each month) by using the function frank.
dat80[,.N, by = "accrual_lag"]  # Confirm the quantity for subsequent proofreading
pred_name <- names(dat80)[10:39]
col <- pred_name
dat80[, (paste0(col, "_rank")) := lapply(.SD, function(x) frank(x, ties.method = "min")),
      by = "eom",  .SDcols = pred_name]
head(dat80)
dat80[,.N, by = "accrual_lag"]  # Confirm the quantity, it should not change

Just leave the sorted data, reducing the burden of running data backwards.

names(dat80)
##  [1] "eom"                     "permno"                 
##  [3] "ret_adj"                 "mktcap"                 
##  [5] "mktcap_lag"              "exchange"               
##  [7] "industry"                "altprc"                 
##  [9] "shrout"                  "accrual_lag"            
## [11] "aftret_invcapx_lag"      "at_turn_lag"            
## [13] "bm_lag"                  "CAPEI_lag"              
## [15] "cash_debt_lag"           "cash_lt_lag"            
## [17] "de_ratio_lag"            "debt_at_lag"            
## [19] "debt_ebitda_lag"         "equity_invcap_lag"      
## [21] "evm_lag"                 "gpm_lag"                
## [23] "GProf_lag"               "lt_debt_lag"            
## [25] "lt_ppent_lag"            "npm_lag"                
## [27] "pay_turn_lag"            "pcf_lag"                
## [29] "pe_exi_lag"              "ps_lag"                 
## [31] "ptb_lag"                 "quick_ratio_lag"        
## [33] "rect_act_lag"            "rect_turn_lag"          
## [35] "roa_lag"                 "roe_lag"                
## [37] "sale_equity_lag"         "sale_nwc_lag"           
## [39] "short_debt_lag"          "mktcap_lag20"           
## [41] "accrual_lag_rank"        "aftret_invcapx_lag_rank"
## [43] "at_turn_lag_rank"        "bm_lag_rank"            
## [45] "CAPEI_lag_rank"          "cash_debt_lag_rank"     
## [47] "cash_lt_lag_rank"        "de_ratio_lag_rank"      
## [49] "debt_at_lag_rank"        "debt_ebitda_lag_rank"   
## [51] "equity_invcap_lag_rank"  "evm_lag_rank"           
## [53] "gpm_lag_rank"            "GProf_lag_rank"         
## [55] "lt_debt_lag_rank"        "lt_ppent_lag_rank"      
## [57] "npm_lag_rank"            "pay_turn_lag_rank"      
## [59] "pcf_lag_rank"            "pe_exi_lag_rank"        
## [61] "ps_lag_rank"             "ptb_lag_rank"           
## [63] "quick_ratio_lag_rank"    "rect_act_lag_rank"      
## [65] "rect_turn_lag_rank"      "roa_lag_rank"           
## [67] "roe_lag_rank"            "sale_equity_lag_rank"   
## [69] "sale_nwc_lag_rank"       "short_debt_lag_rank"
dat80[, c("accrual_lag", "aftret_invcapx_lag", "at_turn_lag",
           "bm_lag", "CAPEI_lag", "cash_debt_lag", "cash_lt_lag",
           "de_ratio_lag", "debt_at_lag", "debt_ebitda_lag",
           "equity_invcap_lag", "evm_lag", "gpm_lag", "GProf_lag",
           "lt_debt_lag", "lt_ppent_lag", "npm_lag", "pay_turn_lag",
           "pcf_lag", "pe_exi_lag", "ps_lag", "ptb_lag", "quick_ratio_lag",
           "rect_act_lag", "rect_turn_lag", "roa_lag", "roe_lag",
           "sale_equity_lag", "sale_nwc_lag", "short_debt_lag"):=NULL]
  1. Use the previous 30 years data for training the linear regression. The dependent variable is ret_adj, use all of the 30 predictors(accrual_lag_rank, …… , short_debt_lag), and re-estimate the linear model after 12 months.
#-------------------------------------------------------------------#
# Use 30 years of data to build linear model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
## Training data: 1971-07-31 ~ 2001-06-30 
##    Prediction: 2001-07-31 ~ 2002-06-30 
## Training data: 1972-07-31 ~ 2002-06-30 
##    Prediction: 2002-07-31 ~ 2003-06-30 
## Training data: 1973-07-31 ~ 2003-06-30 
##    Prediction: 2003-07-31 ~ 2004-06-30 
## Training data: 1974-07-31 ~ 2004-06-30 
##    Prediction: 2004-07-31 ~ 2005-06-30 
## Training data: 1975-07-31 ~ 2005-06-30 
##    Prediction: 2005-07-31 ~ 2006-06-30 
## Training data: 1976-07-31 ~ 2006-06-30 
##    Prediction: 2006-07-31 ~ 2007-06-30 
## Training data: 1977-07-31 ~ 2007-06-30 
##    Prediction: 2007-07-31 ~ 2008-06-30 
## Training data: 1978-07-31 ~ 2008-06-30 
##    Prediction: 2008-07-31 ~ 2009-06-30 
## Training data: 1979-07-31 ~ 2009-06-30 
##    Prediction: 2009-07-31 ~ 2010-06-30 
## Training data: 1980-07-31 ~ 2010-06-30 
##    Prediction: 2010-07-31 ~ 2011-06-30 
## Training data: 1981-07-31 ~ 2011-06-30 
##    Prediction: 2011-07-31 ~ 2012-06-30 
## Training data: 1982-07-31 ~ 2012-06-30 
##    Prediction: 2012-07-31 ~ 2013-06-30 
## Training data: 1983-07-31 ~ 2013-06-30 
##    Prediction: 2013-07-31 ~ 2014-06-30 
## Training data: 1984-07-31 ~ 2014-06-30 
##    Prediction: 2014-07-31 ~ 2015-06-30 
## Training data: 1985-07-31 ~ 2015-06-30 
##    Prediction: 2015-07-31 ~ 2016-06-30 
## Training data: 1986-07-31 ~ 2016-06-30 
##    Prediction: 2016-07-31 ~ 2017-06-30 
## Training data: 1987-07-31 ~ 2017-06-30 
##    Prediction: 2017-07-31 ~ 2018-06-30 
## Training data: 1988-07-31 ~ 2018-06-30 
##    Prediction: 2018-07-31 ~ 2019-06-30 
## Training data: 1989-07-31 ~ 2019-06-30 
##    Prediction: 2019-07-31 ~ 2020-06-30 
## Training data: 1990-07-31 ~ 2020-06-30 
##    Prediction: 2020-07-31 ~ 2021-06-30 
## Training data: 1991-07-31 ~ 2021-06-30 
##    Prediction: 2021-07-31 ~ 2022-06-30
names(dat80)
##  [1] "eom"                     "permno"                 
##  [3] "ret_adj"                 "mktcap"                 
##  [5] "mktcap_lag"              "exchange"               
##  [7] "industry"                "altprc"                 
##  [9] "shrout"                  "mktcap_lag20"           
## [11] "accrual_lag_rank"        "aftret_invcapx_lag_rank"
## [13] "at_turn_lag_rank"        "bm_lag_rank"            
## [15] "CAPEI_lag_rank"          "cash_debt_lag_rank"     
## [17] "cash_lt_lag_rank"        "de_ratio_lag_rank"      
## [19] "debt_at_lag_rank"        "debt_ebitda_lag_rank"   
## [21] "equity_invcap_lag_rank"  "evm_lag_rank"           
## [23] "gpm_lag_rank"            "GProf_lag_rank"         
## [25] "lt_debt_lag_rank"        "lt_ppent_lag_rank"      
## [27] "npm_lag_rank"            "pay_turn_lag_rank"      
## [29] "pcf_lag_rank"            "pe_exi_lag_rank"        
## [31] "ps_lag_rank"             "ptb_lag_rank"           
## [33] "quick_ratio_lag_rank"    "rect_act_lag_rank"      
## [35] "rect_turn_lag_rank"      "roa_lag_rank"           
## [37] "roe_lag_rank"            "sale_equity_lag_rank"   
## [39] "sale_nwc_lag_rank"       "short_debt_lag_rank"
pred_name <- names(dat80[,11:40])  # take out the predictors
for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                   c("ret_adj",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  lm_mdl = lm(ret_adj ~ ., data=train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i+1 ~ i+12的樣本
        pred := predict(lm_mdl, newdata = x_test )  ] 

  #----- end of prediction task ----#
  
}
  1. Form the quintile sort portfolios using the linear regression model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(pred, na.rm=T)), keyby = eom]
dat_for_portfolio = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, pred)],"pred")
dat_for_portfolio[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]  
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf <- dat_for_portfolio[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(quintile_portf, eom)
setkey(ff5f, eom)
  1. For each portfolio, report the average returns, volatility, and Sharpe ratio.

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf <- merge(quintile_portf, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf[, excess_ret := 100*ret - RF]
quintile_portf <- dcast(quintile_portf,eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns, volatility, and Sharpe ratio of each portfolio.

m <- unlist( quintile_portf[, lapply(.SD, mean), .SDcols = 2L:6L] )
s <- unlist( quintile_portf[, lapply(.SD, sd), .SDcols = 2L:6L] )
sharpe_ratio <- sqrt(12)*m/s
m
##         1         2         3         4         5 
## 0.4515270 0.6009793 0.7363267 0.9246373 0.8753206
s
##        1        2        3        4        5 
## 4.780716 4.585551 4.573277 5.064850 5.092912
sharpe_ratio
##         1         2         3         4         5 
## 0.3271759 0.4540028 0.5577425 0.6324052 0.5953764

The sorting based on linear regression prediction does not generate exactly monotonically increasing/decreasing portfolio performance. Because the fifth portfolio performed worse than the fourth portfolio with a lower Sharpe ratio.

Calculate the cumulated returns of long-short portfolios.

ls_portf <- unlist( quintile_portf[,6L] - quintile_portf[,2L] )
ls_portf_wealth = cumprod(1 + ls_portf/100)

library(xts)
ls_portf_wealth = xts(ls_portf_wealth, as.Date(quintile_portf[,eom]) )
plot(ls_portf_wealth)

Calculate the Sharpe ratio of the long-short portfolio constructed with linear regression prediction.

sqrt(12)*mean(ls_portf)/sd(ls_portf) 
## [1] 0.6063449
  1. Construct the following benchmark strategies and compare their Sharpe ratio with the long-short portfolio constructed with linear regression prediction. Market-capitalization weighted portfolio
benchmark_portfolio_B = na.omit(dat80,"pred")
benchmark_portfolio_B[, pred := bm_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_B[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]  # 

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_B <- benchmark_portfolio_B[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_B, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_B <- merge(bm_portf_B, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_B[, excess_ret := 100*ret - RF]
bm_portf_B <- dcast(bm_portf_B, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_B <- unlist( bm_portf_B[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_B <- unlist( bm_portf_B[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_B <- unlist( bm_portf_B[,6L] - bm_portf_B[,2L] )
bm_ls_portf_wealth_B = cumprod(1 + bm_ls_portf_B/100)

library(xts)
bm_ls_portf_wealth_B = xts(bm_ls_portf_wealth_B, as.Date(bm_portf_B[,eom]) )
lines(bm_ls_portf_wealth_B, col = 2)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Book-to-market ratio(bm).

sqrt(12)*mean(bm_ls_portf_B)/sd(bm_ls_portf_B) 
## [1] 0.1111413
benchmark_portfolio_E = na.omit(dat80,"pred")
benchmark_portfolio_E[, pred := roe_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_E[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]   

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_E <- benchmark_portfolio_E[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_E, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_E <- merge(bm_portf_E, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_E[, excess_ret := 100*ret - RF]
bm_portf_E <- dcast(bm_portf_E, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_E <- unlist( bm_portf_E[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_E <- unlist( bm_portf_E[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_E <- unlist( bm_portf_E[,6L] - bm_portf_E[,2L] )
bm_ls_portf_wealth_E = cumprod(1 + bm_ls_portf_E/100)

library(xts)
bm_ls_portf_wealth_E = xts(bm_ls_portf_wealth_E, as.Date(bm_portf_E[,eom]) )
lines(bm_ls_portf_wealth_E, col = 3)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Return on equity(roe).

sqrt(12)*mean(bm_ls_portf_E)/sd(bm_ls_portf_E) 
## [1] 0.0983983
benchmark_portfolio_G = na.omit(dat80,"pred")
benchmark_portfolio_G[, pred := gpm_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_G[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]   

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_G <- benchmark_portfolio_G[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_G, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_G <- merge(bm_portf_G, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_G[, excess_ret := 100*ret - RF]
bm_portf_G <- dcast(bm_portf_G, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_G <- unlist( bm_portf_G[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_G <- unlist( bm_portf_G[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_G <- unlist( bm_portf_G[,6L] - bm_portf_G[,2L] )
bm_ls_portf_wealth_G = cumprod(1 + bm_ls_portf_G/100)

library(xts)
bm_ls_portf_wealth_G = xts(bm_ls_portf_wealth_G, as.Date(bm_portf_G[,eom]) )
lines(bm_ls_portf_wealth_G, col = 4)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Gross-profit margin(gpm).

sqrt(12)*mean(bm_ls_portf_G)/sd(bm_ls_portf_G) 
## [1] -0.032847

All of three benchmark strategies have worse performance on Sharpe ratio.

Problem 2

Transform the ret_adj into binary variable y = 1 if ret_adj is positive and y = 0 if ret_adj is less than or equal to zero.

Repeat Problem 1, but now in the Step 3, use the newly transformed dependent variable as target.

To form the probability predictions, employ three separate statistical models: (1) logistic regression, (2) linear discriminant analysis, (3) naive Bayes classifer. Similar to the Problem 1, construct the quintile portfolio sorts for each of these statistical models, but now use the predicted probabilities as sorting variables.

Transform the ret_adj into binary variable.

dat80[, ret_adj_binary := ifelse(dat80[,ret_adj] > 0, 1, 0)]
#-------------------------------------------------------------------#
# Use 30 years of data to build logistic regression model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors
for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  logisticReg = glm(ret_adj_binary ~ ., data = train_dat,
                    family = binomial(link = "logit"))  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        ProbEstimateLog := predict(logisticReg, newdata = x_test, type = "response")] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the logistic regression model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(ProbEstimateLog, na.rm=T)), keyby = eom]
dat_for_portfolio_Log = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, ProbEstimateLog)],"ProbEstimateLog")
dat_for_portfolio_Log[, portf_group := assign_portfolio(ProbEstimateLog, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_Log <- dat_for_portfolio_Log[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_Log, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_Log <- merge(quintile_portf_Log, ff5f[,.(eom,RF)], all.x=TRUE)   
quintile_portf_Log[, excess_ret := 100*ret - RF]
quintile_portf_Log <- dcast(quintile_portf_Log, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_Log <- unlist( quintile_portf_Log[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_Log <- unlist( quintile_portf_Log[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_Log <- unlist( quintile_portf_Log[,6L] - quintile_portf_Log[,2L] )
ls_portf_wealth_Log = cumprod(1 + ls_portf_Log/100)

library(xts)
ls_portf_wealth_Log = xts(ls_portf_wealth_Log, as.Date(quintile_portf_Log[,eom]) )
plot(ls_portf_wealth_Log)

Calculate the Sharpe ratio of the long-short portfolio constructed with logistic regression prediction.

sqrt(12)*mean(ls_portf_Log)/sd(ls_portf_Log)
## [1] 0.3308841
#-------------------------------------------------------------------#
# Use 30 years of data to build linear discriminant analysis model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#
library(MASS)  # package to estimate LDA model
# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors
for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  lda_fit = lda(ret_adj_binary ~ ., method = "moment", data = train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        lda_pred := predict(lda_fit, newdata = x_test)$posterior[,2]] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the linear discriminant analysis model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(lda_pred, na.rm=T)), keyby = eom]
dat_for_portfolio_LDA = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, lda_pred)],"lda_pred")
dat_for_portfolio_LDA[, portf_group := assign_portfolio(lda_pred, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_LDA <- dat_for_portfolio_LDA[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_LDA, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_LDA <- merge(quintile_portf_LDA, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf_LDA[, excess_ret := 100*ret - RF]
quintile_portf_LDA <- dcast(quintile_portf_LDA, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_LDA <- unlist( quintile_portf_LDA[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_LDA <- unlist( quintile_portf_LDA[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_LDA <- unlist( quintile_portf_LDA[,6L] - quintile_portf_LDA[,2L] )
ls_portf_wealth_LDA = cumprod(1 + ls_portf_LDA/100)

library(xts)
ls_portf_wealth_LDA = xts(ls_portf_wealth_LDA, as.Date(quintile_portf_LDA[,eom]) )
plot(ls_portf_wealth_LDA)

Calculate the Sharpe ratio of the long-short portfolio constructed with linear discriminant analysis prediction.

sqrt(12)*mean(ls_portf_LDA)/sd(ls_portf_LDA)
## [1] 0.3284319
#-------------------------------------------------------------------#
# Use 30 years of data to build linear discriminant analysis model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#
library(e1071)  # package to fit naive Bayes classifier```
# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors
for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  naiveBayes_fit = naiveBayes(ret_adj_binary ~ ., data = train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  result <- predict(naiveBayes_fit, newdata = x_test, type = "raw")
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        naiveBayes_pred := result[,2]] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the naive Bayes classifier model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(naiveBayes_pred, na.rm=T)), keyby = eom]
dat_for_portfolio_NB = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, naiveBayes_pred)],"naiveBayes_pred")
dat_for_portfolio_NB[, portf_group := assign_portfolio(naiveBayes_pred, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_NB <- dat_for_portfolio_NB[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_NB, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_NB <- merge(quintile_portf_NB, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf_NB[, excess_ret := 100*ret - RF]
quintile_portf_NB <- dcast(quintile_portf_NB, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_NB <- unlist( quintile_portf_NB[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_Nb <- unlist( quintile_portf_NB[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_NB <- unlist( quintile_portf_NB[,6L] - quintile_portf_NB[,2L] )
ls_portf_wealth_NB = cumprod(1 + ls_portf_NB/100)

library(xts)
ls_portf_wealth_NB = xts(ls_portf_wealth_NB, as.Date(quintile_portf_NB[,eom]) )
plot(ls_portf_wealth_NB)

Calculate the Sharpe ratio of the long-short portfolio constructed with naive Bayes classifier prediction.

sqrt(12)*mean(ls_portf_NB)/sd(ls_portf_NB)
## [1] 0.2786669

The Sharpe ratio of the logistic regression is the highest among these three classification models.

Problem 3

From the Problems 1 and 2, we have 7 long-short portfolios: Four long-short portfolios are based on the prediction from linear regression, logistic regression, linear discriminant analysis, and naive Bayes classifier; Three long-short portfolios based on simple univariate sort of three firm characteristics bm, roe, and gpm.

For each portfolio, estimate the alpha from the following linear regression:

𝑅portfolio𝑡 = 𝛼 + 𝛽 × MktRf𝑡 + 𝑠 × SMB𝑡 + ℎ × HML𝑡 + 𝜀𝑡

where 𝑅portfolio𝑡 is the long-short portfolio return.

ff5f <- ff5f[457:708,]   # retain the data from "2001-07-31" ~ "2022-06-30"
#---------- Prediction portfolio ----------------#
ls_portf_wealth <- as.data.table(ls_portf_wealth)  # transform xts to data.table
setnames(ls_portf_wealth, names(ls_portf_wealth), c("eom", "R_portfolio"))
ls_portf_wealth <- merge(ls_portf_wealth, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_lin = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth)

Check the significance of alpha.

summary(lm_mdl_lin)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04837 -0.33657 -0.00937  0.27693  1.21309 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.869527   0.031092  60.130   <2e-16 ***
## MktRf        0.014489   0.007388   1.961   0.0510 .  
## SMB         -0.024699   0.012722  -1.941   0.0533 .  
## HML         -0.007802   0.010473  -0.745   0.4570    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4885 on 248 degrees of freedom
## Multiple R-squared:  0.02675,    Adjusted R-squared:  0.01498 
## F-statistic: 2.272 on 3 and 248 DF,  p-value: 0.08075
coefficients(lm_mdl_lin)[1]   # alpha
## (Intercept) 
##    1.869527
summary(lm_mdl_lin)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 60.12973
#---------- Prediction portfolio ----------------#
ls_portf_wealth_Log <- as.data.table(ls_portf_wealth_Log)  # transform xts to data.table
setnames(ls_portf_wealth_Log, names(ls_portf_wealth_Log), c("eom", "R_portfolio"))
ls_portf_wealth_Log <- merge(ls_portf_wealth_Log, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_Log = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_Log)

Check the significance of alpha.

summary(lm_mdl_Log)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_Log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43991 -0.14870 -0.02644  0.13297  0.99333 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.449098   0.013424 107.949  < 2e-16 ***
## MktRf       -0.008908   0.003190  -2.793  0.00563 ** 
## SMB         -0.010912   0.005493  -1.987  0.04807 *  
## HML         -0.002895   0.004522  -0.640  0.52264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2109 on 248 degrees of freedom
## Multiple R-squared:  0.07406,    Adjusted R-squared:  0.06286 
## F-statistic: 6.612 on 3 and 248 DF,  p-value: 0.0002581
coefficients(lm_mdl_Log)[1]   # alpha
## (Intercept) 
##    1.449098
summary(lm_mdl_Log)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 107.9491
#---------- Prediction portfolio ----------------#
ls_portf_wealth_LDA <- as.data.table(ls_portf_wealth_LDA)
setnames(ls_portf_wealth_LDA, names(ls_portf_wealth_LDA), c("eom", "R_portfolio"))
ls_portf_wealth_LDA <- merge(ls_portf_wealth_LDA, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_LDA = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_LDA)

Check the significance of alpha.

summary(lm_mdl_LDA)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_LDA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43221 -0.14758 -0.03006  0.13483  0.98217 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.440947   0.013330 108.095  < 2e-16 ***
## MktRf       -0.008911   0.003167  -2.813  0.00529 ** 
## SMB         -0.010631   0.005454  -1.949  0.05241 .  
## HML         -0.003000   0.004490  -0.668  0.50467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2095 on 248 degrees of freedom
## Multiple R-squared:  0.074,  Adjusted R-squared:  0.0628 
## F-statistic: 6.607 on 3 and 248 DF,  p-value: 0.0002599
coefficients(lm_mdl_LDA)[1]   # alpha
## (Intercept) 
##    1.440947
summary(lm_mdl_LDA)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 108.0953
#---------- Prediction portfolio ----------------#
ls_portf_wealth_NB <- as.data.table(ls_portf_wealth_NB)
setnames(ls_portf_wealth_NB, names(ls_portf_wealth_NB), c("eom", "R_portfolio"))
ls_portf_wealth_NB <- merge(ls_portf_wealth_NB, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_NB = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_NB)

Check the significance of alpha.

summary(lm_mdl_NB)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_NB)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66616 -0.19711 -0.02503  0.17965  0.90459 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.679189   0.018153  92.504  < 2e-16 ***
## MktRf       -0.018235   0.004313  -4.228 3.32e-05 ***
## SMB         -0.002184   0.007428  -0.294    0.769    
## HML         -0.005831   0.006115  -0.954    0.341    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2852 on 248 degrees of freedom
## Multiple R-squared:  0.08747,    Adjusted R-squared:  0.07643 
## F-statistic: 7.924 on 3 and 248 DF,  p-value: 4.562e-05
coefficients(lm_mdl_NB)[1]   # alpha
## (Intercept) 
##    1.679189
summary(lm_mdl_NB)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 92.50352
#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_B <- as.data.table(bm_ls_portf_wealth_B)
setnames(bm_ls_portf_wealth_B, names(bm_ls_portf_wealth_B), c("eom", "R_portfolio"))
bm_ls_portf_wealth_B <- merge(bm_ls_portf_wealth_B, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_B = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_B)

Check the significance of alpha.

summary(bm_lm_mdl_B)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_B)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02479 -0.37155  0.08162  0.41163  0.65984 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.599897   0.028904  55.352   <2e-16 ***
## MktRf        0.007242   0.006868   1.054    0.293    
## SMB         -0.007065   0.011827  -0.597    0.551    
## HML          0.001167   0.009736   0.120    0.905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4541 on 248 degrees of freedom
## Multiple R-squared:  0.004781,   Adjusted R-squared:  -0.007258 
## F-statistic: 0.3971 on 3 and 248 DF,  p-value: 0.7552
coefficients(bm_lm_mdl_B)[1]   # alpha
## (Intercept) 
##    1.599897
summary(bm_lm_mdl_B)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 55.35217
#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_E <- as.data.table(bm_ls_portf_wealth_E)
setnames(bm_ls_portf_wealth_E, names(bm_ls_portf_wealth_E), c("eom", "R_portfolio"))
bm_ls_portf_wealth_E <- merge(bm_ls_portf_wealth_E, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_E = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_E)

Check the significance of alpha.

summary(bm_lm_mdl_E)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_E)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48716 -0.16056 -0.04462  0.11936  1.00329 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.024285   0.014622  70.050  < 2e-16 ***
## MktRf       -0.015166   0.003474  -4.365 1.86e-05 ***
## SMB          0.005742   0.005983   0.960    0.338    
## HML         -0.006503   0.004926  -1.320    0.188    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2297 on 248 degrees of freedom
## Multiple R-squared:  0.08206,    Adjusted R-squared:  0.07096 
## F-statistic:  7.39 on 3 and 248 DF,  p-value: 9.221e-05
coefficients(bm_lm_mdl_E)[1]   # alpha
## (Intercept) 
##    1.024285
summary(bm_lm_mdl_E)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 70.05035
#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_G <- as.data.table(bm_ls_portf_wealth_G)
setnames(bm_ls_portf_wealth_G, names(bm_ls_portf_wealth_G), c("eom", "R_portfolio"))
bm_ls_portf_wealth_G <- merge(bm_ls_portf_wealth_G, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_G = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_G)

Check the significance of alpha.

summary(bm_lm_mdl_G)
## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_G)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18015 -0.10560 -0.04514  0.10464  0.29762 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.652e-01  8.195e-03 105.585   <2e-16 ***
## MktRf       -8.971e-05  1.947e-03  -0.046    0.963    
## SMB          1.977e-03  3.353e-03   0.590    0.556    
## HML         -1.619e-03  2.760e-03  -0.587    0.558    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1288 on 248 degrees of freedom
## Multiple R-squared:  0.002311,   Adjusted R-squared:  -0.009757 
## F-statistic: 0.1915 on 3 and 248 DF,  p-value: 0.9021
coefficients(bm_lm_mdl_G)[1]   # alpha
## (Intercept) 
##   0.8652204
summary(bm_lm_mdl_G)$coefficients[1, "t value"]  # check the t value of alpha
## [1] 105.5852

Every portfoio has statistically significant alpha, and the alpha of the linear regression model is the highest, which we can inference its performance is the best.