knitr::opts_chunk$set(echo = TRUE)

Problem 1

Repeat the cross-sectional stock return predictability discussed in the class with the following

rm(list=ls())
rm(list = ls())
library(data.table)
source("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/Database/some_useful_functions.R")
ff5f <- fread("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/Database/ff5f_monthly.csv")
dat = setDT( readRDS("C:/Users/user/Documents/OneDrive/桌面/機器學習與財務計量/HW/HW2/DATA_for_Monthly_rebalancing.RDS") )

At the end of each portfolio formation month, exclude the stocks with market capitalization less than the 20% quantile.

d = sort( unique(dat$eom) )  # take out the each value of eom and sort
result_dat <- dat[,.(mktcap_lag20 = quantile(mktcap_lag, 0.2)), keyby = eom]  # calculate the 20th percentile for each market_lag
dat <- merge(dat, result_dat, by = "eom")

Exclude the stocks with mktcap_lag less than the 20% quantile

dat80 <- dat[dat$mktcap_lag >= dat$mktcap_lag20,]  
rm(dat)

Transform each predictor into ranked variable (at the end of each month) by using the function frank.

dat80[,.N, by = "accrual_lag"]  # Confirm the quantity for subsequent proofreading

pred_name <- names(dat80)[10:39]
col <- pred_name
dat80[, (paste0(col, "_rank")) := lapply(.SD, function(x) frank(x, ties.method = "min")),
      by = "eom",  .SDcols = pred_name]

head(dat80)
dat80[,.N, by = "accrual_lag"]  # Confirm the quantity, it should not change

Just leave the sorted data, reducing the burden of running data backwards.

names(dat80)

##  [1] "eom"                     "permno"                 
##  [3] "ret_adj"                 "mktcap"                 
##  [5] "mktcap_lag"              "exchange"               
##  [7] "industry"                "altprc"                 
##  [9] "shrout"                  "accrual_lag"            
## [11] "aftret_invcapx_lag"      "at_turn_lag"            
## [13] "bm_lag"                  "CAPEI_lag"              
## [15] "cash_debt_lag"           "cash_lt_lag"            
## [17] "de_ratio_lag"            "debt_at_lag"            
## [19] "debt_ebitda_lag"         "equity_invcap_lag"      
## [21] "evm_lag"                 "gpm_lag"                
## [23] "GProf_lag"               "lt_debt_lag"            
## [25] "lt_ppent_lag"            "npm_lag"                
## [27] "pay_turn_lag"            "pcf_lag"                
## [29] "pe_exi_lag"              "ps_lag"                 
## [31] "ptb_lag"                 "quick_ratio_lag"        
## [33] "rect_act_lag"            "rect_turn_lag"          
## [35] "roa_lag"                 "roe_lag"                
## [37] "sale_equity_lag"         "sale_nwc_lag"           
## [39] "short_debt_lag"          "mktcap_lag20"           
## [41] "accrual_lag_rank"        "aftret_invcapx_lag_rank"
## [43] "at_turn_lag_rank"        "bm_lag_rank"            
## [45] "CAPEI_lag_rank"          "cash_debt_lag_rank"     
## [47] "cash_lt_lag_rank"        "de_ratio_lag_rank"      
## [49] "debt_at_lag_rank"        "debt_ebitda_lag_rank"   
## [51] "equity_invcap_lag_rank"  "evm_lag_rank"           
## [53] "gpm_lag_rank"            "GProf_lag_rank"         
## [55] "lt_debt_lag_rank"        "lt_ppent_lag_rank"      
## [57] "npm_lag_rank"            "pay_turn_lag_rank"      
## [59] "pcf_lag_rank"            "pe_exi_lag_rank"        
## [61] "ps_lag_rank"             "ptb_lag_rank"           
## [63] "quick_ratio_lag_rank"    "rect_act_lag_rank"      
## [65] "rect_turn_lag_rank"      "roa_lag_rank"           
## [67] "roe_lag_rank"            "sale_equity_lag_rank"   
## [69] "sale_nwc_lag_rank"       "short_debt_lag_rank"

dat80[, c("accrual_lag", "aftret_invcapx_lag", "at_turn_lag",
           "bm_lag", "CAPEI_lag", "cash_debt_lag", "cash_lt_lag",
           "de_ratio_lag", "debt_at_lag", "debt_ebitda_lag",
           "equity_invcap_lag", "evm_lag", "gpm_lag", "GProf_lag",
           "lt_debt_lag", "lt_ppent_lag", "npm_lag", "pay_turn_lag",
           "pcf_lag", "pe_exi_lag", "ps_lag", "ptb_lag", "quick_ratio_lag",
           "rect_act_lag", "rect_turn_lag", "roa_lag", "roe_lag",
           "sale_equity_lag", "sale_nwc_lag", "short_debt_lag"):=NULL]

Use the previous 30 years data for training the linear regression. The dependent variable is ret_adj, use all of the 30 predictors(accrual_lag_rank, …… , short_debt_lag), and re-estimate the linear model after 12 months.

#-------------------------------------------------------------------#
# Use 30 years of data to build linear model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}

## Training data: 1971-07-31 ~ 2001-06-30 
##    Prediction: 2001-07-31 ~ 2002-06-30 
## Training data: 1972-07-31 ~ 2002-06-30 
##    Prediction: 2002-07-31 ~ 2003-06-30 
## Training data: 1973-07-31 ~ 2003-06-30 
##    Prediction: 2003-07-31 ~ 2004-06-30 
## Training data: 1974-07-31 ~ 2004-06-30 
##    Prediction: 2004-07-31 ~ 2005-06-30 
## Training data: 1975-07-31 ~ 2005-06-30 
##    Prediction: 2005-07-31 ~ 2006-06-30 
## Training data: 1976-07-31 ~ 2006-06-30 
##    Prediction: 2006-07-31 ~ 2007-06-30 
## Training data: 1977-07-31 ~ 2007-06-30 
##    Prediction: 2007-07-31 ~ 2008-06-30 
## Training data: 1978-07-31 ~ 2008-06-30 
##    Prediction: 2008-07-31 ~ 2009-06-30 
## Training data: 1979-07-31 ~ 2009-06-30 
##    Prediction: 2009-07-31 ~ 2010-06-30 
## Training data: 1980-07-31 ~ 2010-06-30 
##    Prediction: 2010-07-31 ~ 2011-06-30 
## Training data: 1981-07-31 ~ 2011-06-30 
##    Prediction: 2011-07-31 ~ 2012-06-30 
## Training data: 1982-07-31 ~ 2012-06-30 
##    Prediction: 2012-07-31 ~ 2013-06-30 
## Training data: 1983-07-31 ~ 2013-06-30 
##    Prediction: 2013-07-31 ~ 2014-06-30 
## Training data: 1984-07-31 ~ 2014-06-30 
##    Prediction: 2014-07-31 ~ 2015-06-30 
## Training data: 1985-07-31 ~ 2015-06-30 
##    Prediction: 2015-07-31 ~ 2016-06-30 
## Training data: 1986-07-31 ~ 2016-06-30 
##    Prediction: 2016-07-31 ~ 2017-06-30 
## Training data: 1987-07-31 ~ 2017-06-30 
##    Prediction: 2017-07-31 ~ 2018-06-30 
## Training data: 1988-07-31 ~ 2018-06-30 
##    Prediction: 2018-07-31 ~ 2019-06-30 
## Training data: 1989-07-31 ~ 2019-06-30 
##    Prediction: 2019-07-31 ~ 2020-06-30 
## Training data: 1990-07-31 ~ 2020-06-30 
##    Prediction: 2020-07-31 ~ 2021-06-30 
## Training data: 1991-07-31 ~ 2021-06-30 
##    Prediction: 2021-07-31 ~ 2022-06-30

names(dat80)

##  [1] "eom"                     "permno"                 
##  [3] "ret_adj"                 "mktcap"                 
##  [5] "mktcap_lag"              "exchange"               
##  [7] "industry"                "altprc"                 
##  [9] "shrout"                  "mktcap_lag20"           
## [11] "accrual_lag_rank"        "aftret_invcapx_lag_rank"
## [13] "at_turn_lag_rank"        "bm_lag_rank"            
## [15] "CAPEI_lag_rank"          "cash_debt_lag_rank"     
## [17] "cash_lt_lag_rank"        "de_ratio_lag_rank"      
## [19] "debt_at_lag_rank"        "debt_ebitda_lag_rank"   
## [21] "equity_invcap_lag_rank"  "evm_lag_rank"           
## [23] "gpm_lag_rank"            "GProf_lag_rank"         
## [25] "lt_debt_lag_rank"        "lt_ppent_lag_rank"      
## [27] "npm_lag_rank"            "pay_turn_lag_rank"      
## [29] "pcf_lag_rank"            "pe_exi_lag_rank"        
## [31] "ps_lag_rank"             "ptb_lag_rank"           
## [33] "quick_ratio_lag_rank"    "rect_act_lag_rank"      
## [35] "rect_turn_lag_rank"      "roa_lag_rank"           
## [37] "roe_lag_rank"            "sale_equity_lag_rank"   
## [39] "sale_nwc_lag_rank"       "short_debt_lag_rank"

pred_name <- names(dat80[,11:40])  # take out the predictors

for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                   c("ret_adj",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  lm_mdl = lm(ret_adj ~ ., data=train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i+1 ~ i+12的樣本
        pred := predict(lm_mdl, newdata = x_test )  ] 

  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the linear regression model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(pred, na.rm=T)), keyby = eom]

dat_for_portfolio = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, pred)],"pred")
dat_for_portfolio[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]  
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf <- dat_for_portfolio[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(quintile_portf, eom)
setkey(ff5f, eom)

For each portfolio, report the average returns, volatility, and Sharpe ratio.

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf <- merge(quintile_portf, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf[, excess_ret := 100*ret - RF]
quintile_portf <- dcast(quintile_portf,eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns, volatility, and Sharpe ratio of each portfolio.

m <- unlist( quintile_portf[, lapply(.SD, mean), .SDcols = 2L:6L] )
s <- unlist( quintile_portf[, lapply(.SD, sd), .SDcols = 2L:6L] )
sharpe_ratio <- sqrt(12)*m/s
m

##         1         2         3         4         5 
## 0.4515270 0.6009793 0.7363267 0.9246373 0.8753206

##        1        2        3        4        5 
## 4.780716 4.585551 4.573277 5.064850 5.092912

sharpe_ratio

##         1         2         3         4         5 
## 0.3271759 0.4540028 0.5577425 0.6324052 0.5953764

The sorting based on linear regression prediction does not generate exactly monotonically increasing/decreasing portfolio performance. Because the fifth portfolio performed worse than the fourth portfolio with a lower Sharpe ratio.

Calculate the cumulated returns of long-short portfolios.

ls_portf <- unlist( quintile_portf[,6L] - quintile_portf[,2L] )
ls_portf_wealth = cumprod(1 + ls_portf/100)

library(xts)

ls_portf_wealth = xts(ls_portf_wealth, as.Date(quintile_portf[,eom]) )
plot(ls_portf_wealth)

Calculate the Sharpe ratio of the long-short portfolio constructed with linear regression prediction.

sqrt(12)*mean(ls_portf)/sd(ls_portf)

## [1] 0.6063449

Construct the following benchmark strategies and compare their Sharpe ratio with the long-short portfolio constructed with linear regression prediction. Market-capitalization weighted portfolio

Book-to-market ratio(bm)

benchmark_portfolio_B = na.omit(dat80,"pred")
benchmark_portfolio_B[, pred := bm_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_B[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]  #

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_B <- benchmark_portfolio_B[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_B, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_B <- merge(bm_portf_B, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_B[, excess_ret := 100*ret - RF]
bm_portf_B <- dcast(bm_portf_B, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_B <- unlist( bm_portf_B[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_B <- unlist( bm_portf_B[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_B <- unlist( bm_portf_B[,6L] - bm_portf_B[,2L] )
bm_ls_portf_wealth_B = cumprod(1 + bm_ls_portf_B/100)

library(xts)
bm_ls_portf_wealth_B = xts(bm_ls_portf_wealth_B, as.Date(bm_portf_B[,eom]) )
lines(bm_ls_portf_wealth_B, col = 2)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Book-to-market ratio(bm).

sqrt(12)*mean(bm_ls_portf_B)/sd(bm_ls_portf_B)

## [1] 0.1111413

Return on equity(roe)

benchmark_portfolio_E = na.omit(dat80,"pred")
benchmark_portfolio_E[, pred := roe_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_E[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_E <- benchmark_portfolio_E[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_E, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_E <- merge(bm_portf_E, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_E[, excess_ret := 100*ret - RF]
bm_portf_E <- dcast(bm_portf_E, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_E <- unlist( bm_portf_E[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_E <- unlist( bm_portf_E[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_E <- unlist( bm_portf_E[,6L] - bm_portf_E[,2L] )
bm_ls_portf_wealth_E = cumprod(1 + bm_ls_portf_E/100)

library(xts)
bm_ls_portf_wealth_E = xts(bm_ls_portf_wealth_E, as.Date(bm_portf_E[,eom]) )
lines(bm_ls_portf_wealth_E, col = 3)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Return on equity(roe).

sqrt(12)*mean(bm_ls_portf_E)/sd(bm_ls_portf_E)

## [1] 0.0983983

Gross-profit margin(gpm)

benchmark_portfolio_G = na.omit(dat80,"pred")
benchmark_portfolio_G[, pred := gpm_lag_rank]

Divide into five portfolio groups according to each 20th percentile of the predicted value under each eom.

benchmark_portfolio_G[, portf_group := assign_portfolio(pred, seq(0.2,0.8,0.2)), by = eom]

Each portfolio group calculates the rate of return based on the weighted average of the market capitalization of each company.

bm_portf_G <- benchmark_portfolio_G[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                    keyby = .(eom, portf_group)]

setkey(bm_portf_G, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

bm_portf_G <- merge(bm_portf_G, ff5f[,.(eom,RF)], all.x = TRUE)
bm_portf_G[, excess_ret := 100*ret - RF]
bm_portf_G <- dcast(bm_portf_G, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

bm_m_G <- unlist( bm_portf_G[, lapply(.SD,mean), .SDcols = 2L:6L] )
bm_s_G <- unlist( bm_portf_G[, lapply(.SD,sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

bm_ls_portf_G <- unlist( bm_portf_G[,6L] - bm_portf_G[,2L] )
bm_ls_portf_wealth_G = cumprod(1 + bm_ls_portf_G/100)

library(xts)
bm_ls_portf_wealth_G = xts(bm_ls_portf_wealth_G, as.Date(bm_portf_G[,eom]) )
lines(bm_ls_portf_wealth_G, col = 4)

Calculate the Sharpe ratio of the long-short quintile sorted portfolio based on the Gross-profit margin(gpm).

sqrt(12)*mean(bm_ls_portf_G)/sd(bm_ls_portf_G)

## [1] -0.032847

All of three benchmark strategies have worse performance on Sharpe ratio.

Problem 2

Transform the ret_adj into binary variable y = 1 if ret_adj is positive and y = 0 if ret_adj is less than or equal to zero.

Repeat Problem 1, but now in the Step 3, use the newly transformed dependent variable as target.

To form the probability predictions, employ three separate statistical models: (1) logistic regression, (2) linear discriminant analysis, (3) naive Bayes classifer. Similar to the Problem 1, construct the quintile portfolio sorts for each of these statistical models, but now use the predicted probabilities as sorting variables.

Transform the ret_adj into binary variable.

dat80[, ret_adj_binary := ifelse(dat80[,ret_adj] > 0, 1, 0)]

logistic regression

#-------------------------------------------------------------------#
# Use 30 years of data to build logistic regression model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors

for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  logisticReg = glm(ret_adj_binary ~ ., data = train_dat,
                    family = binomial(link = "logit"))  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        ProbEstimateLog := predict(logisticReg, newdata = x_test, type = "response")] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the logistic regression model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(ProbEstimateLog, na.rm=T)), keyby = eom]

dat_for_portfolio_Log = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, ProbEstimateLog)],"ProbEstimateLog")
dat_for_portfolio_Log[, portf_group := assign_portfolio(ProbEstimateLog, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_Log <- dat_for_portfolio_Log[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_Log, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_Log <- merge(quintile_portf_Log, ff5f[,.(eom,RF)], all.x=TRUE)   
quintile_portf_Log[, excess_ret := 100*ret - RF]
quintile_portf_Log <- dcast(quintile_portf_Log, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_Log <- unlist( quintile_portf_Log[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_Log <- unlist( quintile_portf_Log[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_Log <- unlist( quintile_portf_Log[,6L] - quintile_portf_Log[,2L] )
ls_portf_wealth_Log = cumprod(1 + ls_portf_Log/100)

library(xts)
ls_portf_wealth_Log = xts(ls_portf_wealth_Log, as.Date(quintile_portf_Log[,eom]) )
plot(ls_portf_wealth_Log)

Calculate the Sharpe ratio of the long-short portfolio constructed with logistic regression prediction.

sqrt(12)*mean(ls_portf_Log)/sd(ls_portf_Log)

## [1] 0.3308841

linear discriminant analysis

#-------------------------------------------------------------------#
# Use 30 years of data to build linear discriminant analysis model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#
library(MASS)  # package to estimate LDA model

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors

for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  lda_fit = lda(ret_adj_binary ~ ., method = "moment", data = train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        lda_pred := predict(lda_fit, newdata = x_test)$posterior[,2]] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the linear discriminant analysis model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(lda_pred, na.rm=T)), keyby = eom]

dat_for_portfolio_LDA = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, lda_pred)],"lda_pred")
dat_for_portfolio_LDA[, portf_group := assign_portfolio(lda_pred, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_LDA <- dat_for_portfolio_LDA[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_LDA, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_LDA <- merge(quintile_portf_LDA, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf_LDA[, excess_ret := 100*ret - RF]
quintile_portf_LDA <- dcast(quintile_portf_LDA, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_LDA <- unlist( quintile_portf_LDA[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_LDA <- unlist( quintile_portf_LDA[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_LDA <- unlist( quintile_portf_LDA[,6L] - quintile_portf_LDA[,2L] )
ls_portf_wealth_LDA = cumprod(1 + ls_portf_LDA/100)

library(xts)
ls_portf_wealth_LDA = xts(ls_portf_wealth_LDA, as.Date(quintile_portf_LDA[,eom]) )
plot(ls_portf_wealth_LDA)

Calculate the Sharpe ratio of the long-short portfolio constructed with linear discriminant analysis prediction.

sqrt(12)*mean(ls_portf_LDA)/sd(ls_portf_LDA)

## [1] 0.3284319

naive Bayes classifier

#-------------------------------------------------------------------#
# Use 30 years of data to build linear discriminant analysis model and
# predict the next 12 months ahead future cross-sectional return
#-------------------------------------------------------------------#
library(e1071)  # package to fit naive Bayes classifier```

# Checking if the indexing is correct
for(i in seq(360,(length(d)-12),by=12) ) {
  cat( paste0("Training data: ", d[i-359], " ~ ", d[i]), "\n" )
  cat( paste0("   Prediction: ", d[i+1], " ~ ", d[i+12]), "\n" )
}
names(dat80)
pred_name <- names(dat80[,11:40])  # take out the predictors

for(i in seq(360,(length(d)-12),by=12) ) {
  train_dat = dat80[ eom %between% c(d[i-359], d[i]),
                     c("ret_adj_binary",pred_name), with = FALSE]
  
  #----- where the magical prediction happens ----#
  
  naiveBayes_fit = naiveBayes(ret_adj_binary ~ ., data = train_dat)  # model estimation
  
  x_test = dat80[ eom %between% c(d[i+1], d[i+12]), pred_name, with=F ] 
  
  result <- predict(naiveBayes_fit, newdata = x_test, type = "raw")
  
  dat80[eom %between% c(d[i+1], d[i+12]),   # 呼叫i + 1 ~ i + 12的樣本
        naiveBayes_pred := result[,2]] 
  
  #----- end of prediction task ----#
  
}

Form the quintile sort portfolios using the naive Bayes classifier model prediction. Use the mkt_cap_lag as the portfolio weight.

Check the maximum and minimum values of the predicted values.

Range = dat80[, as.list(range(naiveBayes_pred, na.rm=T)), keyby = eom]

dat_for_portfolio_NB = na.omit(dat80[,.(eom, ret_adj, mktcap_lag, naiveBayes_pred)],"naiveBayes_pred")
dat_for_portfolio_NB[, portf_group := assign_portfolio(naiveBayes_pred, seq(0.2,0.8,0.2)), by = eom]
# group by 20th percentile

Calculate the weighted average rate of return for each portfolio by the weight of mkt_cap_lag.

quintile_portf_NB <- dat_for_portfolio_NB[, .(ret = weighted.mean(ret_adj, w = mktcap_lag, na.rm = T)), 
                                            keyby = .(eom, portf_group)]

setkey(quintile_portf_NB, eom)
setkey(ff5f, eom)

Import the eom and RF(risk-free rate) of ff5f to calculate the excess return of each portfolio.

quintile_portf_NB <- merge(quintile_portf_NB, ff5f[,.(eom,RF)], all.x = TRUE)   
quintile_portf_NB[, excess_ret := 100*ret - RF]
quintile_portf_NB <- dcast(quintile_portf_NB, eom ~ portf_group, value.var = "excess_ret")

Calculate the average returns and volatility of each portfolio.

m_NB <- unlist( quintile_portf_NB[, lapply(.SD, mean), .SDcols = 2L:6L] )
s_Nb <- unlist( quintile_portf_NB[, lapply(.SD, sd), .SDcols = 2L:6L] )

Calculate the cumulated returns of long-short portfolios.

ls_portf_NB <- unlist( quintile_portf_NB[,6L] - quintile_portf_NB[,2L] )
ls_portf_wealth_NB = cumprod(1 + ls_portf_NB/100)

library(xts)
ls_portf_wealth_NB = xts(ls_portf_wealth_NB, as.Date(quintile_portf_NB[,eom]) )
plot(ls_portf_wealth_NB)

Calculate the Sharpe ratio of the long-short portfolio constructed with naive Bayes classifier prediction.

sqrt(12)*mean(ls_portf_NB)/sd(ls_portf_NB)

## [1] 0.2786669

The Sharpe ratio of the logistic regression is the highest among these three classification models.

Problem 3

From the Problems 1 and 2, we have 7 long-short portfolios: Four long-short portfolios are based on the prediction from linear regression, logistic regression, linear discriminant analysis, and naive Bayes classifier; Three long-short portfolios based on simple univariate sort of three firm characteristics bm, roe, and gpm.

For each portfolio, estimate the alpha from the following linear regression:

𝑅portfolio𝑡 = 𝛼 + 𝛽 × MktRf𝑡 + 𝑠 × SMB𝑡 + ℎ × HML𝑡 + 𝜀𝑡

where 𝑅portfolio𝑡 is the long-short portfolio return.

ff5f <- ff5f[457:708,]   # retain the data from "2001-07-31" ~ "2022-06-30"

linear regression

#---------- Prediction portfolio ----------------#
ls_portf_wealth <- as.data.table(ls_portf_wealth)  # transform xts to data.table
setnames(ls_portf_wealth, names(ls_portf_wealth), c("eom", "R_portfolio"))
ls_portf_wealth <- merge(ls_portf_wealth, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_lin = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth)

Check the significance of alpha.

summary(lm_mdl_lin)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04837 -0.33657 -0.00937  0.27693  1.21309 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.869527   0.031092  60.130   <2e-16 ***
## MktRf        0.014489   0.007388   1.961   0.0510 .  
## SMB         -0.024699   0.012722  -1.941   0.0533 .  
## HML         -0.007802   0.010473  -0.745   0.4570    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4885 on 248 degrees of freedom
## Multiple R-squared:  0.02675,    Adjusted R-squared:  0.01498 
## F-statistic: 2.272 on 3 and 248 DF,  p-value: 0.08075

coefficients(lm_mdl_lin)[1]   # alpha

## (Intercept) 
##    1.869527

summary(lm_mdl_lin)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 60.12973

logistic regression

#---------- Prediction portfolio ----------------#
ls_portf_wealth_Log <- as.data.table(ls_portf_wealth_Log)  # transform xts to data.table
setnames(ls_portf_wealth_Log, names(ls_portf_wealth_Log), c("eom", "R_portfolio"))
ls_portf_wealth_Log <- merge(ls_portf_wealth_Log, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_Log = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_Log)

Check the significance of alpha.

summary(lm_mdl_Log)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_Log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43991 -0.14870 -0.02644  0.13297  0.99333 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.449098   0.013424 107.949  < 2e-16 ***
## MktRf       -0.008908   0.003190  -2.793  0.00563 ** 
## SMB         -0.010912   0.005493  -1.987  0.04807 *  
## HML         -0.002895   0.004522  -0.640  0.52264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2109 on 248 degrees of freedom
## Multiple R-squared:  0.07406,    Adjusted R-squared:  0.06286 
## F-statistic: 6.612 on 3 and 248 DF,  p-value: 0.0002581

coefficients(lm_mdl_Log)[1]   # alpha

## (Intercept) 
##    1.449098

summary(lm_mdl_Log)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 107.9491

linear discriminant analysis

#---------- Prediction portfolio ----------------#
ls_portf_wealth_LDA <- as.data.table(ls_portf_wealth_LDA)
setnames(ls_portf_wealth_LDA, names(ls_portf_wealth_LDA), c("eom", "R_portfolio"))
ls_portf_wealth_LDA <- merge(ls_portf_wealth_LDA, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_LDA = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_LDA)

Check the significance of alpha.

summary(lm_mdl_LDA)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_LDA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43221 -0.14758 -0.03006  0.13483  0.98217 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.440947   0.013330 108.095  < 2e-16 ***
## MktRf       -0.008911   0.003167  -2.813  0.00529 ** 
## SMB         -0.010631   0.005454  -1.949  0.05241 .  
## HML         -0.003000   0.004490  -0.668  0.50467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2095 on 248 degrees of freedom
## Multiple R-squared:  0.074,  Adjusted R-squared:  0.0628 
## F-statistic: 6.607 on 3 and 248 DF,  p-value: 0.0002599

coefficients(lm_mdl_LDA)[1]   # alpha

## (Intercept) 
##    1.440947

summary(lm_mdl_LDA)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 108.0953

naive Bayes clsaaifier

#---------- Prediction portfolio ----------------#
ls_portf_wealth_NB <- as.data.table(ls_portf_wealth_NB)
setnames(ls_portf_wealth_NB, names(ls_portf_wealth_NB), c("eom", "R_portfolio"))
ls_portf_wealth_NB <- merge(ls_portf_wealth_NB, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

lm_mdl_NB = lm(R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_NB)

Check the significance of alpha.

summary(lm_mdl_NB)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = ls_portf_wealth_NB)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66616 -0.19711 -0.02503  0.17965  0.90459 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.679189   0.018153  92.504  < 2e-16 ***
## MktRf       -0.018235   0.004313  -4.228 3.32e-05 ***
## SMB         -0.002184   0.007428  -0.294    0.769    
## HML         -0.005831   0.006115  -0.954    0.341    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2852 on 248 degrees of freedom
## Multiple R-squared:  0.08747,    Adjusted R-squared:  0.07643 
## F-statistic: 7.924 on 3 and 248 DF,  p-value: 4.562e-05

coefficients(lm_mdl_NB)[1]   # alpha

## (Intercept) 
##    1.679189

summary(lm_mdl_NB)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 92.50352

Book-to-market ratio(bm)

#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_B <- as.data.table(bm_ls_portf_wealth_B)
setnames(bm_ls_portf_wealth_B, names(bm_ls_portf_wealth_B), c("eom", "R_portfolio"))
bm_ls_portf_wealth_B <- merge(bm_ls_portf_wealth_B, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_B = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_B)

Check the significance of alpha.

summary(bm_lm_mdl_B)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_B)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02479 -0.37155  0.08162  0.41163  0.65984 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.599897   0.028904  55.352   <2e-16 ***
## MktRf        0.007242   0.006868   1.054    0.293    
## SMB         -0.007065   0.011827  -0.597    0.551    
## HML          0.001167   0.009736   0.120    0.905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4541 on 248 degrees of freedom
## Multiple R-squared:  0.004781,   Adjusted R-squared:  -0.007258 
## F-statistic: 0.3971 on 3 and 248 DF,  p-value: 0.7552

coefficients(bm_lm_mdl_B)[1]   # alpha

## (Intercept) 
##    1.599897

summary(bm_lm_mdl_B)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 55.35217

Return on Equity(roe)

#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_E <- as.data.table(bm_ls_portf_wealth_E)
setnames(bm_ls_portf_wealth_E, names(bm_ls_portf_wealth_E), c("eom", "R_portfolio"))
bm_ls_portf_wealth_E <- merge(bm_ls_portf_wealth_E, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_E = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_E)

Check the significance of alpha.

summary(bm_lm_mdl_E)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_E)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48716 -0.16056 -0.04462  0.11936  1.00329 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.024285   0.014622  70.050  < 2e-16 ***
## MktRf       -0.015166   0.003474  -4.365 1.86e-05 ***
## SMB          0.005742   0.005983   0.960    0.338    
## HML         -0.006503   0.004926  -1.320    0.188    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2297 on 248 degrees of freedom
## Multiple R-squared:  0.08206,    Adjusted R-squared:  0.07096 
## F-statistic:  7.39 on 3 and 248 DF,  p-value: 9.221e-05

coefficients(bm_lm_mdl_E)[1]   # alpha

## (Intercept) 
##    1.024285

summary(bm_lm_mdl_E)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 70.05035

Gross-profit margin(gpm)

#---------- Benchmark portfolio ----------------#
bm_ls_portf_wealth_G <- as.data.table(bm_ls_portf_wealth_G)
setnames(bm_ls_portf_wealth_G, names(bm_ls_portf_wealth_G), c("eom", "R_portfolio"))
bm_ls_portf_wealth_G <- merge(bm_ls_portf_wealth_G, ff5f[, .(eom, MktRf, SMB, HML)], by = "eom", all.x = TRUE)

Run the linear regression model.

bm_lm_mdl_G = lm(R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_G)

Check the significance of alpha.

summary(bm_lm_mdl_G)

## 
## Call:
## lm(formula = R_portfolio ~ MktRf + SMB + HML, data = bm_ls_portf_wealth_G)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18015 -0.10560 -0.04514  0.10464  0.29762 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.652e-01  8.195e-03 105.585   <2e-16 ***
## MktRf       -8.971e-05  1.947e-03  -0.046    0.963    
## SMB          1.977e-03  3.353e-03   0.590    0.556    
## HML         -1.619e-03  2.760e-03  -0.587    0.558    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1288 on 248 degrees of freedom
## Multiple R-squared:  0.002311,   Adjusted R-squared:  -0.009757 
## F-statistic: 0.1915 on 3 and 248 DF,  p-value: 0.9021

coefficients(bm_lm_mdl_G)[1]   # alpha

## (Intercept) 
##   0.8652204

summary(bm_lm_mdl_G)$coefficients[1, "t value"]  # check the t value of alpha

## [1] 105.5852

Every portfoio has statistically significant alpha, and the alpha of the linear regression model is the highest, which we can inference its performance is the best.

Machine Learning and Financial Econometrics HW2

108205065__Chen_Shuo_Chuan

2023-04-19

Problem 1

All of three benchmark strategies have worse performance on Sharpe ratio.

Problem 2

The Sharpe ratio of the logistic regression is the highest among these three classification models.

Problem 3

Every portfoio has statistically significant alpha, and the alpha of the linear regression model is the highest, which we can inference its performance is the best.