library(readr)
library(knitr)
raw_crsp <- read_csv('crsp_ticker.csv')
raw_ff <- read_csv('FFdata.csv')

Data Cleaning 1 - CRSP Dataset

library(dplyr)
raw_crsp <- raw_crsp %>% group_by(date) %>% arrange(date) %>% ungroup()
raw_crsp <- raw_crsp[raw_crsp['date']>'1995-01-01',]
head(raw_crsp) %>% kable()
PERMNO date SHRCD TICKER COMNAM PERMCO DIVAMT PRC RET SHROUT RETX
10001 1995-01-31 11 EWST ENERGY WEST INC 7953 NA -7.7500 -0.031250 2224 -0.031250
10002 1995-01-31 11 SABC SOUTH ALABAMA BANCORPORATION INC 7954 NA -13.1250 0.000000 2999 0.000000
10003 1995-01-31 11 GCBK GREAT COUNTRY BK ASONIA CT 7957 NA 2.1250 -0.055556 5038 -0.055556
10009 1995-01-31 11 IROQ IROQUOIS BANCORP INC 7965 NA 17.0000 0.007407 1164 0.007407
10010 1995-01-31 11 CBOT CABOT MEDICAL CORP 7967 NA 5.2500 0.050000 10359 0.050000
10011 1995-01-31 10 ATCE A T C ENVIRONMENTAL INC 9813 NA 14.5625 -0.103846 5721 -0.103846
crsp <- raw_crsp[,c(1,2,4,8,9)]
crsp <- crsp %>% group_by(PERMNO) %>% arrange(PERMNO) %>% ungroup()
kable(head(crsp))
PERMNO date TICKER PRC RET
10001 1995-01-31 EWST -7.75000 -0.031250
10001 1995-02-28 EWST 7.54688 -0.026210
10001 1995-03-31 EWST 7.50000 0.006377
10001 1995-04-28 EWST 7.50000 0.000000
10001 1995-05-31 EWST -7.87500 0.050000
10001 1995-06-30 EWST 8.25000 0.060317
# making negative stock price to positive
crsp$PRC <- abs(crsp$PRC)

# make the return column to numeric value
crsp$RET <- as.double(crsp$RET)
# How many values are duplicated?
cat("There are", sum(duplicated(crsp)), "duplicated values\n") 
## There are 4734 duplicated values
# get rid of duplicated observations
crsp <- crsp %>% arrange(PERMNO,date) %>% group_by(PERMNO,date) %>% slice_head(n=1)
crsp <- crsp %>% ungroup()
# How many values are duplicated after the data cleaning?
cat("There are", sum(duplicated(crsp)), "duplicated values\n") 
## There are 0 duplicated values
library(lubridate)

# celing_date makes the date to the start of the next month. We subtract 1day to make it month end
crsp$date <- ceiling_date(crsp$date, unit='months')-days(1)
head(crsp$date)
## [1] "1995-01-31" "1995-02-28" "1995-03-31" "1995-04-30" "1995-05-31"
## [6] "1995-06-30"

Data Cleaning 2 - Fama French Dataset

kable(head(raw_ff))
Date Mkt-RF SMB HML RF
192607 2.96 -2.56 -2.43 0.22
192608 2.64 -1.17 3.82 0.25
192609 0.36 -1.40 0.13 0.23
192610 -3.24 -0.09 0.70 0.32
192611 2.53 -0.10 -0.51 0.31
192612 2.62 -0.03 -0.05 0.28
# code for chaning the figures into monthend dateformat
raw_ff$Date <-ymd(paste0(raw_ff$Date, "01"))+months(1)-days(1)

# divide it by 100
raw_ff[,c(2,3,4,5)] <- raw_ff[,c(2,3,4,5)]/100

kable(head(raw_ff))
Date Mkt-RF SMB HML RF
1926-07-31 0.0296 -0.0256 -0.0243 0.0022
1926-08-31 0.0264 -0.0117 0.0382 0.0025
1926-09-30 0.0036 -0.0140 0.0013 0.0023
1926-10-31 -0.0324 -0.0009 0.0070 0.0032
1926-11-30 0.0253 -0.0010 -0.0051 0.0031
1926-12-31 0.0262 -0.0003 -0.0005 0.0028
# extract fromm 2000-01-31 to 2023-06-30
ff <- raw_ff[raw_ff['Date']>'1995-01-01'&raw_ff['Date']<'2023-07-01' ,]

# extract date, market premium, and risk-free rate
ff <- ff[,c(1,2,5)]

kable(head(ff))
Date Mkt-RF RF
1995-01-31 0.0180 0.0042
1995-02-28 0.0363 0.0040
1995-03-31 0.0219 0.0046
1995-04-30 0.0211 0.0044
1995-05-31 0.0290 0.0054
1995-06-30 0.0272 0.0047
# change the column names for the ff data
colnames(ff) <- c('date','MKTRF','RF')
colnames(ff)
## [1] "date"  "MKTRF" "RF"
# group CRSP dataset by date for merging purpose
crsp <- crsp%>%group_by(date)

# merge the dataset by date
df <- merge(crsp,ff,by='date')

# see the result
kable(df%>%group_by(date)%>%head(n=5))
date PERMNO TICKER PRC RET MKTRF RF
1995-01-31 10001 EWST 7.7500 -0.031250 0.018 0.0042
1995-01-31 87012 WELS 4.7500 0.000000 0.018 0.0042
1995-01-31 76680 UFG 2.0000 -0.030303 0.018 0.0042
1995-01-31 43692 PLR 8.7500 0.044776 0.018 0.0042
1995-01-31 66376 WSO 16.0625 -0.029925 0.018 0.0042
# make a new column for return premium
df$'RETRF' <- (df$RET-df$RF)

# delete all the Na values
df <- df %>% na.omit()

# this is your final dataset
kable(df%>%group_by(date)%>%tail(n=5))
date PERMNO TICKER PRC RET MKTRF RF RETRF
2023-06-30 14862 CLRB 1.95 0.250000 0.0646 0.004 0.246000
2023-06-30 13628 WDAY 225.89 0.065569 0.0646 0.004 0.061569
2023-06-30 22517 PPL 26.46 0.019084 0.0646 0.004 0.015084
2023-06-30 75912 PTC 142.30 0.058780 0.0646 0.004 0.054780
2023-06-30 93436 TSLA 261.77 0.283627 0.0646 0.004 0.279627

Regression Example - TESLA

# extract Tesla data
tesla <- df[df['TICKER']=='TSLA',]
kable(tesla %>% head())
date PERMNO TICKER PRC RET MKTRF RF RETRF
1091726 2010-07-31 93436 TSLA 19.940 -0.163240 0.0693 1e-04 -0.163340
1096546 2010-08-31 93436 TSLA 19.480 -0.023069 -0.0477 1e-04 -0.023169
1100878 2010-09-30 93436 TSLA 20.405 0.047485 0.0954 1e-04 0.047385
1105988 2010-10-31 93436 TSLA 21.840 0.070326 0.0388 1e-04 0.070226
1110213 2010-11-30 93436 TSLA 35.330 0.617674 0.0060 1e-04 0.617574
1114143 2010-12-31 93436 TSLA 26.630 -0.246250 0.0682 1e-04 -0.246350
tesla_reg <- lm(data=tesla, RETRF~MKTRF)
summary(tesla_reg)
## 
## Call:
## lm(formula = RETRF ~ MKTRF, data = tesla)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38709 -0.09618 -0.02755  0.08148  0.73560 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.02938    0.01431   2.054   0.0417 *  
## MKTRF        1.63279    0.31729   5.146    8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.173 on 154 degrees of freedom
## Multiple R-squared:  0.1467, Adjusted R-squared:  0.1412 
## F-statistic: 26.48 on 1 and 154 DF,  p-value: 7.996e-07
library(ggplot2)
tesla %>%
  ggplot(aes(x=MKTRF,y=RETRF))+
  geom_point(color="#2563EB",size=4)+
  labs(x='Market Premium',y='Tesla Premium',title='CAPM Regression')+
  theme_minimal()+
  scale_y_continuous(labels = scales::percent)+
  scale_x_continuous(labels = scales::percent)+
  geom_smooth(method = "lm", se = FALSE,color='grey30', size = 1) +
  annotate(x=max(tesla$MKTRF)-0.01,y=max(tesla$RETRF)-0.5,
           'text',label=paste('y =', round(tesla_reg$coefficients[1], 3),'+',
                              round(tesla_reg$coefficients[2], 3),'X'),
           fontface = "bold",
           color='black')

Rolling Regression - TESLA

  • This part is just exploring whether the time-series beta has to do with stock return.

  • Recall that the regression above is conducted on 20 year monthly data. In this part, we will conduct 5 year rolling regression and see how the beta has been changing over time.

  • I will be refering to the code from https://www.tidy-finance.org/r/beta-estimation.html website to compute rolling beta.

library(slider)

# Create a function that estimates the capm model
estimate_capm <- function(data, min_obs = 60) {
  if (nrow(data) < min_obs) {
    beta <- as.numeric(NA)
  } else {
    fit <- lm(RETRF ~ MKTRF, data = data)
    beta <- as.numeric(fit$coefficients[2])
  }
  return(beta)
}

# rolling capm estimation
roll_capm_estimation <- function(data, months, min_obs) {
  data <- bind_rows(data) %>%
    arrange(date)

  betas <- slide_period_vec(
    .x = data,
    .i = data$date,
    .period = "month",
    .f = ~estimate_capm(., min_obs),
    .before = months - 1,
    .complete = FALSE
  )

  tibble(
    month = unique(data$date),
    beta = betas
  )
}
# extracting the result
tesla_beta <- tesla %>%
 mutate(roll_capm_estimation(pick(everything()), months = 60, min_obs = 60)) %>%
          select(date, PERMNO, beta,RET)

tesla_beta %>% tail %>% kable()
date PERMNO beta RET
1672865 2023-01-31 93436 2.123747 0.406235
1677153 2023-02-28 93436 2.088892 0.187565
1680675 2023-03-31 93436 2.043201 0.008507
1685049 2023-04-30 93436 2.048751 -0.207992
1690521 2023-05-31 93436 2.055980 0.241130
1695259 2023-06-30 93436 2.090119 0.283627
library(gridExtra)

# plot for tesla beta
t1 <- tesla_beta %>%
  ggplot(aes(x=date,y=beta))+
  geom_line(color="#2563EB")+
  labs(x=NULL,y='Beta',title='Rolling 60 Month Beta for TSLA')+
  theme_minimal()+
  scale_x_date(date_breaks = '2 years', date_labels = '%Y') 

# plot for tesla return
t2 <- tesla_beta %>%
  ggplot(aes(x=date,y=RET))+
  geom_line(color="#2563EB")+
  labs(x='Date',y='Return',title='Monthly Return for TSLA')+
  theme_minimal()+
  scale_y_continuous(labels = scales::percent)+
  scale_x_date(date_breaks = '2 years', date_labels = '%Y')

grid.arrange(t1,t2)

  • Some people might be curious whether there is any relationship between beta and return, or whether there is anyway we can predict the return using the beta information.

  • Let’s start with computing correlation

# conduct correlation test
cor.test(tesla_beta$RET,tesla_beta$beta)
## 
##  Pearson's product-moment correlation
## 
## data:  tesla_beta$RET and tesla_beta$beta
## t = 0.088053, df = 95, p-value = 0.93
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1907553  0.2081041
## sample estimates:
##         cor 
## 0.009033681
  • Unfortunately, there wasn’t any significant correlation (p-value is kind of signifcant but the coefficient is low) between the the beta and the return. Let’s try to lag the beta and compute the correlation. Thus, we are using T beta to find the correlation for T+1 return. This is somehow using prior beta information to find the correlation for future return.
# lag the beta
tesla_beta$lagbeta <- tesla_beta$beta %>% lag()
# conduct correlation test
cor.test(tesla_beta$RET,tesla_beta$lagbeta)
## 
##  Pearson's product-moment correlation
## 
## data:  tesla_beta$RET and tesla_beta$lagbeta
## t = -0.29348, df = 94, p-value = 0.7698
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2293511  0.1712685
## sample estimates:
##         cor 
## -0.03025635
  • Unfortunately, there wasn’t any significant correlation.

  • Now, I will be testing on whether beta helps predict the return. I will be regressing T beta percentage change on T+1 Return. If I don’t lag the the period and match the period, that will cause look-ahead bias.

# calculate % increase in lag beta
tesla_beta$lagchange <- lag((tesla_beta$beta/tesla_beta$lagbeta)-1)
# conduct regression to predict return using beta
lm(data=tesla_beta, RET~lagbeta) %>% summary()
## 
## Call:
## lm(formula = RET ~ lagbeta, data = tesla_beta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40550 -0.12096 -0.03320  0.08491  0.69647 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.05815    0.05194   1.120    0.266
## lagbeta     -0.01032    0.03517  -0.293    0.770
## 
## Residual standard error: 0.1894 on 94 degrees of freedom
##   (60 observations deleted due to missingness)
## Multiple R-squared:  0.0009154,  Adjusted R-squared:  -0.009713 
## F-statistic: 0.08613 on 1 and 94 DF,  p-value: 0.7698
  • From the regression result, % change in beta does not help predict TESLA return.

  • Before even conducting this regression, we have to check the stationarity of the variables. Even though the coefficients are significant, there are some rules that we have to follow when conducting linear regression using OLS method which makes the model invalid. You will be learning this if you take FBE 543.

  • Therefore, this exercise is just conducted based on curiosity and it’s not really desirble method to predict stock return. Moreover, it is not even the main purpose of making this material

Mean Return by Beta Qunatile - TESLA

# Create quantiles based on beta
tesla_beta_quantiles <- tesla_beta %>%
  na.omit() %>% 
  mutate(quantile_group = cut(beta, breaks = quantile(beta, probs = seq(0, 1, 0.1)))) %>%
  group_by(quantile_group) %>%
  summarise(mean_return = mean(RET)) %>% na.omit()
# Plot the mean return for each quantile
ggplot(tesla_beta_quantiles, aes(x = quantile_group, y = mean_return)) +
  geom_bar(stat = "identity", fill = "#2563EB") +
  labs(x = 'Beta Quantiles', y = 'Mean Return', title = 'Mean Return for Each Beta Quantile') +
  theme_minimal() + 
  scale_y_continuous(labels = scales::percent)

Does Higher Beta Offers Higher Return?

# beta_df is the data that contains more than 60 observations for each PERMNO
beta_df <- df %>%
  group_by(PERMNO) %>%
  filter(n() >= 60) %>%
  ungroup()
# extracting the result
stocks_beta <- beta_df %>%
  group_by(PERMNO) %>%
  mutate(roll_capm_estimation(pick(everything()), months = 60, min_obs = 60)) %>%
  ungroup() %>%
  select(date, PERMNO, beta,RET,RETRF)
stocks_beta_final <- stocks_beta %>% na.omit()
stocks_beta_final %>% head()
## # A tibble: 6 x 5
##   date       PERMNO  beta     RET   RETRF
##   <date>      <dbl> <dbl>   <dbl>   <dbl>
## 1 1999-12-31  77659 1.53   0.112   0.108 
## 2 1999-12-31  75257 1.91  -0.0106 -0.0150
## 3 1999-12-31  79260 1.37   0.726   0.722 
## 4 1999-12-31  78868 0.760 -0.278  -0.282 
## 5 1999-12-31  79365 0.113 -0.401  -0.406 
## 6 1999-12-31  37699 1.33   0.473   0.468
stocks_beta_quantile <- stocks_beta_final %>%
  mutate(quantile_group = cut(beta, breaks = quantile(beta, probs = seq(0, 1, 0.1)))) %>%
  group_by(quantile_group) %>%
  summarise(mean_return = mean(RET)*12,
            risk = sd(RET)*sqrt(12),
            sharpe = mean(RETRF)*12/risk) %>% na.omit()

kable(stocks_beta_quantile)
quantile_group mean_return risk sharpe
(-8.96,0.26] 0.1727203 0.6074474 0.2482063
(0.26,0.484] 0.1451785 0.4297679 0.2896746
(0.484,0.674] 0.1285874 0.4452647 0.2464039
(0.674,0.851] 0.1261146 0.4643417 0.2340705
(0.851,1.02] 0.1172767 0.4803300 0.2102455
(1.02,1.2] 0.1259576 0.5197748 0.2131751
(1.2,1.41] 0.1212676 0.5573713 0.1923859
(1.41,1.68] 0.1120085 0.6200030 0.1591588
(1.68,2.14] 0.1111571 0.7341186 0.1319812
(2.14,9.79] 0.2319634 1.0102013 0.2122025
# Plot the mean return for each quantile
ggplot(stocks_beta_quantile, aes(x = quantile_group, y = sharpe)) +
  geom_bar(stat = "identity", fill = "#2563EB") +
  labs(x = 'Beta Quantiles', y = 'Sharpe Ratio', title = 'Sharpe Ratio for Each Beta Quantile (2000~2023)') +
  theme_minimal()

stocks_beta_final_recent <- stocks_beta_final[stocks_beta_final['date']>'2018-01-01',] %>%
  mutate(quantile_group = cut(beta, breaks = quantile(beta, probs = seq(0, 1, 0.1)))) %>%
  group_by(quantile_group) %>%
  summarise(mean_return = mean(RET)*12,
            risk = sd(RET)*sqrt(12),
            sharpe = mean(RETRF)*12/risk) %>% na.omit()

kable(stocks_beta_final_recent)
quantile_group mean_return risk sharpe
(-8.96,0.416] 0.0960968 0.9346077 0.0864004
(0.416,0.665] 0.0351043 0.4491116 0.0434323
(0.665,0.836] 0.0778037 0.4456290 0.1413074
(0.836,0.983] 0.0704497 0.4647352 0.1192782
(0.983,1.12] 0.0738895 0.4781614 0.1230703
(1.12,1.26] 0.0879407 0.4958023 0.1473986
(1.26,1.44] 0.1170988 0.5668250 0.1808610
(1.44,1.65] 0.0973321 0.6060496 0.1376442
(1.65,2.03] 0.0817184 0.7749122 0.0881162
(2.03,9.79] 0.4012588 1.2689997 0.3061907
# Plot the mean return for each quantile
ggplot(stocks_beta_final_recent, aes(x = quantile_group, y = sharpe)) +
  geom_bar(stat = "identity", fill = "#2563EB") +
  labs(x = 'Beta Quantiles', y = 'Sharpe Ratio', title = 'Sharpe Ratio for Each Beta Quantile (2018~2023)') +
  theme_minimal()