library(broom)  # for tidying output from various statistical procedures
library(ggpubr)  # for creating and customising ggplot2
library(Hmisc)  # for correlation analysis
library(kableExtra)  # for improving the appearance of tables
library(onewaytests)  # for significance testing
library(stargazer)  # for formatting regression and summary statistics tables
library(tidyquant)  # for importing stock data
library(tidyverse)  # for working with data
JWstocks <- c("AWR", "BAC", "MT") %>%
    tq_get(get = "stock.prices", from = "2000-01-01") %>%
    select(symbol, date, adjusted)

0.1 First 6 rows of imported data

# Displaying the first 6 rows of the imported stock prices
head(JWstocks, n = 6) %>%
    kable(caption = "First 6 rows of stock prices") %>%
    kable_styling(latex_options = "HOLD_position")
Table 0.1: First 6 rows of stock prices

symbol   date         adjusted
AWR      2000-01-03       6.42
AWR      2000-01-04       6.31
AWR      2000-01-05       6.32
AWR      2000-01-06       6.34
AWR      2000-01-07       6.38
AWR      2000-01-10       6.38

1 The Analysis

1.1 Plot prices over time

Plot the prices of each asset over time separately. Succinctly describe in words the evolution of each asset over time. (limit: 100 words for each time series).

# Filtering the list of stock prices for AWR only
AWRPrices <- JWstocks %>%
    filter(symbol == "AWR")

# Plotting the time series for AWR stock prices
ggplot(data = AWRPrices, aes(x = date, y = adjusted)) + geom_line(color = "#8483cf",
    size = 0.1) + labs(title = "Time Series of Stock Prices for AWR",
    x = "Date", y = "Price")

[Insert description here]

Describe the time series in general: comment on the trend, any seasonality, and the approximate lowest and highest stock prices and roughly when they occurred.

E.g. The time series displayed an upward trend, while seasonality started somewhere around 2013. There was a gradual rise in stock prices from 2001 until 2011, after which there was a sharp increase…

Mention the two significant financial events that have occurred in recent history, creating drops in stock prices (you’ll see these in Part 2.10 of the Project):

  1. On September 15, 2008, Lehman Brothers declared bankruptcy and the Global Financial Crisis began.
  2. On March 11, 2020, the WHO declared COVID-19 a pandemic.
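These dates can be marked on a price plot with vertical reference lines; below is a minimal sketch reusing the AWR series plotted above (the geom_vline() layer is an illustrative addition, not part of the original code):

# Marking the two events on the AWR price plot (illustrative sketch)
ggplot(data = AWRPrices, aes(x = date, y = adjusted)) + geom_line(color = "#8483cf",
    size = 0.1) + geom_vline(xintercept = as.Date(c("2008-09-15", "2020-03-11")),
    linetype = "dashed") + labs(title = "Time Series of Stock Prices for AWR",
    x = "Date", y = "Price")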


# Filtering the list of stock prices for BAC only
BACPrices <- JWstocks %>%
    filter(symbol == "BAC")

# Plotting the time series for BAC stock prices
ggplot(data = BACPrices, aes(x = date, y = adjusted)) + geom_line(color = "#3db153",
    size = 0.1) + labs(title = "Time Series of Stock Prices for BAC",
    x = "Date", y = "Price")

# Filtering the list of stock prices for MT only
MTPrices <- JWstocks %>%
    filter(symbol == "MT")

# Plotting the time series for MT stock prices
ggplot(data = MTPrices, aes(x = date, y = adjusted)) + geom_line(color = "#c4594e",
    size = 0.1) + labs(title = "Time Series of Stock Prices for MT",
    x = "Date", y = "Price")

1.2 Calculate returns and plot returns over time

Calculate the daily percentage returns of each asset using the following formula: \[ r_t = 100\ln\Big(\frac{P_t}{P_{t-1}}\Big) \]

Where \(P_t\) is the asset price at time \(t\). Then plot the returns for each asset over time.

# Computing the returns and adding the column of returns to
# the imported data

JWstocks <- JWstocks %>%
    group_by(symbol) %>%
    mutate(returns = 100 * log(adjusted/lag(adjusted)))

1.2.1 First 6 rows of data updated with returns

# Displaying the first 6 rows of the updated data frame
head(JWstocks, n = 6) %>%
    kable(caption = "First 6 rows of stock prices updated with returns") %>%
    kable_styling(latex_options = "HOLD_position")
Table 1.1: First 6 rows of stock prices updated with returns

symbol   date         adjusted   returns
AWR      2000-01-03       6.42
AWR      2000-01-04       6.31    -1.754
AWR      2000-01-05       6.32     0.177
AWR      2000-01-06       6.34     0.353
AWR      2000-01-07       6.38     0.527
AWR      2000-01-10       6.38     0.000

1.2.2 Time series of returns

# Filtering the list of returns for AWR only
AWRReturns <- JWstocks %>%
    filter(symbol == "AWR") %>%
    drop_na()

# Plotting the time series for AWR returns
ggplot(data = AWRReturns, aes(x = date, y = returns)) + geom_line(color = "#8483cf",
    size = 0.1) + labs(title = "Time Series of Returns for AWR",
    x = "Date", y = "Return")

# Filtering the list of returns for BAC only
BACReturns <- JWstocks %>%
    filter(symbol == "BAC") %>%
    drop_na()

# Plotting the time series for BAC returns
ggplot(data = BACReturns, aes(x = date, y = returns)) + geom_line(color = "#3db153",
    size = 0.1) + labs(title = "Time Series of Returns for BAC",
    x = "Date", y = "Return")

# Filtering the list of returns for MT only
MTReturns <- JWstocks %>%
    filter(symbol == "MT") %>%
    drop_na()

# Plotting the time series for MT returns
ggplot(data = MTReturns, aes(x = date, y = returns)) + geom_line(color = "#c4594e",
    size = 0.1) + labs(title = "Time Series of Returns for MT",
    x = "Date", y = "Return")

1.3 Histograms of returns

Create a histogram for each of the returns series (explain how you determined the number of bins to use).

# Calculating the number of bins using Sturges' rule and
# rounding up to the nearest integer
mybins <- ceiling(1 + log2(length(JWstocks$returns)/3))

Based on Sturges' rule, the number of bins, k, was found using k = 1 + log2(n) (equivalently, 1 + 3.322 log10(n)), where n = 5888 observations per stock, giving a value of 14 when rounded up to the nearest integer.
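As a cross-check, base R ships an implementation of the same rule, nclass.Sturges(); a one-line sketch (not part of the original code):

# Cross-check with base R's built-in Sturges rule: ceiling(log2(n) + 1)
nclass.Sturges(AWRReturns$returns)  # 14 for the 5887 non-missing AWR returns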

# Helper that rounds x to the nearest multiple of `accuracy`
# (a stand-in for plyr::round_any)
round_any = function(x, accuracy, f = round) {
    f(x/accuracy) * accuracy
}

# To determine the span of the x-axis for the histogram of AWR
# returns
maxAWRReturns <- max(AWRReturns$returns, na.rm = TRUE)
minAWRReturns <- min(AWRReturns$returns, na.rm = TRUE)
lhistlimitAWR = round_any((minAWRReturns), 10, f = floor)
uhistlimitAWR = round_any((maxAWRReturns), 10, f = ceiling)
ggplot(AWRReturns, aes(x = returns)) + geom_histogram(bins = mybins,
    col = "black", fill = "#8483cf", alpha = 0.5) + labs(title = "Histogram of Returns for AWR",
    x = "Returns", y = "Frequency") + xlim(c(lhistlimitAWR, uhistlimitAWR))

# To determine the span of the x-axis for the histogram of BAC
# returns
maxBACReturns <- max(BACReturns$returns, na.rm = TRUE)
minBACReturns <- min(BACReturns$returns, na.rm = TRUE)
lhistlimitBAC = round_any(minBACReturns, 10, f = floor)
uhistlimitBAC = round_any(maxBACReturns, 10, f = ceiling)
ggplot(BACReturns, aes(x = returns)) + geom_histogram(bins = mybins,
    col = "black", fill = "#3db153", alpha = 0.5) + labs(title = "Histogram of Returns for BAC",
    x = "Returns", y = "Frequency") + xlim(c(lhistlimitBAC, uhistlimitBAC))
# To determine the span of the x-axis for the histogram of
# MT returns
maxMTReturns <- max(MTReturns$returns, na.rm = TRUE)
minMTReturns <- min(MTReturns$returns, na.rm = TRUE)
lhistlimitMT = round_any(minMTReturns, 10, f = floor)
uhistlimitMT = round_any(maxMTReturns, 10, f = ceiling)
ggplot(MTReturns, aes(x = returns)) + geom_histogram(bins = mybins,
    col = "black", fill = "#c4594e", alpha = 0.5) + labs(title = "Histogram of Returns for MT",
    x = "Returns", y = "Frequency") + xlim(c(lhistlimitMT, uhistlimitMT))

1.4 Summary table of returns

Report the descriptive statistics in a single table which includes the mean, median, variance, standard deviation, skewness and kurtosis for each series. What conclusions can you draw from these descriptive statistics?

Summary statistics for returns

# Extracting the mean, median, variance, standard deviation,
# skewness and kurtosis for each stock

JWstocks %>%
    group_by(symbol) %>%
    dplyr::summarise(Mean = mean(returns, na.rm = TRUE), Median = median(returns,
        na.rm = TRUE), Variance = var(returns, na.rm = TRUE),
        `Std Dev` = sd(returns, na.rm = TRUE), Skewness = skewness(returns,
            na.rm = TRUE), Kurtosis = kurtosis(returns, na.rm = TRUE)) %>%
    kable(caption = "Summary statistics for returns") %>%
    kable_styling(latex_options = "HOLD_position")
Table 1.2: Summary statistics for returns

symbol     Mean   Median   Variance   Std Dev   Skewness   Kurtosis
AWR       0.045     0.08       3.38      1.84     -0.117       7.55
BAC       0.013     0.03       7.93      2.82     -0.315      26.41
MT       -0.005     0.00      12.42      3.52      0.124       9.95

[Enter description of each distribution here]

Example (AWR):

The mean return for AWR is 0.031% per day (the returns are already expressed in percentages), with a relatively high variance of 15.46, indicating that the observations are widely spread around the mean. The distribution is slightly negatively skewed (longer left tail, with the peak to the right), with a skewness coefficient of -0.139, and it is leptokurtic (more peaked than a normal distribution), with a positive kurtosis coefficient of 8.49.

1.5 Are average returns significantly different from zero?

Under the assumption that the returns of each asset are drawn from an independently and identically distributed normal distribution, are the expected returns of each asset statistically different from zero at the 1% level of significance? Provide details for all 5 steps to conduct a hypothesis test, including the equation for the test statistic. Calculate and report all the relevant values for your conclusion and be sure to provide an interpretation of the results.

# Extract AWR returns from JWstocks and convert into a
# vector
awr <- JWstocks %>%
    filter(symbol == "AWR") %>%
    select(returns) %>%
    drop_na() %>%
    pull()
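The tests in Sections 1.5.2, 1.5.3 and 1.8 also reference the vectors bac and mt; these can be created by mirroring the extraction above (a sketch following the same pattern):

# Extract BAC and MT returns from JWstocks and convert into
# vectors
bac <- JWstocks %>%
    filter(symbol == "BAC") %>%
    select(returns) %>%
    drop_na() %>%
    pull()

mt <- JWstocks %>%
    filter(symbol == "MT") %>%
    select(returns) %>%
    drop_na() %>%
    pull()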

1.5.1 One-sample t-test (AWR)

Step 1: Formulate the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \mu=0 \\ H_1&:\ \mu\neq0 \end{aligned} \]

alpha <- 0.01
n <- length(awr)

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test statistic is

\[ \ t = \frac{\bar{x}\ -\ \mu\ }{{s}/{\sqrt n}} \] Since this is a two-tailed test, divide \(\alpha\) in two for the lower and upper tails. The t-test is used because

  1. the population variance is unknown
  2. the population is normally distributed.
## Conducting the test for AWR
# Note: t.test() has no alpha argument; the 1% level is applied
# through the critical values and p-value comparison below
tAWR <- t.test(awr, mu = 0, alternative = "two.sided")
tAWR
## 
##  One Sample t-test
## 
## data:  awr
## t = 2, df = 5886, p-value = 0.06
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.00208  0.09179
## sample estimates:
## mean of x 
##    0.0449
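As a cross-check, the reported test statistic can be reproduced directly from the formula in Step 3 (a minimal sketch using the awr vector defined earlier; not part of the original output):

# Computing the one-sample t statistic for AWR by hand
xbarAWR <- mean(awr)
sdAWR <- sd(awr)
nAWR <- length(awr)
(xbarAWR - 0)/(sdAWR/sqrt(nAWR))  # approximately 1.873, matching the t.test output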

Step 4: Critical values

lowerLimit <- round(qt(alpha/2, df = n - 1), 3)
upperLimit <- round(qt(1 - alpha/2, df = n - 1), 3)

decisionAWRCV <- ifelse(tAWR$statistic < lowerLimit | tAWR$statistic >
    upperLimit, "reject the null hypothesis", "do not reject the null hypothesis")

decisionAWRPV <- ifelse(tAWR$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

The lower and upper critical values are -2.577 and 2.577 respectively.

The p-value is 0.061.

Step 5: Decision and conclusion

Critical value approach: Since -2.577 < 1.873 < 2.577, do not reject the null hypothesis.

p-value approach: Since 0.061 > 0.01, do not reject the null hypothesis.

Conclude that the mean return for AWR is not statistically different from 0.

1.5.2 One-sample t-test (BAC)

Step 1: Formulate the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \mu=0 \\ H_1&:\ \mu\neq0 \end{aligned} \]

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test statistic is

\[ \ t = \frac{\bar{x}\ -\ \mu\ }{{s}/{\sqrt n}} \] Since this is a two-tailed test, divide \(\alpha\) in two for the lower and upper tails. The t-test is used because the population variance is unknown.

## Conducting the test for BAC
tBAC <- t.test(bac, mu = 0, alternative = "two.sided")
tBAC
## 
##  One Sample t-test
## 
## data:  bac
## t = 0.3, df = 5886, p-value = 0.7
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0592  0.0847
## sample estimates:
## mean of x 
##    0.0128

Step 4: Critical values

decisionBACCV <- ifelse(tBAC$statistic < lowerLimit | tBAC$statistic >
    upperLimit, "reject the null hypothesis", "do not reject the null hypothesis")
decisionBACPV <- ifelse(tBAC$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

The lower and upper critical values are -2.577 and 2.577 respectively.

The p-value is 0.728.

Step 5: Decision and conclusion

Critical value approach: Since -2.577 < 0.348 < 2.577, do not reject the null hypothesis.

p-value approach: Since 0.728 > 0.01, do not reject the null hypothesis.

Conclude that the mean return for BAC is not statistically different from 0.

1.5.3 One-sample t-test (MT)

Step 1: Formulate the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \mu=0 \\ H_1&:\ \mu\neq0 \end{aligned} \]

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test statistic is

\[ \ t = \frac{\bar{x}\ -\ \mu\ }{{s}/{\sqrt n}} \] Since this is a two-tailed test, divide \(\alpha\) in two for the lower and upper tails. The t-test is used because the population variance is unknown.

## Conducting the test for MT
tMT <- t.test(mt, mu = 0, alternative = "two.sided")
tMT
## 
##  One Sample t-test
## 
## data:  mt
## t = -0.1, df = 5886, p-value = 0.9
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0948  0.0852
## sample estimates:
## mean of x 
##   -0.0048

Step 4: Critical values

decisionMTCV <- ifelse(tMT$statistic < lowerLimit | tMT$statistic >
    upperLimit, "reject the null hypothesis", "do not reject the null hypothesis")

decisionMTPV <- ifelse(tMT$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

The lower and upper critical values are -2.577 and 2.577 respectively.

The p-value is 0.917.

Step 5: Decision and conclusion

Critical value approach: Since -2.577 < -0.105 < 2.577, do not reject the null hypothesis.

p-value approach: Since 0.917 > 0.01, do not reject the null hypothesis.

Conclude that the mean return for MT is not statistically different from 0.

1.6 Are average returns different from each other?

Assume the returns of each asset are independent from each other. With this assumption, are the mean returns statistically different from each other at the 1% level of significance? Provide details for all 5 steps to conduct each of the hypothesis tests using what you have learned in the unit. Calculate and report all the relevant values for your conclusion and be sure to provide an interpretation of the results. (Hint: You need to discuss the equality of variances to determine which type of test to use.)

One-way ANOVA is used to test for differences in mean returns.

Assume that Population 1 = AWR, Population 2 = BAC and Population 3 = MT.

Procedure:

  1. Check for homogeneity of population variances by using Levene’s test
  2. Depending on the outcome of Levene’s test, use the classical one-way ANOVA F-test if the variances are homogeneous; otherwise use Welch’s heteroscedastic F-test.

1.6.1 Levene’s Test for homogeneity of variances

Step 1: State the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \sigma_1^2=\sigma_2^2=\sigma_3^2 \\ H_1&:\text{ At least one of the } \sigma_j^2 \text{ is not equal to the others} \end{aligned} \]

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test-statistic is

\[ W=\frac{(n-c)}{(c-1)} \cdot \frac{\sum_{j=1}^{c} n_{j}\left(\bar{Z}_{\cdot j}-\bar{Z}_{\cdot \cdot}\right)^{2}}{\sum_{j=1}^{c} \sum_{i=1}^{n_{j}}\left(Z_{i j}-\bar{Z}_{\cdot j}\right)^{2}} \] where \(Z_{ij}=\left|X_{ij}-\bar{X}_{\cdot j}\right|\) is the absolute deviation of each observation from its group mean.
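The same type of statistic can be reproduced by running a one-way ANOVA on the absolute deviations (an illustrative sketch; onewaytests::homog.test may centre the deviations differently, e.g. on group medians, so the value can differ slightly):

# Levene-type statistic via ANOVA on absolute deviations from
# the group means (illustrative sketch)
levene_check <- JWstocks %>%
    drop_na(returns) %>%
    group_by(symbol) %>%
    mutate(absdev = abs(returns - mean(returns))) %>%
    ungroup()
anova(lm(absdev ~ symbol, data = levene_check))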

## Running Levene's test for homogeneity of variances

homog_var <- onewaytests::homog.test(returns ~ symbol, data = JWstocks,
    alpha = 0.01, method = "Levene")
## 
##   Levene's Homogeneity Test (alpha = 0.01) 
## ----------------------------------------------- 
##   data : returns and symbol 
## 
##   statistic  : 417 
##   num df     : 2 
##   denum df   : 17658 
##   p.value    : 1.37e-177 
## 
##   Result     : Variances are not homogeneous. 
## -----------------------------------------------

Since this is a one-tailed test, \(\alpha\) is in the upper tail only.

criticalLevene <- round(qf(1 - alpha, df1 = 2, df2 = n - 3),
    3)  # upper critical value
decisionLeveneCV <- ifelse(homog_var$statistic > criticalLevene,
    "reject the null hypothesis", "do not reject the null hypothesis")

decisionLevenePV <- ifelse(homog_var$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

Step 4: The upper critical value is 4.609.

Step 5: Decision and conclusion

Critical value approach: Since 416.778 > 4.609, reject the null hypothesis.

p-value approach: Since \(1.374 \times 10^{-177}\) < 0.01, reject the null hypothesis.

Conclude that the variance in returns is different for at least one stock.

1.6.2 Welch’s heteroscedastic F-test for equality of means

Step 1: State the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \mu_1=\mu_2=\mu_3 \\ H_1&:\text{ At least one of the } \mu_j \text{ is not equal to the others} \end{aligned} \]

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test-statistic is

\[ F_{W}=\frac{\sum_{j} w_{j}\left(\bar{X}_{. j}-X_{. .}^{\prime}\right)^{2} /(k-1)}{\left[1+\frac{2}{3}((k-2) v)\right]} \sim F_{k-1,1 / v} \] where

\[ w_{j}=\frac{n_{j}}{S_{j}^{2}} \]

\[ s_{j}^{2}=\sum_{i}\left(X_{i j}-\bar{X}_{. j}\right)^{2} /\left(n_{j}-1\right) \]

\[ X_{. .}^{\prime}=\frac{\sum_{j} w_{j} \bar{X}_{. j}}{\sum_{j} w_{j}} \]

\[ v=\frac{3 \sum_{j}\left[\left(1-\frac{w_{j}}{\sum_{j} w_{j}}\right)^{2} /\left(n_{j}-1\right)\right]}{k^{2}-1} \]
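As a cross-check (not part of the original analysis), base R’s oneway.test() with var.equal = FALSE implements the same Welch statistic:

# Welch's heteroscedastic one-way test using base R, as a
# cross-check of the onewaytests result
oneway.test(returns ~ symbol, data = JWstocks, var.equal = FALSE)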

## Running Welch's heteroscedastic F test for equality of
## means

welch_test <- onewaytests::welch.test(returns ~ symbol, data = JWstocks,
    alpha = 0.01, na.rm = TRUE, verbose = TRUE)
## 
##   Welch's Heteroscedastic F Test (alpha = 0.01) 
## ------------------------------------------------------------- 
##   data : returns and symbol 
## 
##   statistic  : 0.588 
##   num df     : 2 
##   denom df   : 10933 
##   p.value    : 0.556 
## 
##   Result     : Difference is not statistically significant. 
## -------------------------------------------------------------
criticalWelch <- round(qf(1 - alpha, df1 = 2, df2 = welch_test$parameter[2]),
    3)  # upper critical value

decisionWelchCV <- ifelse(welch_test$statistic > criticalWelch,
    "reject the null hypothesis", "do not reject the null hypothesis")

decisionWelchPV <- ifelse(welch_test$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

Step 4: The upper critical value is 4.607.

Since this is a one-tailed test, \(\alpha\) is in the upper tail only.

Step 5: Decision and conclusion

Critical value approach: Since 0.588 < 4.607, do not reject the null hypothesis.

p-value approach: Since 0.556 > 0.01, do not reject the null hypothesis.

Conclude that the differences between the three means are not statistically significant.

1.7 Correlations

Calculate and present the correlation matrix of the returns. Discuss the direction and strength of the correlations.

# Creating the matrix of correlations

stocksMatrix <- JWstocks %>%
    select(date, symbol, returns) %>%
    drop_na() %>%
    pivot_wider(id_cols = date, names_from = symbol, values_from = returns)

cor(stocksMatrix %>%
    select(-date)) %>%
    kable(caption = "Correlation matrix") %>%
    kable_styling(latex_options = "HOLD_position")
Table 1.3: Correlation matrix

         AWR     BAC      MT
AWR    1.000   0.302   0.233
BAC    0.302   1.000   0.413
MT     0.233   0.413   1.000

Interpretation of results:

  1. There is a weak positive correlation between AWR and BAC.
  2. There is a weak positive correlation between AWR and MT.
  3. There is a moderate positive correlation between BAC and MT.

Reference:

Shortell, Timothy. “An Introduction to Data Analysis & Presentation.” The City University of New York. Accessed May 30, 2023. http://academic.brooklyn.cuny.edu/soc/courses/712/chap18.html.

1.8 Testing the significance of correlations

Is the assumption of independence of stock returns realistic? Provide evidence (a hypothesis test including all 5 steps and the equation for the test statistic) and a rationale to support your conclusion.

Procedure:

Assume that Population 1 = AWR, Population 2 = BAC and Population 3 = MT. The same procedure will be followed to test whether the correlations between the three possible pairs of stocks are significant. An example of how to test for the significance of the correlation between BAC and MT returns is given below.

Correlation test

Step 1: State the null and alternative hypotheses

\[ \begin{aligned} H_0&:\ \rho = 0 \\ H_1&:\ \rho \neq0 \end{aligned} \]

Step 2: The significance level, \(\alpha\), is 0.01 and the sample size, n, is 5887.

Step 3: The test-statistic is

\[ t=\frac{r}{\sqrt{1-r^{2}}} \sqrt{n-2} \]
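The statistic can also be computed by hand from the sample correlation (a small sketch using the bac and mt vectors defined in Section 1.5; not part of the original output):

# Computing the test statistic for the BAC-MT correlation by hand
rBACMT <- cor(bac, mt)
nPairs <- length(bac)
rBACMT/sqrt(1 - rBACMT^2) * sqrt(nPairs - 2)  # approximately 35, matching cor.test below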

## Running the correlation test

correl <- cor.test(bac, mt, method = "pearson")
correl
## 
##  Pearson's product-moment correlation
## 
## data:  bac and mt
## t = 35, df = 5885, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.392 0.434
## sample estimates:
##   cor 
## 0.413

Since this is a two-tailed test, \(\alpha\) is divided into two parts: the lower and upper tails of the t-distribution.

lcritCorr <- round(qt(alpha/2, df = n - 2), 3)  # lower critical value
ucritCorr <- round(qt(1 - alpha/2, df = n - 2), 3)  # upper critical value
decisionCorr <- ifelse(correl$p.value < alpha, "reject the null hypothesis",
    "do not reject the null hypothesis")

Step 4: The lower and upper critical values are -2.577 and 2.577 respectively.

Step 5: Decision and conclusion

Critical value approach: Since 35 > 2.577, reject the null hypothesis.

p-value approach: Since \(9.226 \times 10^{-242}\) < 0.01, reject the null hypothesis.

Conclude that the returns of the two stocks are not independent.

# Creating the data frame myreturns
myreturns = data.frame(awr, bac, mt)
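The table of pairwise p-values referred to below can be produced with Hmisc::rcorr(), which returns both the correlation matrix and the corresponding p-values (a sketch; the original output is not reproduced here):

# Pairwise Pearson correlations and p-values for the three
# return series
rcorr(as.matrix(myreturns), type = "pearson")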


It is observed that all three p-values in the above table are very close to 0 and hence significant at the 1% level. It means that each stock is not independent from the other two. Therefore, the assumption of independence of stock returns is not realistic.