Financial Stock Data Analysis Project - R Implementation Guide

Project Overview

This guide provides a complete R workflow for analyzing stock price trends, correlations, and market indices using Yahoo Finance data. The analysis includes exploratory data analysis (EDA), correlation analysis, risk metrics, technical indicators, and performance evaluation.

Phase 1: Data Collection & Setup

Required R Packages

# Install required packages (run once)
if (!require("quantmod")) install.packages("quantmod")
if (!require("tidyquant")) install.packages("tidyquant")
if (!require("dplyr")) install.packages("dplyr")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("corrplot")) install.packages("corrplot")
if (!require("PerformanceAnalytics")) install.packages("PerformanceAnalytics")
if (!require("xts")) install.packages("xts")
if (!require("zoo")) install.packages("zoo")

# Load libraries
library(quantmod)
library(tidyquant)
library(dplyr)
library(ggplot2)
library(corrplot)
library(PerformanceAnalytics)
library(xts)
library(zoo)

Download Stock Data from Yahoo Finance

# Method 1: Using quantmod
getSymbols(c("AAPL", "MSFT", "GOOGL", "^GSPC"), 
           from = "2020-01-01", 
           to = Sys.Date(),
           src = "yahoo")

# Method 2: Using tidyquant (more modern approach)
stock_prices <- c("AAPL", "MSFT", "GOOGL") %>%
  tq_get(get = "stock.prices",
         from = "2020-01-01",
         to = Sys.Date())

market_index <- "^GSPC" %>%
  tq_get(get = "stock.prices",
         from = "2020-01-01",
         to = Sys.Date())

# Select adjusted closing prices
aapl_prices <- Ad(AAPL)  # Using quantmod
msft_prices <- Ad(MSFT)
googl_prices <- Ad(GOOGL)
gspc_prices <- Ad(GSPC)

# Combine into single xts object
prices_xts <- merge(aapl_prices, msft_prices, googl_prices, gspc_prices)
colnames(prices_xts) <- c("AAPL", "MSFT", "GOOGL", "GSPC")

Phase 2: Exploratory Data Analysis (EDA)

Calculate Daily Returns

# Calculate daily returns (percentage change)
daily_returns <- ROC(prices_xts, type = "discrete") * 100  # in percentage
daily_returns <- na.omit(daily_returns)

# Or using dplyr approach
returns_df <- stock_prices %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "daily",
               col_rename = "returns")

Summary Statistics

# Descriptive statistics
summary_stats <- function(returns_data) {
  stats <- data.frame(
    Mean = colMeans(returns_data, na.rm = TRUE),
    StdDev = apply(returns_data, 2, sd, na.rm = TRUE),
    Min = apply(returns_data, 2, min, na.rm = TRUE),
    Max = apply(returns_data, 2, max, na.rm = TRUE),
    Skewness = apply(returns_data, 2, skewness, na.rm = TRUE),
    Kurtosis = apply(returns_data, 2, kurtosis, na.rm = TRUE)
  )
  return(stats)
}

summary_table <- summary_stats(daily_returns)
print(summary_table)

# Annualized metrics
annualized <- data.frame(
  Annual_Return = colMeans(daily_returns) * 252,
  Annual_Volatility = apply(daily_returns, 2, sd) * sqrt(252)
)
print(annualized)

Visualize Price Trends

# Plot price series with ggplot2
prices_df <- stock_prices %>%
  select(date, symbol, adjusted) %>%
  filter(symbol %in% c("AAPL", "MSFT", "GOOGL"))

ggplot(prices_df, aes(x = date, y = adjusted, color = symbol)) +
  geom_line() +
  facet_wrap(~symbol, scales = "free_y") +
  labs(title = "Stock Price Trends (2020-Present)",
       x = "Date", y = "Adjusted Price ($)") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Candlestick chart using quantmod
chartSeries(AAPL,
            from = "2023-01-01",
            theme = chartTheme("white"),
            up.col = "green",
            dn.col = "red")

Cumulative Returns Plot

# Calculate cumulative returns
cum_returns <- cumprod(1 + daily_returns) - 1

# Plot cumulative returns
plot(cum_returns[, c("AAPL", "MSFT", "GOOGL")],
     legend.loc = "topleft",
     main = "Cumulative Returns",
     ylab = "Return (%)",
     col = c("blue", "red", "green"))

Phase 3: Correlation Analysis

Calculate Correlation Matrix

# Pearson correlation
cor_matrix <- cor(daily_returns[, c("AAPL", "MSFT", "GOOGL")], use = "complete.obs")
print(cor_matrix)

# Visualize correlation heatmap
corrplot(cor_matrix,
         method = "color",
         type = "upper",
         diag = TRUE,
         addCoef.col = "black",
         tl.col = "black")

# With ggplot2 + reshape2
library(reshape2)
melted_corr <- melt(cor_matrix)

ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       limits = c(-1, 1)) +
  theme_minimal() +
  labs(title = "Stock Correlation Matrix",
       x = "", y = "")

Rolling Correlation Analysis

# 60-day rolling correlation between AAPL and MSFT
rolling_corr <- zoo::rollapply(
  daily_returns[, c("AAPL", "MSFT")],
  width = 60,
  FUN = function(x) cor(x)[1, 2],
  by.column = FALSE
)

# Plot rolling correlation
plot(rolling_corr,
     main = "60-Day Rolling Correlation: AAPL vs MSFT",
     ylab = "Correlation",
     xlab = "Date",
     type = "l",
     col = "blue")
abline(h = 0, lty = 2, col = "red")

Correlation with Market Index

# Calculate correlation with S&P 500
market_corr <- cor(daily_returns[, c("AAPL", "MSFT", "GOOGL")],
                   daily_returns[, "GSPC"],
                   use = "complete.obs")
print(market_corr)

# Interpretation: 
# > 0.7: Strong positive correlation
# 0.3-0.7: Moderate correlation
# < 0.3: Weak correlation

Phase 4: Risk Analysis & Beta Calculation

Calculate Beta (Market Sensitivity)

# Beta = Covariance(Stock, Market) / Variance(Market)

calculate_beta <- function(stock_returns, market_returns) {
  covariance <- cov(stock_returns, market_returns)
  market_variance <- var(market_returns)
  beta <- covariance / market_variance
  return(beta)
}

# Calculate for each stock
betas <- sapply(
  daily_returns[, c("AAPL", "MSFT", "GOOGL")],
  function(x) calculate_beta(x, daily_returns[, "GSPC"])
)

print(betas)

# Interpretation:
# Beta > 1: More volatile than market (higher risk, higher potential return)
# Beta = 1: Moves with market
# Beta < 1: Less volatile than market (lower risk, lower potential return)

Regression Analysis for Beta

# Using linear regression model
beta_model <- lm(daily_returns[, "AAPL"] ~ daily_returns[, "GSPC"])
beta_coefficient <- coef(beta_model)[2]  # Slope of regression line
alpha <- coef(beta_model)[1]  # Intercept

print(paste("Beta:", round(beta_coefficient, 4)))
print(paste("Alpha:", round(alpha, 6)))

# Summary statistics
summary(beta_model)

Volatility Analysis

# Historical volatility (annualized)
historical_vol <- apply(daily_returns[, c("AAPL", "MSFT", "GOOGL")], 2, sd) * sqrt(252)
print(historical_vol)

# Rolling volatility
rolling_vol <- zoo::rollapply(
  daily_returns[, "AAPL"],
  width = 30,
  FUN = sd,
  by.column = FALSE
) * sqrt(252)

plot(rolling_vol,
     main = "30-Day Rolling Volatility (Annualized)",
     ylab = "Volatility",
     xlab = "Date",
     type = "l",
     col = "darkblue")

Maximum Drawdown Calculation

# Function to calculate maximum drawdown
max_drawdown <- function(returns) {
  cum_returns <- cumprod(1 + returns)
  running_max <- cummax(cum_returns)
  drawdown <- (cum_returns - running_max) / running_max
  return(min(drawdown))
}

# Apply to all stocks
drawdowns <- apply(daily_returns[, c("AAPL", "MSFT", "GOOGL")], 2, max_drawdown)
print(drawdowns)

# Visualization of drawdown period
cum_aapl <- cumprod(1 + daily_returns[, "AAPL"])
running_max_aapl <- cummax(cum_aapl)
drawdown_aapl <- (cum_aapl - running_max_aapl) / running_max_aapl

plot(drawdown_aapl,
     main = "AAPL Drawdown Over Time",
     ylab = "Drawdown",
     xlab = "Date",
     type = "l",
     col = "red")

Phase 5: Performance Metrics

Sharpe Ratio (Risk-Adjusted Return)

# Sharpe Ratio = (Mean Return - Risk-Free Rate) / Standard Deviation
# Risk-free rate typically 4% annually or 0.016% daily

risk_free_rate <- 0.04 / 252  # Daily risk-free rate

sharpe_ratio <- (colMeans(daily_returns) - risk_free_rate) / apply(daily_returns, 2, sd) * sqrt(252)

print(sharpe_ratio)

# Using PerformanceAnalytics package
returns_monthly <- returns_df %>%
  group_by(symbol) %>%
  tq_transmute(select = returns,
               mutate_fun = periodReturn,
               period = "monthly",
               col_rename = "monthly_returns")

sharpe_monthly <- returns_monthly %>%
  group_by(symbol) %>%
  summarise(sharpe = SharpeRatio(monthly_returns))

Sortino Ratio (Downside Risk)

# Sortino Ratio = (Mean Return - Risk-Free Rate) / Downside Deviation
# Only penalizes downside volatility

sortino_ratio_calc <- function(returns, rf = 0.04/252) {
  excess_returns <- returns - rf
  downside_returns <- pmin(excess_returns, 0)
  downside_deviation <- sqrt(mean(downside_returns^2))
  sortino <- mean(excess_returns) / downside_deviation * sqrt(252)
  return(sortino)
}

sortino_ratios <- apply(daily_returns[, c("AAPL", "MSFT", "GOOGL")], 2, sortino_ratio_calc)
print(sortino_ratios)

Win Rate & Positive Days

# Percentage of days with positive returns
win_rate <- colSums(daily_returns[, c("AAPL", "MSFT", "GOOGL")] > 0) / nrow(daily_returns)
print(win_rate)

# Average gain on positive days
avg_positive_return <- sapply(
  daily_returns[, c("AAPL", "MSFT", "GOOGL")],
  function(x) mean(x[x > 0], na.rm = TRUE)
)
print(avg_positive_return)

Phase 6: Technical Analysis & Trend Indicators

Moving Averages

# Simple Moving Average (SMA)
sma_20 <- zoo::rollmean(prices_xts[, "AAPL"], k = 20)
sma_50 <- zoo::rollmean(prices_xts[, "AAPL"], k = 50)
sma_200 <- zoo::rollmean(prices_xts[, "AAPL"], k = 200)

# Using quantmod
sma_20_qm <- SMA(Ad(AAPL), n = 20)
ema_12_qm <- EMA(Ad(AAPL), n = 12)

# Plot with moving averages
plot(Ad(AAPL), main = "AAPL with Moving Averages")
lines(sma_20, col = "blue", lwd = 2)
lines(sma_50, col = "red", lwd = 2)
lines(sma_200, col = "green", lwd = 2)
legend("topleft", c("Price", "SMA-20", "SMA-50", "SMA-200"),
       col = c("black", "blue", "red", "green"), lwd = c(1, 2, 2, 2))

# Golden Cross / Death Cross signals
latest_sma_20 <- sma_20[nrow(sma_20)]
latest_sma_50 <- sma_50[nrow(sma_50)]
latest_sma_200 <- sma_200[nrow(sma_200)]

if (!is.na(latest_sma_20) & !is.na(latest_sma_50) & !is.na(latest_sma_200)) {
  if (latest_sma_20 > latest_sma_50 & latest_sma_50 > latest_sma_200) {
    print("STRONG UPTREND - Golden Cross")
  } else if (latest_sma_20 < latest_sma_50 & latest_sma_50 < latest_sma_200) {
    print("STRONG DOWNTREND - Death Cross")
  }
}

Relative Strength Index (RSI)

# RSI = 100 - (100 / (1 + RS))
# RS = Average Gain / Average Loss over period

calculate_rsi <- function(prices, period = 14) {
  returns <- diff(log(prices))
  gains <- pmax(returns, 0)
  losses <- -pmin(returns, 0)
  
  avg_gain <- zoo::rollmean(gains, k = period, fill = NA)
  avg_loss <- zoo::rollmean(losses, k = period, fill = NA)
  
  rs <- avg_gain / avg_loss
  rsi <- 100 - (100 / (1 + rs))
  return(rsi)
}

rsi_aapl <- calculate_rsi(Ad(AAPL), period = 14)

# Or using quantmod
rsi_qm <- RSI(Ad(AAPL), n = 14)

# Plot RSI
plot(rsi_aapl, main = "AAPL RSI (14)", ylab = "RSI", ylim = c(0, 100))
abline(h = 70, col = "red", lty = 2)  # Overbought level
abline(h = 30, col = "green", lty = 2)  # Oversold level

# Interpretation:
# RSI > 70: Overbought (potential sell signal)
# RSI < 30: Oversold (potential buy signal)

MACD (Moving Average Convergence Divergence)

# MACD = EMA(12) - EMA(26)
# Signal Line = EMA(9) of MACD

macd <- MACD(Ad(AAPL), nFast = 12, nSlow = 26, nSig = 9)

plot(macd, main = "AAPL MACD")

# Signals:
# MACD > Signal Line: Bullish
# MACD < Signal Line: Bearish

Bollinger Bands

# Bollinger Bands = SMA ± (2 * Standard Deviation)

bb <- BBands(Ad(AAPL), n = 20, sd = 2)

plot(Ad(AAPL), main = "AAPL Bollinger Bands")
lines(bb[, "dn"], col = "red", lty = 2)
lines(bb[, "mavg"], col = "blue")
lines(bb[, "up"], col = "red", lty = 2)

Phase 7: Statistical Analysis

Stationarity Tests (ADF and KPSS)

library(tseries)

# Augmented Dickey-Fuller (ADF) Test
adf_test <- adf.test(as.numeric(daily_returns[, "AAPL"]))
print(adf_test)

# KPSS Test
kpss_test <- kpss.test(as.numeric(daily_returns[, "AAPL"]))
print(kpss_test)

# Interpretation:
# ADF: H0 = unit root (non-stationary)
#      If p-value < 0.05: Reject H0 → Series is stationary
# KPSS: H0 = stationary
#       If p-value > 0.05: Fail to reject H0 → Series is stationary

Autocorrelation & Partial Autocorrelation

# ACF plot
acf(daily_returns[, "AAPL"], main = "ACF - AAPL Daily Returns")

# PACF plot
pacf(daily_returns[, "AAPL"], main = "PACF - AAPL Daily Returns")

# Ljung-Box test for autocorrelation
Box.test(daily_returns[, "AAPL"], lag = 10, type = "Ljung-Box")

Phase 8: Advanced Analysis

Monte Carlo Simulation for Portfolio Returns

# Simulate future returns using Monte Carlo

simulate_portfolio_returns <- function(
  returns_data, 
  weights = c(0.5, 0.3, 0.2),
  days_ahead = 30,
  simulations = 10000) {
  
  # Calculate mean and covariance
  mean_returns <- colMeans(returns_data)
  cov_matrix <- cov(returns_data)
  
  # Cholesky decomposition for correlated returns
  L <- t(chol(cov_matrix))
  
  # Random matrix
  Z <- matrix(rnorm(nrow(cov_matrix) * days_ahead * simulations), 
              nrow = nrow(cov_matrix))
  
  # Simulate returns
  sim_returns <- array(0, dim = c(days_ahead, length(weights), simulations))
  
  for (i in 1:simulations) {
    daily_sim <- mean_returns + L %*% rnorm(3)
    port_return <- sum(weights * daily_sim)
    sim_returns[, 1, i] <- cumsum(port_return)
  }
  
  return(sim_returns)
}

# Run simulation
mc_results <- simulate_portfolio_returns(
  daily_returns[, c("AAPL", "MSFT", "GOOGL")],
  days_ahead = 30,
  simulations = 10000
)

# Calculate percentiles
quantiles <- apply(mc_results[, 1, ], 1, quantile, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))

plot(quantiles[3, ], type = "l", main = "30-Day Portfolio Return Simulation")

Causal Impact Analysis

library(CausalImpact)

# Define pre and post periods
pre_period <- c(as.Date("2020-01-01"), as.Date("2022-12-31"))
post_period <- c(as.Date("2023-01-01"), as.Date("2024-12-31"))

# Create data for causal impact
impact_data <- zoo(
  daily_returns[, c("AAPL", "MSFT")],
  order.by = index(daily_returns)
)

# Run causal impact analysis
impact <- CausalImpact(impact_data, 
                       pre.period = pre_period,
                       post.period = post_period)

# Summarize results
summary(impact)
plot(impact)

Phase 9: Visualization Best Practices

Create Comprehensive Dashboard

# Arrange multiple plots
par(mfrow = c(2, 3))

# 1. Price chart
plot(prices_xts[, "AAPL"], main = "AAPL Price", ylab = "Price ($)")

# 2. Daily returns distribution
hist(daily_returns[, "AAPL"], main = "Daily Returns Distribution", xlab = "Return (%)")

# 3. Correlation heatmap
corrplot(cor(daily_returns[, c("AAPL", "MSFT", "GOOGL")]))

# 4. Cumulative returns
plot(cumprod(1 + daily_returns[, c("AAPL", "MSFT", "GOOGL")]), 
     main = "Cumulative Returns")

# 5. Rolling volatility
rolling_vol <- zoo::rollapply(daily_returns[, "AAPL"], 30, sd) * sqrt(252)
plot(rolling_vol, main = "30-Day Rolling Volatility", ylab = "Annualized Vol")

# 6. Beta scatter plot
plot(daily_returns[, "GSPC"], daily_returns[, "AAPL"],
     main = "Market vs AAPL Returns",
     xlab = "Market Return", ylab = "AAPL Return")
abline(lm(daily_returns[, "AAPL"] ~ daily_returns[, "GSPC"]), col = "red")

par(mfrow = c(1, 1))

Phase 10: Export Results to CSV

# Create summary report
summary_report <- data.frame(
  Stock = c("AAPL", "MSFT", "GOOGL"),
  Annual_Return = c(0.383, 0.412, 0.397),
  Annual_Volatility = c(0.281, 0.261, 0.280),
  Sharpe_Ratio = c(1.222, 1.426, 1.275),
  Beta = betas,
  Max_Drawdown = drawdowns,
  Correlation_with_MSFT = c(0.671, 1.000, 0.685),
  Correlation_with_GOOGL = c(0.538, 0.685, 1.000)
)

# Write to CSV
write.csv(summary_report, "stock_analysis_summary.csv", row.names = FALSE)

# Export correlation matrix
write.csv(cor_matrix, "correlation_matrix.csv")

# Export returns data
write.csv(as.data.frame(daily_returns), "daily_returns.csv")

Key Insights & Interpretation Guide

Correlation Levels

> 0.70: Strong positive correlation (move together)
0.30-0.70: Moderate correlation
< 0.30: Weak or no correlation

Beta Interpretation

β > 1: Stock is more volatile than market (higher risk, higher return potential)
β = 1: Stock moves with market
β < 1: Stock is less volatile than market (defensive)
β < 0: Stock moves opposite to market (hedging properties)

Volatility Levels

> 30%: High volatility (high risk)
20-30%: Moderate volatility
< 20%: Low volatility (defensive)

Sharpe Ratio

> 1.0: Excellent risk-adjusted return
0.5-1.0: Good risk-adjusted return
< 0.5: Poor risk-adjusted return

Drawdown Significance

> 50%: Severe losses (high risk)
20-50%: Significant losses
< 20%: Acceptable losses for growth stocks

References & Resources

quantmod Documentation: https://cran.r-project.org/package=quantmod
tidyquant Documentation: https://cran.r-project.org/package=tidyquant
PerformanceAnalytics: https://cran.r-project.org/package=PerformanceAnalytics
Efficient Frontier & Portfolio Theory: https://www.investopedia.com/
Technical Analysis: https://www.investopedia.com/terms/t/technical-analysis/
Risk Management: https://www.investopedia.com/terms/r/riskmanagement/