Research project

# INstall nescessary package
library(quantmod)
library(dplyr)
library(zoo)
library(caTools)

# Load data of JPM's price from Yahoo Finance
symbol <- "JPM"
start_date <- "2010-08-01"
end_date <- "2024-08-01"
getSymbols(symbol, src = "yahoo", from = start_date, to = end_date)

## [1] "JPM"

# Changing data to dataframe
stock_data <- data.frame(Date = index(JPM), coredata(JPM))
colnames(stock_data) <- c("Date", "Open", "High", "Low", "Close", "Volume", "Adjusted")

The binary variable that represents whether the JPM’s stock price experiences a significant fluctuate of more than 1% within 1 day.

\[\text{Target Variable} = \begin{cases} 1 \text{ if } |r_{JPM,t}| > 1\% \text{ within 1 day }\\ 0 \text{ otherwise} \end{cases}\]

# Create tagert variable
stock_data <- stock_data %>%
  mutate(Return = (Close - lag(Close, 1)) / lag(Close, 1),
         Event = ifelse(Return < -0.01 | Return > 0.01, 1, 0))

# View dataframe
head(stock_data)

##         Date  Open  High   Low Close   Volume Adjusted        Return Event
## 1 2010-08-02 40.98 41.70 40.73 41.64 35566700 28.57778            NA    NA
## 2 2010-08-03 41.50 41.65 40.87 41.08 29967900 28.19345 -0.0134485487     1
## 3 2010-08-04 41.22 41.40 40.86 41.29 23078100 28.33757  0.0051119541     0
## 4 2010-08-05 40.97 41.43 40.94 41.27 19830500 28.32385 -0.0004843899     0
## 5 2010-08-06 40.74 40.95 39.97 40.44 34906600 27.75422 -0.0201115053     1
## 6 2010-08-09 40.55 40.56 39.66 39.82 37790900 27.32870 -0.0153313292     1

# Check number of event JPM’s stock price experiences a significant fluctuate of more than 1% within 1 day
count_event_1 <- sum(stock_data$Event == 1, na.rm = TRUE)
count_event_1

## [1] 1497

# Loading macro FRED (Federal Reserve Economic Data)
getSymbols("DGS10", src = "FRED", from = start_date, to = end_date) #  Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity

## [1] "DGS10"

getSymbols("^GSPC", src = "yahoo", from = start_date, to = end_date) # S&P 500 Index

## [1] "GSPC"

daily_return <- dailyReturn(GSPC$GSPC.Adjusted, type = "log") # calculate return of SP500

# Combine macroeconomic features into dataframe
economic_data <- merge(DGS10, daily_return, all = FALSE)
economic_data <- data.frame(Date = index(economic_data), coredata(economic_data))

# View dataframe
head(economic_data)

##         Date DGS10 daily.returns
## 1 2010-08-02  2.99   0.000000000
## 2 2010-08-03  2.94  -0.004807895
## 3 2010-08-04  2.98   0.006032878
## 4 2010-08-05  2.94  -0.001269330
## 5 2010-08-06  2.86  -0.003710916
## 6 2010-08-09  2.86   0.005468087

## Changing name of columns in dataframe
economic_data <- economic_data[, c("Date", "DGS10", "daily.returns")]
colnames(economic_data)  <- c("Date","DGS10", "daily.returns")

# View dataframe
head(economic_data)

##         Date DGS10 daily.returns
## 1 2010-08-02  2.99   0.000000000
## 2 2010-08-03  2.94  -0.004807895
## 3 2010-08-04  2.98   0.006032878
## 4 2010-08-05  2.94  -0.001269330
## 5 2010-08-06  2.86  -0.003710916
## 6 2010-08-09  2.86   0.005468087

# # Combine stock dataframe and macroeconomic features
combined_data <- merge(stock_data, economic_data, by = "Date", all.x = TRUE)

# Xử lý dữ liệu thiếu
combined_data <- na.locf(combined_data)
combined_data <- na.omit(combined_data)

head(combined_data)

##         Date  Open  High   Low Close   Volume Adjusted        Return Event
## 2 2010-08-03 41.50 41.65 40.87 41.08 29967900 28.19345 -0.0134485487     1
## 3 2010-08-04 41.22 41.40 40.86 41.29 23078100 28.33757  0.0051119541     0
## 4 2010-08-05 40.97 41.43 40.94 41.27 19830500 28.32385 -0.0004843899     0
## 5 2010-08-06 40.74 40.95 39.97 40.44 34906600 27.75422 -0.0201115053     1
## 6 2010-08-09 40.55 40.56 39.66 39.82 37790900 27.32870 -0.0153313292     1
## 7 2010-08-10 39.67 39.73 39.16 39.17 34166500 26.88261 -0.0163234940     1
##   DGS10 daily.returns
## 2  2.94  -0.004807895
## 3  2.98   0.006032878
## 4  2.94  -0.001269330
## 5  2.86  -0.003710916
## 6  2.86   0.005468087
## 7  2.79  -0.005985282

combined_data <- combined_data[, c("Date","DGS10", "daily.returns" ,"Event")]

head(combined_data)

##         Date DGS10 daily.returns Event
## 2 2010-08-03  2.94  -0.004807895     1
## 3 2010-08-04  2.98   0.006032878     0
## 4 2010-08-05  2.94  -0.001269330     0
## 5 2010-08-06  2.86  -0.003710916     1
## 6 2010-08-09  2.86   0.005468087     1
## 7 2010-08-10  2.79  -0.005985282     1

# Scale data to range [0, 1] function
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

# Scaling DGS10, T10Y2Y and daily.returns columns
combined_data$DGS10 <- normalize(combined_data$DGS10)
combined_data$daily.returns <- normalize(combined_data$daily.returns)

# View dataframe
head(combined_data)

##         Date     DGS10 daily.returns Event
## 2 2010-08-03 0.5426009     0.5652291     1
## 3 2010-08-04 0.5515695     0.6151094     0
## 4 2010-08-05 0.5426009     0.5815107     0
## 5 2010-08-06 0.5246637     0.5702765     1
## 6 2010-08-09 0.5246637     0.6125107     1
## 7 2010-08-10 0.5089686     0.5598117     1

# Split data to training set and testing set with ratio=0.7
set.seed(123)
split <- sample.split(combined_data$Event, SplitRatio = 0.7)
train_data <- subset(combined_data, split == TRUE)
test_data <- subset(combined_data, split == FALSE)

# Runing Logistic Regression
logit_model <- glm(Event ~ DGS10 +  daily.returns,
                   data = train_data, family = binomial)

# Results
summary(logit_model)

## 
## Call:
## glm(formula = Event ~ DGS10 + daily.returns, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.4840     0.4892   0.990    0.322    
## DGS10          -1.2824     0.2096  -6.120 9.38e-10 ***
## daily.returns  -0.4172     0.8136  -0.513    0.608    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3362.9  on 2465  degrees of freedom
## Residual deviance: 3324.0  on 2463  degrees of freedom
## AIC: 3330
## 
## Number of Fisher Scoring iterations: 4

# Predict with training set
predicted_probs <- predict(logit_model, newdata = train_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

# Confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = train_data$Event)
print(confusion_matrix)

##          Actual
## Predicted    0    1
##         0 1346  913
##         1   72  135

# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))

## [1] "Accuracy: 0.600567721005677"

# Predict with testing set
predicted_probs <- predict(logit_model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

# Confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = test_data$Event)
print(confusion_matrix)

##          Actual
## Predicted   0   1
##         0 569 396
##         1  38  53

# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))

## [1] "Accuracy: 0.589015151515151"

train_data <- na.omit(train_data)

# Number of scenario and step forecast
n_scenarios <- 10000
n_steps <- 22  # Khoảng thời gian mô phỏng (22 ngày giao dịch = 1 tháng)

# Initialize the simulation result storage matrix
mc_results <- matrix(NA, nrow = n_scenarios, ncol = n_steps)

set.seed(123)

# Simulate scenarios based on logistic model
for (i in 1:n_scenarios) {
  simulated_economic_data <- data.frame(
    DGS10  = rnorm(n_steps, mean = mean(combined_data$DGS10), sd = sd(combined_data$DGS10)),
    daily.returns = rnorm(n_steps, mean = mean(combined_data$daily.returns), sd = sd(combined_data$daily.returns)))
  
  # Event forecasting for each scenario
  simulated_probs <- predict(logit_model, newdata = simulated_economic_data, type = "response")
  mc_results[i, ] <- simulated_probs
}

# Tính toán xác suất trung bình của sự kiện trong mỗi ngày
average_probs <- apply(mc_results, 2, mean)

# Calculate the average probability of an event per day
plot(average_probs, type = "l", col = "blue", lwd = 2, xlab = "Days", ylab = "Probability", 
     main = "Average Probability of Significant Price Drop over Time")

# Simulation results analysis
print("Average probability of significant price drop over 22 days:")

## [1] "Average probability of significant price drop over 22 days:"

print(average_probs)

##  [1] 0.4259960 0.4256583 0.4251853 0.4255004 0.4249340 0.4245675 0.4254334
##  [8] 0.4250063 0.4264270 0.4264576 0.4262455 0.4249419 0.4264182 0.4248062
## [15] 0.4254879 0.4261870 0.4251955 0.4255195 0.4248647 0.4251158 0.4262779
## [22] 0.4249302

Research project - Group 1

PhuongTrinh

2024-09-01