# Load data of JPM's price from Yahoo Finance
symbol <- "JPM"
start_date <- "2010-08-01"
end_date <- "2024-08-01"
getSymbols(symbol, src = "yahoo", from = start_date, to = end_date)## [1] "JPM"
# Changing data to dataframe
stock_data <- data.frame(Date = index(JPM), coredata(JPM))
colnames(stock_data) <- c("Date", "Open", "High", "Low", "Close", "Volume", "Adjusted")The binary variable that represents whether the JPM’s stock price experiences a significant fluctuate of more than 1% within 1 day.
\[\text{Target Variable} = \begin{cases} 1 \text{ if } |r_{JPM,t}| > 1\% \text{ within 1 day }\\ 0 \text{ otherwise} \end{cases}\]
# Create tagert variable
stock_data <- stock_data %>%
mutate(Return = (Close - lag(Close, 1)) / lag(Close, 1),
Event = ifelse(Return < -0.01 | Return > 0.01, 1, 0))## Date Open High Low Close Volume Adjusted Return Event
## 1 2010-08-02 40.98 41.70 40.73 41.64 35566700 28.57778 NA NA
## 2 2010-08-03 41.50 41.65 40.87 41.08 29967900 28.19345 -0.0134485487 1
## 3 2010-08-04 41.22 41.40 40.86 41.29 23078100 28.33757 0.0051119541 0
## 4 2010-08-05 40.97 41.43 40.94 41.27 19830500 28.32385 -0.0004843899 0
## 5 2010-08-06 40.74 40.95 39.97 40.44 34906600 27.75422 -0.0201115053 1
## 6 2010-08-09 40.55 40.56 39.66 39.82 37790900 27.32870 -0.0153313292 1
# Check number of event JPM’s stock price experiences a significant fluctuate of more than 1% within 1 day
count_event_1 <- sum(stock_data$Event == 1, na.rm = TRUE)
count_event_1## [1] 1497
# Loading macro FRED (Federal Reserve Economic Data)
getSymbols("DGS10", src = "FRED", from = start_date, to = end_date) # Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity## [1] "DGS10"
## [1] "GSPC"
daily_return <- dailyReturn(GSPC$GSPC.Adjusted, type = "log") # calculate return of SP500
# Combine macroeconomic features into dataframe
economic_data <- merge(DGS10, daily_return, all = FALSE)
economic_data <- data.frame(Date = index(economic_data), coredata(economic_data))## Date DGS10 daily.returns
## 1 2010-08-02 2.99 0.000000000
## 2 2010-08-03 2.94 -0.004807895
## 3 2010-08-04 2.98 0.006032878
## 4 2010-08-05 2.94 -0.001269330
## 5 2010-08-06 2.86 -0.003710916
## 6 2010-08-09 2.86 0.005468087
## Changing name of columns in dataframe
economic_data <- economic_data[, c("Date", "DGS10", "daily.returns")]
colnames(economic_data) <- c("Date","DGS10", "daily.returns")## Date DGS10 daily.returns
## 1 2010-08-02 2.99 0.000000000
## 2 2010-08-03 2.94 -0.004807895
## 3 2010-08-04 2.98 0.006032878
## 4 2010-08-05 2.94 -0.001269330
## 5 2010-08-06 2.86 -0.003710916
## 6 2010-08-09 2.86 0.005468087
# # Combine stock dataframe and macroeconomic features
combined_data <- merge(stock_data, economic_data, by = "Date", all.x = TRUE)
# Xử lý dữ liệu thiếu
combined_data <- na.locf(combined_data)
combined_data <- na.omit(combined_data)## Date Open High Low Close Volume Adjusted Return Event
## 2 2010-08-03 41.50 41.65 40.87 41.08 29967900 28.19345 -0.0134485487 1
## 3 2010-08-04 41.22 41.40 40.86 41.29 23078100 28.33757 0.0051119541 0
## 4 2010-08-05 40.97 41.43 40.94 41.27 19830500 28.32385 -0.0004843899 0
## 5 2010-08-06 40.74 40.95 39.97 40.44 34906600 27.75422 -0.0201115053 1
## 6 2010-08-09 40.55 40.56 39.66 39.82 37790900 27.32870 -0.0153313292 1
## 7 2010-08-10 39.67 39.73 39.16 39.17 34166500 26.88261 -0.0163234940 1
## DGS10 daily.returns
## 2 2.94 -0.004807895
## 3 2.98 0.006032878
## 4 2.94 -0.001269330
## 5 2.86 -0.003710916
## 6 2.86 0.005468087
## 7 2.79 -0.005985282
## Date DGS10 daily.returns Event
## 2 2010-08-03 2.94 -0.004807895 1
## 3 2010-08-04 2.98 0.006032878 0
## 4 2010-08-05 2.94 -0.001269330 0
## 5 2010-08-06 2.86 -0.003710916 1
## 6 2010-08-09 2.86 0.005468087 1
## 7 2010-08-10 2.79 -0.005985282 1
# Scale data to range [0, 1] function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
# Scaling DGS10, T10Y2Y and daily.returns columns
combined_data$DGS10 <- normalize(combined_data$DGS10)
combined_data$daily.returns <- normalize(combined_data$daily.returns)## Date DGS10 daily.returns Event
## 2 2010-08-03 0.5426009 0.5652291 1
## 3 2010-08-04 0.5515695 0.6151094 0
## 4 2010-08-05 0.5426009 0.5815107 0
## 5 2010-08-06 0.5246637 0.5702765 1
## 6 2010-08-09 0.5246637 0.6125107 1
## 7 2010-08-10 0.5089686 0.5598117 1
# Split data to training set and testing set with ratio=0.7
set.seed(123)
split <- sample.split(combined_data$Event, SplitRatio = 0.7)
train_data <- subset(combined_data, split == TRUE)
test_data <- subset(combined_data, split == FALSE)# Runing Logistic Regression
logit_model <- glm(Event ~ DGS10 + daily.returns,
data = train_data, family = binomial)
# Results
summary(logit_model)##
## Call:
## glm(formula = Event ~ DGS10 + daily.returns, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4840 0.4892 0.990 0.322
## DGS10 -1.2824 0.2096 -6.120 9.38e-10 ***
## daily.returns -0.4172 0.8136 -0.513 0.608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3362.9 on 2465 degrees of freedom
## Residual deviance: 3324.0 on 2463 degrees of freedom
## AIC: 3330
##
## Number of Fisher Scoring iterations: 4
# Predict with training set
predicted_probs <- predict(logit_model, newdata = train_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
# Confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = train_data$Event)
print(confusion_matrix)## Actual
## Predicted 0 1
## 0 1346 913
## 1 72 135
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))## [1] "Accuracy: 0.600567721005677"
# Predict with testing set
predicted_probs <- predict(logit_model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
# Confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = test_data$Event)
print(confusion_matrix)## Actual
## Predicted 0 1
## 0 569 396
## 1 38 53
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))## [1] "Accuracy: 0.589015151515151"
# Number of scenario and step forecast
n_scenarios <- 10000
n_steps <- 22 # Khoảng thời gian mô phỏng (22 ngày giao dịch = 1 tháng)
# Initialize the simulation result storage matrix
mc_results <- matrix(NA, nrow = n_scenarios, ncol = n_steps)
set.seed(123)
# Simulate scenarios based on logistic model
for (i in 1:n_scenarios) {
simulated_economic_data <- data.frame(
DGS10 = rnorm(n_steps, mean = mean(combined_data$DGS10), sd = sd(combined_data$DGS10)),
daily.returns = rnorm(n_steps, mean = mean(combined_data$daily.returns), sd = sd(combined_data$daily.returns)))
# Event forecasting for each scenario
simulated_probs <- predict(logit_model, newdata = simulated_economic_data, type = "response")
mc_results[i, ] <- simulated_probs
}
# Tính toán xác suất trung bình của sự kiện trong mỗi ngày
average_probs <- apply(mc_results, 2, mean)
# Calculate the average probability of an event per day
plot(average_probs, type = "l", col = "blue", lwd = 2, xlab = "Days", ylab = "Probability",
main = "Average Probability of Significant Price Drop over Time")## [1] "Average probability of significant price drop over 22 days:"
## [1] 0.4259960 0.4256583 0.4251853 0.4255004 0.4249340 0.4245675 0.4254334
## [8] 0.4250063 0.4264270 0.4264576 0.4262455 0.4249419 0.4264182 0.4248062
## [15] 0.4254879 0.4261870 0.4251955 0.4255195 0.4248647 0.4251158 0.4262779
## [22] 0.4249302