# Load necessary libraries
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Step 1: Data Collection
symbol <- "AAPL"  # Apple is the example ticker
# Download the Yahoo Finance series for the ticker; the result is stored in an
# object named after the ticker, which is looked up with get() below.
getSymbols(
  Symbols = symbol,
  src = "yahoo",
  from = "2018-01-01",
  to = "2023-12-31"
)
## [1] "AAPL"
# Extract Closing Prices
# Cl() selects the Close column (quantmod's Ad() is the accessor for the
# Adjusted column); observations with missing values are dropped.
prices <- na.omit(Cl(get(symbol)))

# Step 2: Prepare Data with Multiple Lagged Features
# Start from one row per observation date, then add Lag1..Lag3, where LagK
# holds the price from K rows earlier (the first K entries are NA-padded).
data <- data.frame(Date = index(prices), Price = as.numeric(prices))
data[paste0("Lag", seq_len(3))] <- lapply(
  seq_len(3),
  function(k) c(rep(NA, k), head(data$Price, -k))
)
# The leading rows have no complete lag history, so drop them
data <- na.omit(data)

# Step 3: Split Data into Training and Testing Sets
# The split is positional, not random: the first 80% of rows are used for
# training and the remaining rows for testing.
set.seed(123)  # Kept for reproducibility; the split itself draws no random numbers

# Guard against degenerate input: with fewer than 2 rows, `1:train_size`
# would become `1:0` (i.e. c(1, 0)) and the same row would land in both sets.
if (nrow(data) < 2) {
  stop("Need at least 2 rows to form a train/test split", call. = FALSE)
}

train_size <- floor(0.8 * nrow(data))  # 80% for training
train_data <- data[seq_len(train_size), ]             # First 80% of rows
test_data <- data[seq_len(nrow(data)) > train_size, ]  # Remaining 20% of rows

# Step 4: Build the Multiple Regression Model
# Linear model of Price on its three lagged values, fitted on the training
# rows only.
model <- lm(formula = Price ~ Lag1 + Lag2 + Lag3, data = train_data)

# Print Model Summary (coefficients, residuals and fit statistics)
summary(object = model)
## 
## Call:
## lm(formula = Price ~ Lag1 + Lag2 + Lag3, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6632  -0.8162  -0.0270   0.8664  11.0606 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.19819    0.14255   1.390    0.165    
## Lag1         0.93376    0.02890  32.311   <2e-16 ***
## Lag2         0.04823    0.03962   1.217    0.224    
## Lag3         0.01688    0.02895   0.583    0.560    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.144 on 1200 degrees of freedom
## Multiple R-squared:  0.9979, Adjusted R-squared:  0.9979 
## F-statistic: 1.865e+05 on 3 and 1200 DF,  p-value: < 2.2e-16
# Step 5: Make Predictions
# Apply the fitted model to the held-out test rows
predictions <- predict(object = model, newdata = test_data)

# Step 6: Evaluate the Model
# Compare predictions with the observed test prices via Mean Squared Error
actuals <- test_data[["Price"]]
mse <- mean((actuals - predictions)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 6.139336
# Step 7: Visualize Results
# Actual test-set prices are drawn in blue and the model's predictions in red
# on the same axes.
# ylim spans both series: plot() would otherwise size the y-axis from
# `actuals` alone and clip any predictions falling outside that range.
plot(test_data$Date, actuals, type = "l", col = "blue", lwd = 2,
     ylim = range(actuals, predictions, na.rm = TRUE),
     main = "Actual vs Predicted Prices", xlab = "Date", ylab = "Price")
lines(test_data$Date, predictions, col = "red", lwd = 2)
legend("topright", legend = c("Actual", "Predicted"),
       col = c("blue", "red"), lty = 1, lwd = 2)