# First, I load the necessary libraries for time series analysis and plotting.
library(quantmod)   # For financial data
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)   # For ARIMA and time series modeling
library(ggplot2)    # For plotting

# i. Plot the daily closing prices for IBM stock and the ACF and PACF.

# I fetch daily IBM stock prices from Yahoo Finance starting from January 1, 2020.
getSymbols("IBM", from = "2020-01-01", to = Sys.Date())
## [1] "IBM"
# I extract the adjusted closing prices since they account for dividends and splits.
ibm_prices <- Ad(IBM)

# I plot the adjusted daily closing prices to observe any trends or seasonality.
price_plot <- ggplot(
  data.frame(Date = index(ibm_prices), Price = as.numeric(ibm_prices)),
  aes(x = Date, y = Price)
) +
  geom_line() +
  labs(
    title = "Daily Adjusted Closing Prices of IBM Stock",
    y = "Price",
    x = "Date"
  ) +
  theme_minimal()

print(price_plot)  # Display the price plot

# I calculate log returns to stabilize variance and check for stationarity.
ibm_returns <- diff(log(ibm_prices))[-1]

# I check if there are any missing values in the return series.
sum(is.na(ibm_returns))  # Count of NA values
## [1] 0
# I remove any missing values from the returns.
ibm_returns_cleaned <- na.omit(ibm_returns)

# I plot the ACF of the cleaned returns to examine autocorrelation.
acf(ibm_returns_cleaned, main = "ACF of IBM Daily Returns (Cleaned)")

# I also plot the PACF to understand direct lags affecting the series.
pacf(ibm_returns_cleaned, main = "PACF of IBM Daily Returns (Cleaned)")

# --- Explanation (ii) ---
# - The price plot shows a clear trend over time, indicating non-stationarity.
# - ACF of returns drops quickly, indicating that returns are more stationary.
# - PACF confirms that only a few lags have strong influence, which is typical of stationary data.

# iii. Apply differencing on price series to achieve stationarity.

# I apply first-order differencing to the price series to remove the trend.
ibm_prices_diff <- diff(ibm_prices)

# I check for and remove any missing values in the differenced series.
sum(is.na(ibm_prices_diff))
## [1] 1
ibm_prices_diff_cleaned <- na.omit(ibm_prices_diff)

# I plot the differenced prices to visually confirm if stationarity is achieved.
diff_price_plot <- ggplot(
  data.frame(Date = index(ibm_prices_diff_cleaned), Diff_Price = as.numeric(ibm_prices_diff_cleaned)),
  aes(x = Date, y = Diff_Price)
) +
  geom_line() +
  labs(
    title = "First-Order Differenced Adjusted Closing Prices of IBM Stock",
    y = "Differenced Price",
    x = "Date"
  ) +
  theme_minimal()

print(diff_price_plot)  # Show the differenced price plot

# I plot ACF and PACF of the differenced series to verify if it’s now stationary.
acf(ibm_prices_diff_cleaned, main = "ACF of Differenced IBM Prices")

pacf(ibm_prices_diff_cleaned, main = "PACF of Differenced IBM Prices")

# Apply ARIMA model to capture the patterns in the cleaned returns data.

# I use auto.arima to automatically determine the best ARIMA model for the returns.
arima_model_returns_cleaned <- auto.arima(ibm_returns_cleaned)
summary(arima_model_returns_cleaned)  # Display the ARIMA model summary
## Series: ibm_returns_cleaned 
## ARIMA(3,0,3) with non-zero mean 
## 
## Coefficients:
##           ar1      ar2     ar3     ma1     ma2      ma3   mean
##       -1.6220  -0.6881  0.0926  1.5894  0.6795  -0.0444  7e-04
## s.e.   0.6938   1.2439  0.6449  0.6997  1.2236   0.6094  5e-04
## 
## sigma^2 = 0.0002932:  log likelihood = 3578.72
## AIC=-7141.43   AICc=-7141.32   BIC=-7099.77
## 
## Training set error measures:
##                       ME       RMSE        MAE  MPE MAPE      MASE         ACF1
## Training set 5.14115e-06 0.01707879 0.01133028 -Inf  Inf 0.6829957 0.0004746699
# If I want to fit ARIMA on the differenced prices instead, I could uncomment the code below:
# arima_model_prices_diff_cleaned <- auto.arima(ibm_prices_diff_cleaned)
# summary(arima_model_prices_diff_cleaned)