TD3_chenjieLI

Exercice 2

# Set the values for p and q
p <- 2; q <- 1

# Set the AR and MA coefficients
ar_coefs <- c(0.8, 0.5)
ma_coefs <- 0.3

# Set the length of the simulated process
n <- 1000

# Generate random series
random_series <- rnorm(n, mean = 0, sd = 1)

# Create an empty vector to store the simulated process
arma_process <- numeric(n)

# Simulate the ARMA process
for (i in 1:n) {
  if (i <= p) {
    arma_process[i] <- random_series[i]
  } else {
    arma_process[i] <- sum(ar_coefs * arma_process[(i - p):(i - 1)]) +
      ma_coefs * random_series[i - q]
  }
}

# Plot the simulated ARMA process
plot(arma_process, type = "l", main = "Simulated Explosive ARMA(2,1) Process")

acf(arma_process, lag.max = 20)

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Convert the arma_process to a time series object
arma_ts <- ts(arma_process)

# Fit an ARIMA model to the arma_ts
arma_model <- auto.arima(arma_ts)

# Make a forecast for 12 periods ahead
forecast_result <- forecast(arma_model, h = 12)

# Plot the forecasted values and interval confidences
plot(forecast_result, main = "Forecasted Values and Interval Confidences")

Exercise 3 - Trend and seasonal factor with true data

Step 1 : Trend and seasonal factor

1. Load the csv file and import the three time series. Check their status ? Convert them into a time series and generate a date series as well.

# Load the required libraries
library(readxl)

# Load the Excel file
excel_data <- read_excel("transportation_index.xlsx")

## New names:
## • `` -> `...1`

# Check the structure of the data
str(excel_data)

## tibble [516 × 4] (S3: tbl_df/tbl/data.frame)
##  $ ...1     : POSIXct[1:516], format: NA "2000-01-31" ...
##  $ USTRASERV: chr [1:516] NA "101.7" NA "101.1" ...
##  $ USTRAFREI: chr [1:516] NA "105.1" NA "103.1" ...
##  $ USTRAPASS: chr [1:516] NA "95.1" NA "97" ...

# handle with the NA values
excel_data <- na.omit(excel_data)

# Convert the data into time series
time_series_USTRASERV <- ts((as.numeric(excel_data$USTRASERV)), start = c(2000, 1), frequency = 12)
time_series_USTRAFREI <- ts((as.numeric(excel_data$USTRAFREI)), start = c(2000, 1), frequency = 12)
time_series_USTRAPASS <- ts((as.numeric(excel_data$USTRAPASS)), start = c(2000, 1), frequency = 12)

# Check the structure of the time series
str(time_series_USTRASERV)

##  Time-Series [1:258] from 2000 to 2021: 101.7 101.1 99.4 98.3 99.9 ...

str(time_series_USTRAFREI)

##  Time-Series [1:258] from 2000 to 2021: 105.1 103.1 99.7 97.9 98.8 ...

str(time_series_USTRAPASS)

##  Time-Series [1:258] from 2000 to 2021: 95.1 97 98.7 99.2 102.2 ...

# Generate a date series
date_series <- seq(as.Date("2000-01-31"), by = "month", length.out = nrow(excel_data))

# Check the structure of the date series
str(date_series)

##  Date[1:258], format: "2000-01-31" "2000-03-02" "2000-03-31" "2000-05-01" "2000-05-31" ...

2. Plot the three indicators within the same charts with dates index-axis(there are two possibilities).

#Combine the three time series into a data frame
plot_data <- data.frame(
  Date = date_series,
  USTRASERV = time_series_USTRASERV,
  USTRAFREI = time_series_USTRAFREI,
  USTRAPASS = time_series_USTRAPASS
)

# Plot using base R plot
plot(plot_data$Date, plot_data$USTRASERV, type = "l", col = "blue", xlab = "Date", ylab = "Indicator Values", ylim = range(c(plot_data$USTRASERV, plot_data$USTRAFREI, plot_data$USTRAPASS)))
lines(plot_data$Date, plot_data$USTRAFREI, col = "red")
lines(plot_data$Date, plot_data$USTRAPASS, col = "green")
legend("topright", legend = c("USTRASERV", "USTRAFREI", "USTRAPASS"), col = c("blue", "red", "green"), lty = 1)

# Add a title
title("Transportation Indicators Over Time")

3.Remind the definition of the autocorrelation function and the partial autocorrelation function.

Autocorrelation Function (ACF): - The ACF measures the correlation between a time series and its lagged values. - It calculates the correlation coefficient between the series at different lags (time points in the past). - ACF helps identify patterns or dependencies in the time series.

Partial Autocorrelation Function (PACF): - The PACF measures the correlation between a time series and its lagged values, controlling for the effects of intermediate lags. - It measures the direct relationship between two data points at a specific lag, excluding the influence of other lags in between. - PACF helps identify the direct linear relationship between a data point and its lagged values. - It provides insights into the specific lags that contribute significantly to the time series.

4.Plot the ACF function of the passenger index data. What are your conclusions ?

# Plot ACF
acf(time_series_USTRAPASS, type="correlation", main = "ACF Plot of Passenger Index Data")

5.Enumerate the existing tests of autocorrelations? Perform the Ljung-Box test. Remind the dif- ference between the Ljung-Box test and the Box-Pierce one? Is your time series ready to be modeled ?

# Perform Ljung-Box test
ljung_box_test <- Box.test(time_series_USTRAPASS, lag = 20, type = "Ljung-Box")

# Print the test results
print(ljung_box_test)

## 
##  Box-Ljung test
## 
## data:  time_series_USTRAPASS
## X-squared = 1131.4, df = 20, p-value < 2.2e-16

With the ACF we find that the process is not stationary so we could not modelise this time series

Step 2 : Seasonal data : what does it look like ?

1. By using the function “scan”, import both the data of births and sales from the following links

# Import the data of births using scan
births_url <- "http://robjhyndman.com/tsdldata/data/nybirths.dat"
births <- scan(births_url)

# Import the data of sales using scan
sales_url <- "http://robjhyndman.com/tsdldata/data/fancy.dat"
sales <- scan(sales_url)

# Print the imported data
head(births)

## [1] 26.663 23.598 26.931 24.740 25.806 24.364

head(sales)

## [1] 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74

births.ts <- ts(births, frequency=12, start=c(1946,1))
sales.ts <- ts(sales, frequency=12, start=c(1946,1))

2. Plot the two series in two separated charts.

# Plot the births data
plot(births.ts, type = "l", xlab = "Time", ylab = "Births", main = "Births Data")

# Plot the sales data
plot(sales.ts, type = "l", xlab = "Time", ylab = "Sales", main = "Sales Data")

3. Based on the charts observation, select the appropriate model to treat the seasonality in each case1. What kind of transformation can we introduce to move from an multiplicative seasonal scheme to an additive one ?

the approriate method is the logarithmic transfromation

# Apply logarithmic transformation to the births data
births_log <- log(births.ts)

# Plot the transformed births data
plot(births_log, type = "l", xlab = "Time", ylab = "Log(Births)", main = "Transformed log Births Data")

# Apply logarithmic transformation to the sales data
sales_log <- log(sales.ts)

# Plot the transformed sales data
plot(sales_log, type = "l", xlab = "Time", ylab = "Log(Sales)", main = "Transformed log Sales Data")

4. Plot the ACF function of the birth and the sales data using a lag of 36 months. What are your conclusions ? Is there a difference between the ACF function generated for transportation indices ?

# Plot ACF for births data
acf(births, lag.max = 36, main = "ACF Plot of Births Data")

# Plot ACF for sales data
acf(sales, lag.max = 36, main = "ACF Plot of Sales Data")

Exercise 4 : Regression analysis

1. Find a simple way to construct the required number of seasonal dummies

months <- rep(1:12, length.out = length(births))
# The factor() function is used to convert the months variable into a factor, which is then used to create the seasonal dummies using the model.matrix() function.
seasonal_dummies <- model.matrix(~factor(months))

2. Use these dummies within a linear model and estimate this latter (pay attention to the number of regressors). Comment your results

# - births_ts is the response variable
# - seasonal_dummies are the regressors
model <- lm(births ~ seasonal_dummies)
summary(model)

## 
## Call:
## lm(formula = births ~ seasonal_dummies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1096 -1.9188 -0.0052  1.8997  3.7131 
## 
## Coefficients: (1 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       24.3786     0.5744  42.442   <2e-16 ***
## seasonal_dummies(Intercept)            NA         NA      NA       NA    
## seasonal_dummiesfactor(months)2   -1.4931     0.8123  -1.838   0.0680 .  
## seasonal_dummiesfactor(months)3    1.5164     0.8123   1.867   0.0638 .  
## seasonal_dummiesfactor(months)4   -0.1501     0.8123  -0.185   0.8536    
## seasonal_dummiesfactor(months)5    0.9371     0.8123   1.154   0.2504    
## seasonal_dummiesfactor(months)6    0.4919     0.8123   0.606   0.5457    
## seasonal_dummiesfactor(months)7    2.0946     0.8123   2.578   0.0108 *  
## seasonal_dummiesfactor(months)8    1.9083     0.8123   2.349   0.0201 *  
## seasonal_dummiesfactor(months)9    1.4235     0.8123   1.752   0.0817 .  
## seasonal_dummiesfactor(months)10   1.4891     0.8123   1.833   0.0687 .  
## seasonal_dummiesfactor(months)11  -0.4001     0.8123  -0.493   0.6230    
## seasonal_dummiesfactor(months)12   0.3504     0.8123   0.431   0.6668    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.149 on 156 degrees of freedom
## Multiple R-squared:  0.1975, Adjusted R-squared:  0.1409 
## F-statistic:  3.49 on 11 and 156 DF,  p-value: 0.000222

3. Add a linear trend to the estimated specification in order to remove the trend

# The trend variable is created to represent a linear trend from 1 to the length of the births data.
trend <- 1:length(births)

model_with_trend <- lm(births ~ seasonal_dummies + trend)
summary(model_with_trend)

## 
## Call:
## lm(formula = births ~ seasonal_dummies + trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1819 -0.5458 -0.1180  0.4999  5.1607 
## 
## Coefficients: (1 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      21.465397   0.323663  66.320  < 2e-16 ***
## seasonal_dummies(Intercept)             NA         NA      NA       NA    
## seasonal_dummiesfactor(months)2  -1.529948   0.414028  -3.695 0.000304 ***
## seasonal_dummiesfactor(months)3   1.442604   0.414039   3.484 0.000642 ***
## seasonal_dummiesfactor(months)4  -0.260772   0.414058  -0.630 0.529755    
## seasonal_dummiesfactor(months)5   0.789637   0.414084   1.907 0.058377 .  
## seasonal_dummiesfactor(months)6   0.307546   0.414117   0.743 0.458815    
## seasonal_dummiesfactor(months)7   1.873312   0.414157   4.523  1.2e-05 ***
## seasonal_dummiesfactor(months)8   1.650150   0.414205   3.984 0.000104 ***
## seasonal_dummiesfactor(months)9   1.128488   0.414260   2.724 0.007188 ** 
## seasonal_dummiesfactor(months)10  1.157254   0.414323   2.793 0.005879 ** 
## seasonal_dummiesfactor(months)11 -0.768908   0.414393  -1.856 0.065423 .  
## seasonal_dummiesfactor(months)12 -0.055213   0.414470  -0.133 0.894197    
## trend                             0.036877   0.001747  21.108  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.095 on 155 degrees of freedom
## Multiple R-squared:  0.7929, Adjusted R-squared:  0.7768 
## F-statistic: 49.44 on 12 and 155 DF,  p-value: < 2.2e-16

4. Calculate the residuals of the model. Perform the following quality check : check the normality (using charts for instance) and test of autocorrelation. What is your conclusion ? Does the model succeed in removing both trend an seasonal pattern

# Calculate the residuals
residuals <- residuals(model_with_trend)

# Plot a histogram of residuals to check for normality
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")

# Perform a Q-Q plot to further assess normality
qqnorm(residuals)
qqline(residuals)

# Plot the ACF of residuals to check for autocorrelation
acf(residuals, main = "Autocorrelation Function of Residuals")

# Set the lag value for the Ljung-Box test
lag_value <- 12  # You can adjust this value based on the seasonality

# Perform Ljung-Box test
ljung_box_test <- Box.test(residuals, lag = lag_value, type = "Ljung-Box")

# Print the results of the Ljung-Box test
ljung_box_test

## 
##  Box-Ljung test
## 
## data:  residuals
## X-squared = 260.6, df = 12, p-value < 2.2e-16

This suggests that there is still significant autocorrelation in the residuals, meaning that the model has not fully captured the underlying structure of the data, including both trend and seasonal patterns. The autocorrelation present in the residuals implies that there are systematic patterns that the model has not accounted for, and the residuals are not behaving like white noise.

Exercise 5 : Moving average filters

1. Load the the US monthly natural gas consumption. Plot and describe the data.

library(TSstudio)

head(USgas)

##         Jan    Feb    Mar    Apr    May    Jun
## 2000 2510.5 2330.7 2050.6 1783.3 1632.9 1513.1

ts_info(USgas)

##  The USgas series is a ts object with 1 variable and 238 observations
##  Frequency: 12 
##  Start time: 2000 1 
##  End time: 2019 10

ts_plot(USgas)

2. The ts_ma function of the TStudio package is used to smooth the data using both one sided and two sided moving average. Use a one sided moving average of 11 periods

# Apply one-sided moving average smoothing with a window size of 11
one_side_ma <- ts_ma(ts.obj = USgas, n_left = 11,n=NULL,plot=TRUE)

3. Smooth the gas consumption data using a two sided moving average filter. You have to choose the right number for both sides of the filter (on the left and the right)

# Apply two-sided moving average smoothing with appropriate window sizes
two_side_ma <- ts_ma(ts.obj = USgas, n=6,plot=TRUE)

4. Plot on the same graphics the US consumption of gas, the one sided moving average and the two side moving average. What do you observe ?

USgas_ts <- ts(USgas,frequency=12)

one_side_ma_ts<-ts(one_side_ma$unbalanced_ma_12,frequency=12)
two_side_ma_ts<-ts(two_side_ma$ma_6,frequency=12)
plot(USgas_ts, type = "l", col = "black", xlab = "Time", ylab = "Gas Consumption", main = "Gas Consumption with Moving Averages")
lines(one_side_ma_ts, col = "red")
lines(two_side_ma_ts, col = "blue")

legend("topright", legend = c("Gas Consumption", "One-sided Moving Average", "Two-sided Moving Average"), col = c("black", "red", "blue"), lty = 1,cex=0.6)

5. Using both an ACF plot check the dynamic properties of the smoothed series. Are the results satisfying ?

# ACF plot of the original data
acf(USgas, main = "ACF of Original Data")

# ACF plot of the one-sided moving average smoothed data
acf(one_side_ma_ts, main = "ACF of One-sided Moving Average")

# ACF plot of the two-sided moving average smoothed data
acf(two_side_ma_ts, main = "ACF of Two-sided Moving Average")

Exercise 6 Using the differenciation to remove trends and seasonal patterns

1. Load the Mauna Loa Atmospheric CO2 Concentration available under the R datasets

# Load necessary libraries
library(ggplot2)

# Load the Mauna Loa Atmospheric CO2 Concentration dataset
co2<-datasets::co2

# Display the first few rows of the dataset
head(co2)

##         Jan    Feb    Mar    Apr    May    Jun
## 1959 315.42 316.31 316.50 317.56 318.13 318.00

2. Stationarize this time series using a first difference filter. Does it remove the seasonal pattern observed in the raw data.

By doing this, it remove the seasonal pattern

co2_diff <- diff(co2, differences = 1)
acf(co2_diff)

3. Fund the appropriate filter (i.e.the appropriate differentiation) to neutralize the seasonal factor.

co2_seasonal_diff <- diff(co2, lag=1, differences = 12)
acf(co2_seasonal_diff)

4. Check the filtered data and confirm whether the seasonal factor has been deleted.

To neutralize the seasonal factor, you can apply seasonal differencing by subtracting the observation from the previous observation from the same season. In this case, the Mauna Loa Atmospheric CO2 Concentration data is measured monthly, so the seasonal difference is the difference between an observation and the observation from the previous month.

# Calculate and plot the ACF of the time series
plot(co2_diff, main ="Mauna Loa Atmospheric CO2 Concentration")

plot(co2_seasonal_diff, main = "Mauna Loa Atmospheric CO2 Concentration (Seasonal Difference)")

Part 3 : Moving to real data

Exercise 7 : The French Consumer Price Index

1. Load the CPI data from your workspace. Convert and plot the time series. Can the producer prices index be modeled without any transformation ? Explain.

# Load the CPI data from the CSV file
cpi <- read.csv("us_cpi.csv", sep=";", stringsAsFactors = FALSE)

# Convert the date column to a proper date format
cpi$Date <- as.Date(cpi$date, format = "%d/%m/%Y")

# Convert the CPI column to numeric type and replace commas with decimal points
cpi$CPI <- as.numeric(gsub(",", ".", cpi$CPI))

# Convert the data into a time series object
cpi_ts <- ts(cpi$CPI, start = c(1971, 1), frequency = 12)

plot(cpi_ts)

2. Calculate the monthly growth rate of the CPI between January 1990 and December 2005. Can the CPI inflation be modeled now with an ARMA(p,q) ?

# Subset the CPI data for the specified period
cpi_subset <- cpi[cpi$Date >= as.Date("1990-01-01") & cpi$Date <= as.Date("2005-12-31"), ]

# Calculate the return
monthly_growth <- numeric(length = length(cpi_subset$CPI) - 1)
for (i in 2:length(cpi_subset$CPI)) {
  monthly_growth[i-1] <- ((cpi_subset$CPI[i] - cpi_subset$CPI[i-1]) / cpi_subset$CPI[i-1]) * 100
}

# Plot the returns
plot(monthly_growth, type = "l")

acf(monthly_growth)

After calculating the monthly growth rate, we can model the CPI inflation rate using an ARMA(p, q) model.

3. Identification : Use the ACF and the PACF to identify the most appropriate lags to model this inflation rate.

# Plot ACF and PACF
acf_result <- acf(monthly_growth, lag.max = 24, plot = FALSE)
pacf_result <- pacf(monthly_growth, lag.max = 24, plot = FALSE)

# Plot ACF
plot(acf_result, main = "ACF of Monthly CPI Growth Rate", xlab = "Lag", ylab = "Autocorrelation")

# Plot PACF
plot(pacf_result, main = "PACF of Monthly CPI Growth Rate", xlab = "Lag", ylab = "Partial Autocorrelation")

4. Using the arima.sim function calculate the likelihood values for different combination p and q. Calculate then the values of the Akaike and the Bayesian information criteria for all the couple p and q. Which combination of p and q is qualified according to the Akaike and the Bayesian criteria ?

# Define the range of p and q values to explore
p_values <- 0:3
q_values <- 0:3

# Create an empty matrix to store the likelihood values
likelihood_matrix <- matrix(NA, nrow = length(p_values), ncol = length(q_values))

# Loop over each combination of p and q
for (i in 1:length(p_values)) {
  for (j in 1:length(q_values)) {
    # Fit an ARMA model to the differenced time series
    arma_model <- arima(monthly_growth, order = c(p_values[i], 1, q_values[j]))
    
    # Calculate the log-likelihood value
    likelihood_value <- logLik(arma_model)
    
    # Store the log-likelihood value in the likelihood matrix
    likelihood_matrix[i, j] <- likelihood_value
  }
}

# Calculate the AIC and BIC values
AIC_matrix <- -2 * likelihood_matrix + 2 * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))
BIC_matrix <- -2 * likelihood_matrix + log(length(monthly_growth)) * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))

# Find the indices of the minimum AIC and BIC values
min_AIC_indices <- which(AIC_matrix == min(AIC_matrix), arr.ind = TRUE)
min_BIC_indices <- which(BIC_matrix == min(BIC_matrix), arr.ind = TRUE)

# Extract the corresponding p and q values
qualified_AIC_p <- p_values[min_AIC_indices[, 1]]
qualified_AIC_q <- q_values[min_AIC_indices[, 2]]
qualified_BIC_p <- p_values[min_BIC_indices[, 1]]
qualified_BIC_q <- q_values[min_BIC_indices[, 2]]

# Print the qualified combinations according to AIC
cat("Qualified combination according to AIC: p =", qualified_AIC_p, ", q =", qualified_AIC_q, "\n")

## Qualified combination according to AIC: p = 3 , q = 1

# Print the qualified combinations according to BIC
cat("Qualified combination according to BIC: p =", qualified_BIC_p, ", q =", qualified_BIC_q, "\n")

## Qualified combination according to BIC: p = 3 , q = 1

6. Estimation : Estimate the coefficients of the specification you selected in the previous question. To do so, use the package arima.sim. Calculate the yˆ and estimated residuals as well (εˆ).

library(forecast)

# Fit an ARIMA model to the differenced monthly growth rate
arima_model <- Arima(monthly_growth, order = c(3, 1, 1), include.mean = FALSE)

# Generate simulated data based on the ARIMA model
simulated_data <- simulate(arima_model, nsim = length(monthly_growth))

# Calculate the predicted values using the estimated ARIMA model
predicted_values <- fitted(arima_model)

# Calculate the estimated residuals
estimated_residuals <- residuals(arima_model)

# Print the estimated coefficients
cat("Estimated Coefficients:\n")

## Estimated Coefficients:

head(coef(arima_model))

##         ar1         ar2         ar3         ma1 
##  0.26583995 -0.30441125 -0.03864879 -0.91776518

# Print the predicted values
cat("Predicted Values:\n")

## Predicted Values:

head(predicted_values)

## Time Series:
## Start = 1 
## End = 6 
## Frequency = 1 
## [1] 0.3917647 0.4098791 0.4059460 0.2528447 0.3123343 0.5166998

# Print the estimated residuals
cat("Estimated Residuals:\n")

## Estimated Residuals:

head(estimated_residuals)

## Time Series:
## Start = 1 
## End = 6 
## Frequency = 1 
## [1]  0.0003921565  0.0588709173 -0.1726645316 -0.0976856850  0.3073403927
## [6] -0.0548060014

7. Plot the simulated and the dependent variables within the same chart. Check the significance of the estimated coefficients. What do you conclude? Its it possible to affirm that the estimated specification is the most relevant model to infer the behavior of the inflation dynamics ?

# Plot the simulated data and the dependent variable
plot(monthly_growth, type = "l", col = "blue", ylab = "Monthly Growth Rate")
lines(predicted_values, col = "red")
legend("topleft", legend = c("Monthly Growth Rate", "Predicted Data"), col = c("blue", "red"), lty = 1)

8. Quality check : Explain the type of quality check we have to run to derive a robust conclu- sion regarding the estimated specification. Run the required quality checks seen during the class. Comment your results

# ACF and PACF Plots
acf(residuals,lag.max = 60)

pacf(residuals, lag.max = 60)

# Ljung-Box Test
Box.test(residuals, lag = 20, type = "Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  residuals
## X-squared = 261.85, df = 20, p-value < 2.2e-16

# Normality Test
shapiro.test(residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.90277, p-value = 4.343e-09

qqnorm(residuals)
qqline(residuals)

# Stability Test
split_point <- floor(length(monthly_growth)/2)
first_half <- monthly_growth[1:split_point]
second_half <- monthly_growth[(split_point+1):length(monthly_growth)]

arima_model_1st_half <- arima(first_half, order = c(3, 1, 1))

## Warning in log(s2): NaNs produced

arima_model_2nd_half <- arima(second_half, order = c(3, 1, 1))

coef_comparison <- cbind(coef(arima_model_1st_half), coef(arima_model_2nd_half))
colnames(coef_comparison) <- c("1st Half", "2nd Half")
print(coef_comparison)

##         1st Half   2nd Half
## ar1  0.297214660  0.1947362
## ar2 -0.005112823 -0.4067466
## ar3  0.111002942 -0.1465214
## ma1 -0.935528686 -0.8859582

9.Forecasting : We want to forecast the CPI on a monthly basis, over the next three months. Use the required R function available in R to produce the values yt+1,yt+2 and yt+3.

# Fit an ARIMA model to the monthly growth rate
arima_model <- arima(monthly_growth, order = c(3, 1, 1))

# Forecast the CPI for the next three months
forecast_result <- forecast(arima_model, h = 3)

# Extract the forecasted values
forecast_values <- forecast_result$mean

# Print the forecasted values
print(forecast_values)

## Time Series:
## Start = 192 
## End = 194 
## Frequency = 1 
## [1] 0.4456701 0.4364974 0.2789802

10. What will happen if we redo the same analysis including data until 2021 ?

# Subset the CPI data for the specified period
cpi_subset <- cpi[cpi$Date >= as.Date("1990-01-01") & cpi$Date <= as.Date("2021-12-31"), ]

# Calculate the return
monthly_growth_2021 <- numeric(length = length(cpi_subset$CPI) - 1)
for (i in 2:length(cpi_subset$CPI)) {
  monthly_growth_2021[i-1] <- ((cpi_subset$CPI[i] - cpi_subset$CPI[i-1]) / cpi_subset$CPI[i-1]) * 100
}

# Plot the returns
plot(monthly_growth_2021, type = "l")

acf(monthly_growth_2021)

# Define the range of p and q values to explore
p_values <- 0:3
q_values <- 0:3

# Create an empty matrix to store the likelihood values
likelihood_matrix_2021 <- matrix(NA, nrow = length(p_values), ncol = length(q_values))

# Loop over each combination of p and q
for (i in 1:length(p_values)) {
  for (j in 1:length(q_values)) {
    # Fit an ARMA model to the differenced time series
    arma_model_2021 <- arima(monthly_growth_2021, order = c(p_values[i], 1, q_values[j]))
    
    # Calculate the log-likelihood value
    likelihood_value <- logLik(arma_model_2021)
    
    # Store the log-likelihood value in the likelihood matrix
    likelihood_matrix_2021[i, j] <- likelihood_value
  }
}

# Calculate the AIC and BIC values
AIC_matrix_2021 <- -2 * likelihood_matrix_2021 + 2 * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))
BIC_matrix_2021 <- -2 * likelihood_matrix_2021 + log(length(monthly_growth_2021)) * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))

# Find the indices of the minimum AIC and BIC values
min_AIC_indices_2021 <- which(AIC_matrix_2021 == min(AIC_matrix_2021), arr.ind = TRUE)
min_BIC_indices_2021 <- which(BIC_matrix_2021 == min(BIC_matrix_2021), arr.ind = TRUE)

# Extract the corresponding p and q values
qualified_AIC_p_2021 <- p_values[min_AIC_indices[, 1]]
qualified_AIC_q_2021 <- q_values[min_AIC_indices[, 2]]
qualified_BIC_p_2021 <- p_values[min_BIC_indices[, 1]]
qualified_BIC_q_2021 <- q_values[min_BIC_indices[, 2]]

# Print the qualified combinations according to AIC
cat("Qualified combination according to AIC: p =", qualified_AIC_p_2021, ", q =", qualified_AIC_q_2021, "\n")

## Qualified combination according to AIC: p = 3 , q = 1

# Print the qualified combinations according to BIC
cat("Qualified combination according to BIC: p =", qualified_BIC_p_2021, ", q =", qualified_BIC_q_2021, "\n")

## Qualified combination according to BIC: p = 3 , q = 1

Exercise 8 : The US stock market

# Read the CSV file
df <- read.csv("s&p500.csv")

# Convert the 'Date' column to Date type
df$Date <- as.Date(df$Date)
plot(df$Close, type = "l")

# Calculate the monthly returns
df$Returns <- c(NA, diff(log(df$Close)))

plot(df$Returns, type = "l", main = "monthly returns S&P500")

sp500_returns_ts <- na.omit(ts(df$Returns))
# Plot the ACF and PACF
acf(sp500_returns_ts, main = "ACF Plot of S&P500 Index", lag.max = 60)

pacf(sp500_returns_ts, main = "PACF Plot of S&P500 Index", lag.max = 60)

# Perform Ljung-Box test
ljung_box_test <- Box.test(sp500_returns_ts, lag = 20, type = "Ljung-Box")

# Print the test results
print(ljung_box_test)

## 
##  Box-Ljung test
## 
## data:  sp500_returns_ts
## X-squared = 19.989, df = 20, p-value = 0.4586

Monthly returns of S&P500

# Define the range of p and q values to explore
p_values <- 0:4
q_values <- 0:4

# Create an empty matrix to store the likelihood values
likelihood_matrix_2021 <- matrix(NA, nrow = length(p_values), ncol = length(q_values))

# Loop over each combination of p and q
for (i in 1:length(p_values)) {
  for (j in 1:length(q_values)) {
    # Fit an ARMA model to the differenced time series
    arma_model_2021 <- arima(sp500_returns_ts, order = c(p_values[i], 0, q_values[j]))
    
    # Calculate the log-likelihood value
    likelihood_value <- logLik(arma_model_2021)
    
    # Store the log-likelihood value in the likelihood matrix
    likelihood_matrix_2021[i, j] <- likelihood_value
  }
}

## Warning in arima(sp500_returns_ts, order = c(p_values[i], 0, q_values[j])):
## possible convergence problem: optim gave code = 1

## Warning in arima(sp500_returns_ts, order = c(p_values[i], 0, q_values[j])):
## possible convergence problem: optim gave code = 1

# Calculate the AIC and BIC values
AIC_matrix_2021 <- -2 * likelihood_matrix_2021 + 2 * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))
BIC_matrix_2021 <- -2 * likelihood_matrix_2021 + log(length(sp500_returns_ts)) * matrix(rep(p_values + 1, each = length(p_values)), nrow = length(p_values))

# Find the indices of the minimum AIC and BIC values
min_AIC_indices_2021 <- which(AIC_matrix_2021 == min(AIC_matrix_2021), arr.ind = TRUE)
min_BIC_indices_2021 <- which(BIC_matrix_2021 == min(BIC_matrix_2021), arr.ind = TRUE)

# Extract the corresponding p and q values
qualified_AIC_p_2021 <- p_values[min_AIC_indices[, 1]]
qualified_AIC_q_2021 <- q_values[min_AIC_indices[, 2]]
qualified_BIC_p_2021 <- p_values[min_BIC_indices[, 1]]
qualified_BIC_q_2021 <- q_values[min_BIC_indices[, 2]]

# Print the qualified combinations according to AIC
cat("Qualified combination according to AIC: p =", qualified_AIC_p_2021, ", q =", qualified_AIC_q_2021, "\n")

## Qualified combination according to AIC: p = 3 , q = 1

# Print the qualified combinations according to BIC
cat("Qualified combination according to BIC: p =", qualified_BIC_p_2021, ", q =", qualified_BIC_q_2021, "\n")

## Qualified combination according to BIC: p = 3 , q = 1

library(forecast)

# Fit an ARIMA model to the differenced monthly growth rate
arima_model <- Arima(sp500_returns_ts, order = c(3, 0, 1), include.mean = FALSE)

# Generate simulated data based on the ARIMA model
simulated_data <- simulate(arima_model, nsim = length(sp500_returns_ts))

# Calculate the predicted values using the estimated ARIMA model
predicted_values <- fitted(arima_model)

# Calculate the estimated residuals
estimated_residuals <- residuals(arima_model)

# Print the estimated coefficients
cat("Estimated Coefficients:\n")

## Estimated Coefficients:

head(coef(arima_model))

##         ar1         ar2         ar3         ma1 
##  0.42939463 -0.08037551 -0.03693234 -0.38258299

# Print the predicted values
cat("Predicted Values:\n")

## Predicted Values:

head(predicted_values)

## Time Series:
## Start = 2 
## End = 7 
## Frequency = 1 
## [1]  7.314067e-05  6.863118e-04 -5.487718e-04 -1.199745e-03 -1.384428e-03
## [6]  6.574403e-04

# Print the estimated residuals
cat("Estimated Residuals:\n")

## Estimated Residuals:

head(estimated_residuals)

## Time Series:
## Start = 2 
## End = 7 
## Frequency = 1 
## [1]  0.0134154657  0.0052109543  0.0002656875 -0.0143649520 -0.0002087052
## [6]  0.0298212030

# Plot the simulated data and the dependent variable
plot(sp500_returns_ts, type = "l", col = "blue", ylab = "Monthly Growth Rate")
lines(predicted_values, col = "red")
legend("topleft", legend = c("Monthly Growth Rate", "Predicted Data"), col = c("blue", "red"), lty = 1)