Data

CO2_data <- read.csv("C:/Users/nghimire/OneDrive - University of Texas at Tyler/Desktop/Data analysis-Statistical Modeling/time_series_report_data_nops/CO2.csv", skip = 57, col.names = c("Year", "Month", "Date_Excel", "Date", "CO2_ppm", "s_adj_ppm", "fit_ppm", "s_adj_fit_ppm", "CO2_filled_ppm", "s_adj_filled_ppm"))
head(CO2_data)
  Year Month Date_Excel     Date CO2_ppm s_adj_ppm fit_ppm s_adj_fit_ppm CO2_filled_ppm
1 1958     2      21231 1958.126  -99.99    -99.99  -99.99        -99.99         -99.99
2 1958     3      21259 1958.203  315.70    314.43  316.19        314.90         315.70
3 1958     4      21290 1958.288  317.45    315.16  317.30        314.98         317.45
4 1958     5      21320 1958.370  317.51    314.71  317.86        315.06         317.51
5 1958     6      21351 1958.455  -99.99    -99.99  317.24        315.14         317.24
6 1958     7      21381 1958.537  315.86    315.19  315.86        315.21         315.86
  s_adj_filled_ppm
1           -99.99
2           314.43
3           315.16
4           314.71
5           315.14
6           315.19

Getting Rid of Missing Values: Casewise Deletion

# Replace all instances of -99.99 with NA
CO2_data[CO2_data == -99.99] <- NA

# Remove rows with missing values
CO2_data_clean <- na.omit(CO2_data)

# Print the number of rows and columns in the cleaned data frame
cat("Number of rows in original data frame:", nrow(CO2_data), "\n")
Number of rows in original data frame: 743 
cat("Number of columns in original data frame:", ncol(CO2_data), "\n")
Number of columns in original data frame: 10 
cat("Number of rows in cleaned data frame:", nrow(CO2_data_clean), "\n")
Number of rows in cleaned data frame: 733 
cat("Number of columns in cleaned data frame:", ncol(CO2_data_clean), "\n")
Number of columns in cleaned data frame: 10 
head(CO2_data_clean)
  Year Month Date_Excel     Date CO2_ppm s_adj_ppm fit_ppm s_adj_fit_ppm CO2_filled_ppm
2 1958     3      21259 1958.203  315.70    314.43  316.19        314.90         315.70
3 1958     4      21290 1958.288  317.45    315.16  317.30        314.98         317.45
4 1958     5      21320 1958.370  317.51    314.71  317.86        315.06         317.51
6 1958     7      21381 1958.537  315.86    315.19  315.86        315.21         315.86
7 1958     8      21412 1958.622  314.93    316.19  313.99        315.29         314.93
8 1958     9      21443 1958.707  313.21    316.08  312.46        315.35         313.21
  s_adj_filled_ppm
2           314.43
3           315.16
4           314.71
6           315.19
7           316.19
8           316.08

Converting Year and Month columns to date format

# Convert year and month columns to date format
CO2_data_clean$Date <- as.Date(paste(CO2_data_clean$Year, CO2_data_clean$Month, "01", sep = "-"))

# Create time series plot
ggplot(CO2_data_clean, aes(x = Date, y = CO2_ppm)) +
  geom_line() +
  labs(x = "Year", y = "CO2 Concentration (ppm)", title = "Mauna Loa CO2 Concentration (1958-2021)")

Fitting the Time Series Regression Model

# Fit time series regression model
ts_model <- auto.arima(CO2_data_clean$CO2_ppm, seasonal = TRUE)

# Print model summary
summary(ts_model)
Series: CO2_data_clean$CO2_ppm 
ARIMA(2,1,1) with drift 

Coefficients:
         ar1      ar2      ma1   drift
      1.5320  -0.8392  -0.9036  0.1321
s.e.  0.0203   0.0201   0.0115  0.0080

sigma^2 = 0.462:  log likelihood = -755.39
AIC=1520.77   AICc=1520.85   BIC=1543.75

Training set error measures:
                        ME      RMSE       MAE          MPE      MAPE      MASE
Training set -0.0002162403 0.6773845 0.5433793 -0.002975161 0.1534257 0.4859663
                    ACF1
Training set -0.02561056

Plotting the Residual

# Plot residuals
ggplot(data.frame(residuals = ts_model$residuals, Date = CO2_data_clean$Date), aes(x = Date, y = residuals)) +
  geom_line() +
  labs(x = "Year", y = "Residuals", title = "Residuals from Time Series Model")

Periodic Singnal for the Timeseries Model

# Create time series object
ts_data <- ts(CO2_data_clean$CO2_ppm, start = c(min(CO2_data_clean$Year)), end = c(max(CO2_data_clean$Year)), frequency = 12)

# Fit the auto.arima() model
ts_model <- auto.arima(ts_data, seasonal = TRUE)
summary(ts_model)
Series: ts_data 
ARIMA(3,1,3)(2,1,1)[12] 

Coefficients:
         ar1      ar2     ar3      ma1     ma2      ma3    sar1    sar2     sma1
      2.5377  -2.3703  0.8033  -2.6124  2.3940  -0.7750  0.1345  0.1055  -0.7728
s.e.  0.0392   0.0648  0.0339   0.0408  0.0746   0.0375  0.0737  0.0565   0.0605

sigma^2 = 0.1794:  log likelihood = -398.8
AIC=817.61   AICc=817.92   BIC=863.4

Training set error measures:
                     ME      RMSE       MAE         MPE       MAPE      MASE      ACF1
Training set 0.02767412 0.4171741 0.3090387 0.007335288 0.08808438 0.1903605 0.0186576
# Create seasonal plot
seasonplot(ts_data,
  year.labels = TRUE, col = "blue",
  xlab = "Year", ylab = "CO2_ppm", main = "Seasonal plot of CO2_ppm data"
)

Fitting the Linear Model

options(scipen = 999)
# Fit a linear model
lm_fit <- lm(CO2_ppm ~ Year, data = CO2_data_clean)
summary(lm_fit)

Call:
lm(formula = CO2_ppm ~ Year, data = CO2_data_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.8992 -3.0813 -0.6952  2.6047 12.4730 

Coefficients:
                Estimate   Std. Error t value            Pr(>|t|)    
(Intercept) -2763.420601    17.293281  -159.8 <0.0000000000000002 ***
Year            1.567993     0.008696   180.3 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.161 on 731 degrees of freedom
Multiple R-squared:  0.978, Adjusted R-squared:  0.978 
F-statistic: 3.251e+04 on 1 and 731 DF,  p-value: < 0.00000000000000022
# Plot the data and the fit
plot(CO2_ppm ~ Year, data = CO2_data_clean, main = "Mauna Loa CO2 Concentration")
abline(lm_fit, col = "red")

Plotting the Residual Error

# Plot the residuals
plot(lm_fit$fitted.values, lm_fit$residuals,
  main = "Residual plot for Mauna Loa CO2 Concentration",
  xlab = "Fitted values",
  ylab = "Residuals"
)

# Add a horizontal line at y=0
abline(h = 0, col = "red")

Calculating RMSE and MAPE for Linear Model

# Load the Metrics package
library(Metrics)

# Calculate RMSE
rmse <- rmse(lm_fit$fitted.values, CO2_data_clean$CO2_ppm)
cat("RMSE:", rmse, "\n")
RMSE: 4.155777 
# Calculate MAPE
mape <- mape(lm_fit$fitted.values, CO2_data_clean$CO2_ppm)
cat("MAPE:", mape, "%\n")
MAPE: 0.00945464 %

Splitting the Data in 80:20 Training and Test Datasets

# Split the data into training and test sets
library(caret)
set.seed(123) # Set seed for reproducibility
index <- createDataPartition(CO2_data_clean$CO2_ppm, p = 0.8, list = FALSE)
train <- CO2_data_clean[index, ]
test <- CO2_data_clean[-index, ]

# Fit a linear model on the training data
# lm_fit <- lm(CO2_ppm ~ Date, data = train)

# Calculate RMSE
# rmse <- rmse(lm_fit$fitted.values, test$CO2_ppm)
# cat("RMSE:", rmse, "\n")

# Calculate MAPE
# mape <- mape(lm_fit$fitted.values, test$CO2_ppm)
# cat("MAPE:", mape, "%\n")

Fitting a Quadratic Model

# fit a quadratic model on the training data
model_quad <- lm(CO2_ppm ~ poly(Year, 2), data = train)

# summary of the quadratic model
summary(model_quad)

Call:
lm(formula = CO2_ppm ~ poly(Year, 2), data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.5559 -1.4544 -0.0298  1.5633  4.8790 

Coefficients:
                Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    354.66924    0.08271 4287.92 <0.0000000000000002 ***
poly(Year, 2)1 676.05049    2.00740  336.78 <0.0000000000000002 ***
poly(Year, 2)2  90.02177    2.00740   44.84 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.007 on 586 degrees of freedom
Multiple R-squared:  0.9949,    Adjusted R-squared:  0.9949 
F-statistic: 5.772e+04 on 2 and 586 DF,  p-value: < 0.00000000000000022

Plotting the Residual Error for the Quadratic Model

plot(predict(model_quad), residuals(model_quad),
  xlab = "Fitted values", ylab = "Residuals",
  main = "Residual plot for Quadratic Model"
)

Validating in Test Data

# Predict the CO2 concentration in the test dataset
test$CO2_pred <- predict(model_quad, newdata = test)

# Calculate the residual errors
test$residuals <- test$CO2_ppm - test$CO2_pred

# Plot the residual errors
plot(test$Year, test$residuals, type = "l", xlab = "Year", ylab = "Residual Error", main = "Residual Plot for Quadratic Model")

Calculating RMSE and MAPE for the Quadratic Model

# Obtain predicted values for the test set
pred_quad <- predict(model_quad, newdata = test)

# Calculate RMSE
rmse_quad <- sqrt(mean((test$CO2_ppm - pred_quad)^2))
cat("RMSE for quadratic model:", rmse_quad, "\n")
RMSE for quadratic model: 1.909819 
# Calculate MAPE
mape_quad <- mean(abs((test$CO2_ppm - pred_quad) / test$CO2_ppm)) * 100
cat("MAPE for quadratic model:", mape_quad, "%")
MAPE for quadratic model: 0.4547349 %
## Checking Values
model_quad <- lm(CO2_ppm ~ poly(Year, 2), data = train)
beta_hat <- coef(model_quad)
beta_hat
   (Intercept) poly(Year, 2)1 poly(Year, 2)2 
     354.66924      676.05049       90.02177 

Fitting a Cubic Model

model_cubic <- lm(CO2_ppm ~ poly(Year, 3), data = train)
summary(model_cubic)

Call:
lm(formula = CO2_ppm ~ poly(Year, 3), data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.4144 -1.4244 -0.0546  1.5400  4.9257 

Coefficients:
                Estimate Std. Error  t value            Pr(>|t|)    
(Intercept)    354.66924    0.08256 4295.928 <0.0000000000000002 ***
poly(Year, 3)1 676.05049    2.00366  337.408 <0.0000000000000002 ***
poly(Year, 3)2  90.02177    2.00366   44.929 <0.0000000000000002 ***
poly(Year, 3)3   3.57923    2.00366    1.786              0.0746 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.004 on 585 degrees of freedom
Multiple R-squared:  0.995, Adjusted R-squared:  0.995 
F-statistic: 3.862e+04 on 3 and 585 DF,  p-value: < 0.00000000000000022

Plot residuals

# Predict CO2_ppm using the cubic model
test$predicted <- predict(model_cubic, newdata = test)

# Calculate residuals
test$residuals <- test$CO2_ppm - test$predicted

# Plot residuals
plot(test$Year, test$residuals, pch = 16, xlab = "Year", ylab = "Residuals", main = "Residual Plot - Cubic Model")
abline(h = 0, lty = 2, col = "red")

Calculating RMSE and MAPE of the Cubic Model

# Predict CO2 concentration using cubic model
y_pred_cubic <- predict(model_cubic, newdata = test)

# Calculate RMSE for cubic model
RMSE_cubic <- sqrt(mean((test$CO2_ppm - y_pred_cubic)^2))
RMSE_cubic
[1] 1.903641
# Calculate MAPE for cubic model
MAPE_cubic <- mean(abs((test$CO2_ppm - y_pred_cubic) / test$CO2_ppm)) * 100
MAPE_cubic
[1] 0.4535137

Plotting the Periodic Signal Pi

Plot the periodic signal Pi. (Your plot should have 1 data point for each month, so 12 in total.) Clearly state the definition the Pi, and make sure your plot is clearly labeled.

library(ggplot2)

# Create a sequence of years for the plot
years <- seq(min(train$Year), max(train$Year), length.out = 100)

# Predict CO2_ppm values for each year in the sequence
predictions <- predict(model_cubic, newdata = data.frame(Year = years))

# Create a periodic signal using sine function
period <- 10
amplitude <- 5
frequency <- 2 * pi / period
phase <- pi / 4
periodic_signal <- amplitude * sin(frequency * years + phase)

# Plot the predicted CO2_ppm values and the periodic signal
ggplot() +
  geom_line(aes(x = years, y = predictions)) +
  geom_line(aes(x = years, y = periodic_signal)) +
  scale_x_continuous(name = "Year", breaks = seq(min(train$Year), max(train$Year), by = 10)) +
  scale_y_continuous(name = "CO2_ppm") +
  ggtitle("Cubic Regression Model with Periodic Signal") +
  theme_minimal()

Fitting the Periodic Signal

First, remove the deterministic trend Fn(t)form the time series and compute the average residual Ci - Fn (ti) for each month.

library(TSA)
# Create time series object
ts_data <- ts(CO2_data_clean$CO2_ppm, start = c(min(CO2_data_clean$Year)), end = c(max(CO2_data_clean$Year)), frequency = 12)
# Decompose time series using STL
ts_decomp <- stl(ts_data, s.window = "periodic")
# Remove trend component
trend_comp <- ts_decomp$time.series[, "trend"]
ts_seasonal <- ts_data - trend_comp
# Calculate RMSE
rmse <- sqrt(mean(test$predicted - CO2_data_clean$CO2_ppm)^2)
rmse
[1] 0.4754072
# Calculate MAPE
mape <- mean(abs((test$predicted - CO2_data_clean$CO2_ppm) / CO2_data_clean$CO2_ppm)) * 100
mape
[1] 8.253399
# Plot the time series and the trend component
plot(ts_data, main = "CO2 PPM Time Series with Trend Component", ylab = "CO2 PPM", xlab = "Year")
lines(trend_comp, col = "red")

Plot the Final Fit Fn (ti) + pi

Plot the final fit Fn (ti) + pi. Your plot should clearly show the final model on top of the entire time series, while indicating the split between the training and testing data.

# Plot the training data
ggplot(train, aes(x = Year, y = CO2_ppm)) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Training Data") +
  xlab("Year") +
  ylab("CO2_ppm") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Plot the testing data
ggplot(test, aes(x = Year, y = CO2_ppm)) +
  geom_line(color = "red", size = 1) +
  ggtitle("Testing Data") +
  xlab("Year") +
  ylab("CO2_ppm") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Plot the final model on top of the entire time series
ggplot() +
  geom_line(data = train, aes(x = Year, y = CO2_ppm), color = "blue", size = 1, alpha = 0.8) +
  geom_line(data = test, aes(x = Year, y = CO2_ppm), color = "red", size = 1, alpha = 0.5) +
  geom_line(data = CO2_data_clean, aes(x = Year, y = CO2_ppm), color = "green", size = 1, alpha = 0.4) +
  ggtitle("Final Model Fit") +
  xlab("Year[Blue: Training; Red: Testing; & Green: Complete Data]") +
  ylab("Carbon-Dioxide Concentration Level [CO2_ppm]") +
  scale_color_manual(
    values = c("blue", "red", "green"),
    labels = c("Training Data", "Testing Data", "Final Model")
  ) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())