CO2_data <- read.csv("C:/Users/nghimire/OneDrive - University of Texas at Tyler/Desktop/Data analysis-Statistical Modeling/time_series_report_data_nops/CO2.csv", skip = 57, col.names = c("Year", "Month", "Date_Excel", "Date", "CO2_ppm", "s_adj_ppm", "fit_ppm", "s_adj_fit_ppm", "CO2_filled_ppm", "s_adj_filled_ppm"))
head(CO2_data)
Year Month Date_Excel Date CO2_ppm s_adj_ppm fit_ppm s_adj_fit_ppm CO2_filled_ppm
1 1958 2 21231 1958.126 -99.99 -99.99 -99.99 -99.99 -99.99
2 1958 3 21259 1958.203 315.70 314.43 316.19 314.90 315.70
3 1958 4 21290 1958.288 317.45 315.16 317.30 314.98 317.45
4 1958 5 21320 1958.370 317.51 314.71 317.86 315.06 317.51
5 1958 6 21351 1958.455 -99.99 -99.99 317.24 315.14 317.24
6 1958 7 21381 1958.537 315.86 315.19 315.86 315.21 315.86
s_adj_filled_ppm
1 -99.99
2 314.43
3 315.16
4 314.71
5 315.14
6 315.19
# Replace all instances of -99.99 with NA
CO2_data[CO2_data == -99.99] <- NA
# Remove rows with missing values
CO2_data_clean <- na.omit(CO2_data)
# Print the number of rows and columns in the cleaned data frame
cat("Number of rows in original data frame:", nrow(CO2_data), "\n")
Number of rows in original data frame: 743
cat("Number of columns in original data frame:", ncol(CO2_data), "\n")
Number of columns in original data frame: 10
cat("Number of rows in cleaned data frame:", nrow(CO2_data_clean), "\n")
Number of rows in cleaned data frame: 733
cat("Number of columns in cleaned data frame:", ncol(CO2_data_clean), "\n")
Number of columns in cleaned data frame: 10
head(CO2_data_clean)
Year Month Date_Excel Date CO2_ppm s_adj_ppm fit_ppm s_adj_fit_ppm CO2_filled_ppm
2 1958 3 21259 1958.203 315.70 314.43 316.19 314.90 315.70
3 1958 4 21290 1958.288 317.45 315.16 317.30 314.98 317.45
4 1958 5 21320 1958.370 317.51 314.71 317.86 315.06 317.51
6 1958 7 21381 1958.537 315.86 315.19 315.86 315.21 315.86
7 1958 8 21412 1958.622 314.93 316.19 313.99 315.29 314.93
8 1958 9 21443 1958.707 313.21 316.08 312.46 315.35 313.21
s_adj_filled_ppm
2 314.43
3 315.16
4 314.71
6 315.19
7 316.19
8 316.08
# Convert year and month columns to date format
CO2_data_clean$Date <- as.Date(paste(CO2_data_clean$Year, CO2_data_clean$Month, "01", sep = "-"))
# Create time series plot
ggplot(CO2_data_clean, aes(x = Date, y = CO2_ppm)) +
geom_line() +
labs(x = "Year", y = "CO2 Concentration (ppm)", title = "Mauna Loa CO2 Concentration (1958-2021)")
# Fit time series regression model
ts_model <- auto.arima(CO2_data_clean$CO2_ppm, seasonal = TRUE)
# Print model summary
summary(ts_model)
Series: CO2_data_clean$CO2_ppm
ARIMA(2,1,1) with drift
Coefficients:
ar1 ar2 ma1 drift
1.5320 -0.8392 -0.9036 0.1321
s.e. 0.0203 0.0201 0.0115 0.0080
sigma^2 = 0.462: log likelihood = -755.39
AIC=1520.77 AICc=1520.85 BIC=1543.75
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -0.0002162403 0.6773845 0.5433793 -0.002975161 0.1534257 0.4859663
ACF1
Training set -0.02561056
# Plot residuals
ggplot(data.frame(residuals = ts_model$residuals, Date = CO2_data_clean$Date), aes(x = Date, y = residuals)) +
geom_line() +
labs(x = "Year", y = "Residuals", title = "Residuals from Time Series Model")
# Create time series object
ts_data <- ts(CO2_data_clean$CO2_ppm, start = c(min(CO2_data_clean$Year)), end = c(max(CO2_data_clean$Year)), frequency = 12)
# Fit the auto.arima() model
ts_model <- auto.arima(ts_data, seasonal = TRUE)
summary(ts_model)
Series: ts_data
ARIMA(3,1,3)(2,1,1)[12]
Coefficients:
ar1 ar2 ar3 ma1 ma2 ma3 sar1 sar2 sma1
2.5377 -2.3703 0.8033 -2.6124 2.3940 -0.7750 0.1345 0.1055 -0.7728
s.e. 0.0392 0.0648 0.0339 0.0408 0.0746 0.0375 0.0737 0.0565 0.0605
sigma^2 = 0.1794: log likelihood = -398.8
AIC=817.61 AICc=817.92 BIC=863.4
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.02767412 0.4171741 0.3090387 0.007335288 0.08808438 0.1903605 0.0186576
# Create seasonal plot
seasonplot(ts_data,
year.labels = TRUE, col = "blue",
xlab = "Year", ylab = "CO2_ppm", main = "Seasonal plot of CO2_ppm data"
)
options(scipen = 999)
# Fit a linear model
lm_fit <- lm(CO2_ppm ~ Year, data = CO2_data_clean)
summary(lm_fit)
Call:
lm(formula = CO2_ppm ~ Year, data = CO2_data_clean)
Residuals:
Min 1Q Median 3Q Max
-7.8992 -3.0813 -0.6952 2.6047 12.4730
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2763.420601 17.293281 -159.8 <0.0000000000000002 ***
Year 1.567993 0.008696 180.3 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.161 on 731 degrees of freedom
Multiple R-squared: 0.978, Adjusted R-squared: 0.978
F-statistic: 3.251e+04 on 1 and 731 DF, p-value: < 0.00000000000000022
# Plot the data and the fit
plot(CO2_ppm ~ Year, data = CO2_data_clean, main = "Mauna Loa CO2 Concentration")
abline(lm_fit, col = "red")
# Plot the residuals
plot(lm_fit$fitted.values, lm_fit$residuals,
main = "Residual plot for Mauna Loa CO2 Concentration",
xlab = "Fitted values",
ylab = "Residuals"
)
# Add a horizontal line at y=0
abline(h = 0, col = "red")
# Load the Metrics package
library(Metrics)
# Calculate RMSE
rmse <- rmse(lm_fit$fitted.values, CO2_data_clean$CO2_ppm)
cat("RMSE:", rmse, "\n")
RMSE: 4.155777
# Calculate MAPE
mape <- mape(lm_fit$fitted.values, CO2_data_clean$CO2_ppm)
cat("MAPE:", mape, "%\n")
MAPE: 0.00945464 %
# Split the data into training and test sets
library(caret)
set.seed(123) # Set seed for reproducibility
index <- createDataPartition(CO2_data_clean$CO2_ppm, p = 0.8, list = FALSE)
train <- CO2_data_clean[index, ]
test <- CO2_data_clean[-index, ]
# Fit a linear model on the training data
# lm_fit <- lm(CO2_ppm ~ Date, data = train)
# Calculate RMSE
# rmse <- rmse(lm_fit$fitted.values, test$CO2_ppm)
# cat("RMSE:", rmse, "\n")
# Calculate MAPE
# mape <- mape(lm_fit$fitted.values, test$CO2_ppm)
# cat("MAPE:", mape, "%\n")
# fit a quadratic model on the training data
model_quad <- lm(CO2_ppm ~ poly(Year, 2), data = train)
# summary of the quadratic model
summary(model_quad)
Call:
lm(formula = CO2_ppm ~ poly(Year, 2), data = train)
Residuals:
Min 1Q Median 3Q Max
-4.5559 -1.4544 -0.0298 1.5633 4.8790
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 354.66924 0.08271 4287.92 <0.0000000000000002 ***
poly(Year, 2)1 676.05049 2.00740 336.78 <0.0000000000000002 ***
poly(Year, 2)2 90.02177 2.00740 44.84 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.007 on 586 degrees of freedom
Multiple R-squared: 0.9949, Adjusted R-squared: 0.9949
F-statistic: 5.772e+04 on 2 and 586 DF, p-value: < 0.00000000000000022
plot(predict(model_quad), residuals(model_quad),
xlab = "Fitted values", ylab = "Residuals",
main = "Residual plot for Quadratic Model"
)
# Predict the CO2 concentration in the test dataset
test$CO2_pred <- predict(model_quad, newdata = test)
# Calculate the residual errors
test$residuals <- test$CO2_ppm - test$CO2_pred
# Plot the residual errors
plot(test$Year, test$residuals, type = "l", xlab = "Year", ylab = "Residual Error", main = "Residual Plot for Quadratic Model")
# Obtain predicted values for the test set
pred_quad <- predict(model_quad, newdata = test)
# Calculate RMSE
rmse_quad <- sqrt(mean((test$CO2_ppm - pred_quad)^2))
cat("RMSE for quadratic model:", rmse_quad, "\n")
RMSE for quadratic model: 1.909819
# Calculate MAPE
mape_quad <- mean(abs((test$CO2_ppm - pred_quad) / test$CO2_ppm)) * 100
cat("MAPE for quadratic model:", mape_quad, "%")
MAPE for quadratic model: 0.4547349 %
## Checking Values
model_quad <- lm(CO2_ppm ~ poly(Year, 2), data = train)
beta_hat <- coef(model_quad)
beta_hat
(Intercept) poly(Year, 2)1 poly(Year, 2)2
354.66924 676.05049 90.02177
model_cubic <- lm(CO2_ppm ~ poly(Year, 3), data = train)
summary(model_cubic)
Call:
lm(formula = CO2_ppm ~ poly(Year, 3), data = train)
Residuals:
Min 1Q Median 3Q Max
-4.4144 -1.4244 -0.0546 1.5400 4.9257
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 354.66924 0.08256 4295.928 <0.0000000000000002 ***
poly(Year, 3)1 676.05049 2.00366 337.408 <0.0000000000000002 ***
poly(Year, 3)2 90.02177 2.00366 44.929 <0.0000000000000002 ***
poly(Year, 3)3 3.57923 2.00366 1.786 0.0746 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.004 on 585 degrees of freedom
Multiple R-squared: 0.995, Adjusted R-squared: 0.995
F-statistic: 3.862e+04 on 3 and 585 DF, p-value: < 0.00000000000000022
# Predict CO2_ppm using the cubic model
test$predicted <- predict(model_cubic, newdata = test)
# Calculate residuals
test$residuals <- test$CO2_ppm - test$predicted
# Plot residuals
plot(test$Year, test$residuals, pch = 16, xlab = "Year", ylab = "Residuals", main = "Residual Plot - Cubic Model")
abline(h = 0, lty = 2, col = "red")
# Predict CO2 concentration using cubic model
y_pred_cubic <- predict(model_cubic, newdata = test)
# Calculate RMSE for cubic model
RMSE_cubic <- sqrt(mean((test$CO2_ppm - y_pred_cubic)^2))
RMSE_cubic
[1] 1.903641
# Calculate MAPE for cubic model
MAPE_cubic <- mean(abs((test$CO2_ppm - y_pred_cubic) / test$CO2_ppm)) * 100
MAPE_cubic
[1] 0.4535137
Plot the periodic signal Pi. (Your plot should have 1 data point for each month, so 12 in total.) Clearly state the definition the Pi, and make sure your plot is clearly labeled.
library(ggplot2)
# Create a sequence of years for the plot
years <- seq(min(train$Year), max(train$Year), length.out = 100)
# Predict CO2_ppm values for each year in the sequence
predictions <- predict(model_cubic, newdata = data.frame(Year = years))
# Create a periodic signal using sine function
period <- 10
amplitude <- 5
frequency <- 2 * pi / period
phase <- pi / 4
periodic_signal <- amplitude * sin(frequency * years + phase)
# Plot the predicted CO2_ppm values and the periodic signal
ggplot() +
geom_line(aes(x = years, y = predictions)) +
geom_line(aes(x = years, y = periodic_signal)) +
scale_x_continuous(name = "Year", breaks = seq(min(train$Year), max(train$Year), by = 10)) +
scale_y_continuous(name = "CO2_ppm") +
ggtitle("Cubic Regression Model with Periodic Signal") +
theme_minimal()
First, remove the deterministic trend Fn(t)form the time series and compute the average residual Ci - Fn (ti) for each month.
library(TSA)
# Create time series object
ts_data <- ts(CO2_data_clean$CO2_ppm, start = c(min(CO2_data_clean$Year)), end = c(max(CO2_data_clean$Year)), frequency = 12)
# Decompose time series using STL
ts_decomp <- stl(ts_data, s.window = "periodic")
# Remove trend component
trend_comp <- ts_decomp$time.series[, "trend"]
ts_seasonal <- ts_data - trend_comp
# Calculate RMSE
rmse <- sqrt(mean(test$predicted - CO2_data_clean$CO2_ppm)^2)
rmse
[1] 0.4754072
# Calculate MAPE
mape <- mean(abs((test$predicted - CO2_data_clean$CO2_ppm) / CO2_data_clean$CO2_ppm)) * 100
mape
[1] 8.253399
# Plot the time series and the trend component
plot(ts_data, main = "CO2 PPM Time Series with Trend Component", ylab = "CO2 PPM", xlab = "Year")
lines(trend_comp, col = "red")
Plot the final fit Fn (ti) + pi. Your plot should clearly show the final model on top of the entire time series, while indicating the split between the training and testing data.
# Plot the training data
ggplot(train, aes(x = Year, y = CO2_ppm)) +
geom_line(color = "blue", size = 1) +
ggtitle("Training Data") +
xlab("Year") +
ylab("CO2_ppm") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
# Plot the testing data
ggplot(test, aes(x = Year, y = CO2_ppm)) +
geom_line(color = "red", size = 1) +
ggtitle("Testing Data") +
xlab("Year") +
ylab("CO2_ppm") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
# Plot the final model on top of the entire time series
ggplot() +
geom_line(data = train, aes(x = Year, y = CO2_ppm), color = "blue", size = 1, alpha = 0.8) +
geom_line(data = test, aes(x = Year, y = CO2_ppm), color = "red", size = 1, alpha = 0.5) +
geom_line(data = CO2_data_clean, aes(x = Year, y = CO2_ppm), color = "green", size = 1, alpha = 0.4) +
ggtitle("Final Model Fit") +
xlab("Year[Blue: Training; Red: Testing; & Green: Complete Data]") +
ylab("Carbon-Dioxide Concentration Level [CO2_ppm]") +
scale_color_manual(
values = c("blue", "red", "green"),
labels = c("Training Data", "Testing Data", "Final Model")
) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())