# Load necessary libraries
library(copula)
## Warning: package 'copula' was built under R version 4.4.3
library(MASS)
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read dataset
data <- read.csv("processed_data.csv")
# Convert to numeric and clean data
data$so2 <- as.numeric(as.character(data$so2))
data$no2 <- as.numeric(as.character(data$no2))
# Remove NAs
copula_data <- na.omit(data[, c("so2", "no2")])
# Check for non-finite values
if(any(!is.finite(copula_data$so2)) | any(!is.finite(copula_data$no2))) {
stop("Data contains non-finite values!")
}
# Summary
summary(copula_data)
## so2 no2
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 5.00 1st Qu.: 14.00
## Median : 8.00 Median : 22.00
## Mean : 10.77 Mean : 26.16
## 3rd Qu.: 13.50 3rd Qu.: 32.80
## Max. :909.00 Max. :876.00
ggplot(copula_data, aes(x = so2, y = no2)) +
geom_point(alpha = 0.5, color = "steelblue") +
theme_minimal() +
labs(title = "Scatterplot of SO2 vs NO2", x = "SO2", y = "NO2")

# Use empirical CDF instead of rank
u_so2 <- ecdf(copula_data$so2)(copula_data$so2)
u_no2 <- ecdf(copula_data$no2)(copula_data$no2)
# Combine into matrix
u_data <- cbind(u_so2, u_no2)
# Check pseudo-observations
plot(u_data, main = "Pseudo Observations (Uniform Margins)", xlab = "U(SO2)", ylab = "U(NO2)", col = "blue", pch = 20)

# Initialize results storage
fit_results <- list()
# Helper function for safer copula fitting
safe_fit <- function(copula) {
tryCatch({
fitCopula(copula, u_data, method = "ml", start = c(0.5))
}, error = function(e) {
message(paste("Fitting failed for:", class(copula)[1]))
return(NULL)
})
}
# Fit Copulas
fit_results$Normal <- safe_fit(normalCopula(dim = 2))
## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0
fit_results$tCopula <- safe_fit(tCopula(dim = 2))
## Fitting failed for: tCopula
fit_results$Clayton <- safe_fit(claytonCopula(dim = 2))
## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0
fit_results$Gumbel <- safe_fit(gumbelCopula(dim = 2))
## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0
# Remove failed fits
fit_results <- Filter(Negate(is.null), fit_results)
# Extract log-likelihoods and AICs safely
aic_values <- data.frame(Copula = character(), LogLik = numeric(), AIC = numeric())
for (copula_name in names(fit_results)) {
logLik_value <- tryCatch(logLik(fit_results[[copula_name]]), error = function(e) NA)
aic_value <- tryCatch(-2 * logLik_value, error = function(e) NA)
aic_values <- rbind(aic_values, data.frame(Copula = copula_name, LogLik = logLik_value, AIC = aic_value))
}
# Display AIC comparison
aic_values
## Copula LogLik AIC
## 1 Normal -2.247116e+307 4.494233e+307
## 2 Clayton -2.247116e+307 4.494233e+307
## 3 Gumbel -2.247116e+307 4.494233e+307
# Pick the best copula based on lowest AIC
best_copula <- fit_results[[which.min(aic_values$AIC)]]
# Simulate if we have a valid model
if (!is.null(best_copula)) {
sim_data <- rCopula(1000, best_copula@copula)
plot(sim_data, main = "Simulated Data from Best Copula", xlab = "U(SO2)", ylab = "U(NO2)", col = "darkgreen", pch = 20)
} else {
message("No valid copula model found for simulation.")
}
