Probability Distributions and Goodness-of-Fit

# Load necessary libraries
library(copula)

## Warning: package 'copula' was built under R version 4.4.3

library(MASS)
library(psych)

## Warning: package 'psych' was built under R version 4.4.3

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read dataset
data <- read.csv("processed_data.csv")

# Convert to numeric and clean data
data$so2 <- as.numeric(as.character(data$so2))
data$no2 <- as.numeric(as.character(data$no2))

# Remove NAs
copula_data <- na.omit(data[, c("so2", "no2")])

# Check for non-finite values
if(any(!is.finite(copula_data$so2)) | any(!is.finite(copula_data$no2))) {
  stop("Data contains non-finite values!")
}

# Summary
summary(copula_data)

##       so2              no2        
##  Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  5.00   1st Qu.: 14.00  
##  Median :  8.00   Median : 22.00  
##  Mean   : 10.77   Mean   : 26.16  
##  3rd Qu.: 13.50   3rd Qu.: 32.80  
##  Max.   :909.00   Max.   :876.00

ggplot(copula_data, aes(x = so2, y = no2)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  theme_minimal() +
  labs(title = "Scatterplot of SO2 vs NO2", x = "SO2", y = "NO2")

# Use empirical CDF instead of rank
u_so2 <- ecdf(copula_data$so2)(copula_data$so2)
u_no2 <- ecdf(copula_data$no2)(copula_data$no2)

# Combine into matrix
u_data <- cbind(u_so2, u_no2)

# Check pseudo-observations
plot(u_data, main = "Pseudo Observations (Uniform Margins)", xlab = "U(SO2)", ylab = "U(NO2)", col = "blue", pch = 20)

# Initialize results storage
fit_results <- list()

# Helper function for safer copula fitting
safe_fit <- function(copula) {
  tryCatch({
    fitCopula(copula, u_data, method = "ml", start = c(0.5))
  }, error = function(e) {
    message(paste("Fitting failed for:", class(copula)[1]))
    return(NULL)
  })
}

# Fit Copulas
fit_results$Normal <- safe_fit(normalCopula(dim = 2))

## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0

fit_results$tCopula <- safe_fit(tCopula(dim = 2))

## Fitting failed for: tCopula

fit_results$Clayton <- safe_fit(claytonCopula(dim = 2))

## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0

fit_results$Gumbel  <- safe_fit(gumbelCopula(dim = 2))

## Warning in fitCopula.ml(copula, u = data, method = method, start = start, :
## Hessian matrix not invertible: Lapack routine dgesv: system is exactly
## singular: U[1,1] = 0

# Remove failed fits
fit_results <- Filter(Negate(is.null), fit_results)

# Extract log-likelihoods and AICs safely
aic_values <- data.frame(Copula = character(), LogLik = numeric(), AIC = numeric())

for (copula_name in names(fit_results)) {
  logLik_value <- tryCatch(logLik(fit_results[[copula_name]]), error = function(e) NA)
  aic_value <- tryCatch(-2 * logLik_value, error = function(e) NA)
  
  aic_values <- rbind(aic_values, data.frame(Copula = copula_name, LogLik = logLik_value, AIC = aic_value))
}

# Display AIC comparison
aic_values

##    Copula         LogLik           AIC
## 1  Normal -2.247116e+307 4.494233e+307
## 2 Clayton -2.247116e+307 4.494233e+307
## 3  Gumbel -2.247116e+307 4.494233e+307

# Pick the best copula based on lowest AIC
best_copula <- fit_results[[which.min(aic_values$AIC)]]

# Simulate if we have a valid model
if (!is.null(best_copula)) {
  sim_data <- rCopula(1000, best_copula@copula)
  plot(sim_data, main = "Simulated Data from Best Copula", xlab = "U(SO2)", ylab = "U(NO2)", col = "darkgreen", pch = 20)
} else {
  message("No valid copula model found for simulation.")
}

Probability Distributions and Goodness-of-Fit

NguyenMinhKhoi_22BI13220

2025-03-18