library(MASS)
library(MVN)
library(dplyr)
library(tidyr)
library(ggplot2)

4.28

# Columns of T1-5.dat: x1 = wind, x2 = solar radiation, x3 = CO, x4 = NO, x5 = NO2, x6 = O3, x7 = HC
# (plain names, since read.table mangles names containing spaces or parentheses,
#  which would break exact references like df$Radiation below)
column_names <- c("Wind", "Radiation", "CO", "NO", "NO2", "O3", "HC")

df <- read.table("/Users/fahadmehfooz/Downloads/T1-5.dat", header = FALSE, col.names = column_names)

radiation_data <- df$Radiation

# Generating a QQ plot of the solar radiation measurements
qqnorm(radiation_data, main = "QQ Plot of Solar Radiation Measurements")
qqline(radiation_data, col = "blue")

# Calculating correlation coefficient (r_q) for the QQ plot
# Generate theoretical quantiles for a normal distribution

# Theoretical quantiles are computed as if the data came from a standard normal distribution (mean 0, sd 1).

theoretical_quantiles <- qnorm(ppoints(length(radiation_data)))
observed_quantiles <- sort(radiation_data)
r_q <- cor(theoretical_quantiles, observed_quantiles)
cat("Value of r_q: ", r_q)
## Value of r_q:  0.9693258
# Checking whether r_q meets the critical threshold for the Q-Q plot correlation test:
# the tabled critical value is 0.9726 for n = 40 at alpha = 0.05
# (the closest table entry to our n = 42 observations)
alpha <- 0.05
n <- 40
critical_r_q <- 0.9726

if (r_q >= critical_r_q) {
  cat("Fail to reject normality: r_q =", r_q, ">= critical value", critical_r_q)
} else {
  cat("Reject normality at the 5% level: r_q =", r_q, "< critical value", critical_r_q)
}
## Reject normality at the 5% level: r_q = 0.9693258 < critical value 0.9726
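
As an optional cross-check (not part of the Q-Q correlation test above), the Shapiro-Wilk test in base R gives a formal p-value for the same normality hypothesis:

# Optional cross-check on univariate normality of the radiation data
shapiro.test(radiation_data)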

4.29 a)

no2 <- df$NO2
o3 <- df$O3
X <- cbind(no2, o3)

#Calculating the mean vector for the two variables
mean_vector <- colMeans(X)

#Computing the sample covariance matrix for NO2 and O3
cov_matrix <- cov(X)

# Calculating the inverse of the covariance matrix
inv_cov_matrix <- solve(cov_matrix)

# Computing Mahalanobis distances for each observation
mahalanobis_distances <- apply(X, 1, function(xj) {
  diff <- xj - mean_vector
  sqrt(t(diff) %*% inv_cov_matrix %*% diff)  # square root of the quadratic form gives the Mahalanobis distance d_j
})

print(mahalanobis_distances)
##  [1] 0.6787138 0.8119240 1.5417720 1.2760448 0.6430679 0.6899802 1.0885263
##  [8] 3.2617755 0.3726042 0.9034638 1.1647446 0.7891829 2.3768549 0.5620942
## [15] 0.6430679 0.3499962 0.9480497 2.1828164 1.7346216 0.8119240 1.6655754
## [22] 1.0178438 0.8873642 1.8557410 2.4796896 1.0178438 0.3726042 0.9410654
## [29] 0.3714457 1.4996289 0.4360262 0.6787138 1.0710714 2.6619023 1.2076518
## [36] 0.3499962 1.3778501 1.6668112 2.9108530 0.7981364 0.8385991 1.3421479
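
As a sanity check, base R's mahalanobis() should reproduce these values; note that it returns squared distances, so a square root is needed to match the d_j above:

# Cross-check: mahalanobis() returns the squared distances d_j^2
d_check <- sqrt(mahalanobis(X, center = mean_vector, cov = cov_matrix))
all.equal(unname(mahalanobis_distances), unname(d_check))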

4.29 b)

# Threshold for the 50% probability contour of a bivariate normal distribution:
# the squared Mahalanobis distance follows a chi-squared distribution with
# p = 2 degrees of freedom, whose 50th percentile is approximately 1.386.

threshold <- sqrt(1.386)

# Using the previously calculated distances, count how many fall at or below sqrt(1.386)

within_contour <- sum(mahalanobis_distances <= threshold)

# Computing the proportion of observations inside the 50% contour
proportion_within_contour <- within_contour / length(mahalanobis_distances)

# Display the proportion
proportion_within_contour
## [1] 0.6190476
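
The 1.386 used above is just the 50th percentile of the chi-squared distribution with 2 degrees of freedom, which has a closed form because chi-squared with 2 df is an exponential distribution with mean 2:

# Median of chi-squared with 2 df: -2 * log(0.5) = 1.386294
qchisq(0.50, df = 2)

Under bivariate normality we would expect roughly 50% of observations inside this contour; the observed 61.9% is noticeably higher.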

4.29 c)

# Number of observations (42 here)
n <- length(mahalanobis_distances)

# Sorting the squared Mahalanobis distances in ascending order
# (a chi-square plot compares d_j^2, not d_j, with chi-squared quantiles)
ordered_sq_distances <- sort(mahalanobis_distances^2)

# Calculating the theoretical chi-squared quantiles for each observation
chi_square_quantiles <- qchisq((1:n - 0.5) / n, df = 2)

# Creating the chi-square plot
plot(chi_square_quantiles, ordered_sq_distances,
     main = "Chi-Square Plot of Ordered Squared Distances",
     xlab = "Theoretical Chi-Square Quantiles",
     ylab = "Ordered Squared Mahalanobis Distances",
     pch = 19, col = "blue")

#Adding a reference line (y = x) for comparison
abline(0, 1, col = "red", lty = 2)

The points deviate from the y = x line: the bulk sits slightly below it (consistent with part b, where more than half of the observations fell inside the 50% contour), while the largest squared distances lie well above it. The ordered squared distances therefore do not track the chi-squared(2) quantiles closely, which casts doubt on bivariate normality for (NO2, O3).
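
The MVN package loaded at the top can complement this graphical check with formal tests; a minimal sketch, assuming a recent MVN release where mvn() is the entry point (older versions exposed separate functions such as mardiaTest()):

# Formal tests of bivariate normality for (NO2, O3)
mvn(data.frame(NO2 = no2, O3 = o3), mvnTest = "mardia")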

4.30 a)

x1 <- c(1, 2, 3, 3, 4, 5, 6, 8, 9, 11)
x2 <- c(18.95, 19.00, 17.95, 15.54, 14.00, 12.95, 8.94, 7.49, 6.00, 3.99)

# Fitting an intercept-only model so boxcox() profiles the likelihood of x1 alone
model_x1 <- lm(x1 ~ 1)

# Performing the Box-Cox transformation on the fitted model
boxcox_result_x1 <- boxcox(model_x1)

# Finding the optimal lambda that maximizes log-likelihood
optimal_lambda_x1 <- boxcox_result_x1$x[which.max(boxcox_result_x1$y)]
cat("Optimal lambda for x1:", optimal_lambda_x1, "\n")
## Optimal lambda for x1: 0.3838384
# Transforming x1 using the estimated optimal lambda
x1_transformed <- (x1^optimal_lambda_x1 - 1) / optimal_lambda_x1

# Generating Q-Q plot for the transformed data
qqnorm(x1_transformed, main = "Q-Q Plot for Transformed x1")
qqline(x1_transformed)

The points generally follow the line but deviate somewhat at both the lower and upper ends. This suggests that while the Box-Cox transformation improved normality, the transformed data still depart slightly from a perfect normal distribution.
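
Note that boxcox() only evaluates the profile log-likelihood on a grid (by default lambda runs from -2 to 2), so the estimate above is grid-limited; an optional refinement is to re-evaluate on a finer grid around the optimum:

# Optional refinement: finer lambda grid near the optimum found above
boxcox_fine_x1 <- boxcox(model_x1, lambda = seq(0, 1, by = 0.01), plotit = FALSE)
boxcox_fine_x1$x[which.max(boxcox_fine_x1$y)]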

4.30 b)

# Fitting an intercept-only model so boxcox() profiles the likelihood of x2 alone
model_x2 <- lm(x2 ~ 1)

# Performing the Box-Cox transformation on the fitted model
boxcox_result_x2 <- boxcox(model_x2)

# Finding the optimal lambda that maximizes log-likelihood
optimal_lambda_x2 <- boxcox_result_x2$x[which.max(boxcox_result_x2$y)]
cat("Optimal lambda for x2:", optimal_lambda_x2, "\n")
## Optimal lambda for x2: 0.9494949
# Transforming x2 using the estimated optimal lambda
x2_transformed <- (x2^optimal_lambda_x2 - 1) / optimal_lambda_x2

# Generating Q-Q plot for the transformed data
qqnorm(x2_transformed, main = "Q-Q Plot for Transformed x2")
qqline(x2_transformed)

For x2, the optimal lambda is much closer to 1 (approximately 0.95), suggesting that x2 required less transformation than x1 to approach normality. There is slight deviation at the tails, similar to the x1 Q-Q plot, but overall the fit to the normal distribution seems better for x2.

4.30 c)

x1 <- c(1, 2, 3, 3, 4, 5, 6, 8, 9, 11)
x2 <- c(18.95, 19.00, 17.95, 15.54, 14.00, 12.95, 8.94, 7.49, 6.00, 3.99)

# Log transformation
log_x1 <- log(x1)
log_x2 <- log(x2)

# Means of the log-transformed variables (used in the log-likelihood term below)
mean_log_x1 <- mean(log_x1)
mean_log_x2 <- mean(log_x2)
n <- length(x1)

# Creating grid of lambda1 and lambda2 values
lambda_grid <- expand.grid(lambda1 = seq(-3, 3, by = 0.05), lambda2 = seq(-3, 3, by = 0.05))

# Function for Box-Cox transformation
boxcox_transform <- function(x, lambda) {
  if (abs(lambda) < 1e-5) {
    return(log(x))
  } else {
    return((x^lambda - 1) / lambda)
  }
}

# Calculating transformed values and correlation for each lambda1, lambda2 pair
results <- lambda_grid %>%
  rowwise() %>%
  mutate(
    tr_x1 = list(boxcox_transform(x1, lambda1)),
    tr_x2 = list(boxcox_transform(x2, lambda2))
  ) %>%
  mutate(
    cor = cor(unlist(tr_x1), unlist(tr_x2)),
    std_tr_x1 = sd(unlist(tr_x1)),
    std_tr_x2 = sd(unlist(tr_x2)),
    llambda = (-n / 2) * log((std_tr_x1 * std_tr_x2)^2 * (1 - cor^2)) +  # -(n/2) log|S|, since |S| = s1^2 s2^2 (1 - r^2)
              (lambda1 - 1) * mean_log_x1 * n +                          # (lambda1 - 1) * sum(log x1)
              (lambda2 - 1) * mean_log_x2 * n                            # (lambda2 - 1) * sum(log x2)
  ) %>%
  ungroup()
results
## # A tibble: 14,641 × 8
##    lambda1 lambda2 tr_x1      tr_x2         cor std_tr_x1 std_tr_x2 llambda
##      <dbl>   <dbl> <list>     <list>      <dbl>     <dbl>     <dbl>   <dbl>
##  1   -3         -3 <dbl [10]> <dbl [10]> -0.218     0.103   0.00162   -66.6
##  2   -2.95      -3 <dbl [10]> <dbl [10]> -0.220     0.105   0.00162   -66.0
##  3   -2.9       -3 <dbl [10]> <dbl [10]> -0.222     0.107   0.00162   -65.5
##  4   -2.85      -3 <dbl [10]> <dbl [10]> -0.224     0.109   0.00162   -64.9
##  5   -2.8       -3 <dbl [10]> <dbl [10]> -0.226     0.110   0.00162   -64.4
##  6   -2.75      -3 <dbl [10]> <dbl [10]> -0.228     0.112   0.00162   -63.8
##  7   -2.7       -3 <dbl [10]> <dbl [10]> -0.230     0.114   0.00162   -63.3
##  8   -2.65      -3 <dbl [10]> <dbl [10]> -0.233     0.116   0.00162   -62.7
##  9   -2.6       -3 <dbl [10]> <dbl [10]> -0.236     0.119   0.00162   -62.2
## 10   -2.55      -3 <dbl [10]> <dbl [10]> -0.238     0.121   0.00162   -61.6
## # ℹ 14,631 more rows
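For reference, the llambda column implements the bivariate Box-Cox log-likelihood (up to an additive constant), where S(lambda1, lambda2) is the sample covariance matrix of the transformed observations:

$$
\ell(\lambda_1, \lambda_2) = -\frac{n}{2} \ln \lvert S(\lambda_1, \lambda_2) \rvert
+ \sum_{k=1}^{2} (\lambda_k - 1) \sum_{j=1}^{n} \ln x_{jk},
\qquad \lvert S \rvert = s_1^2 \, s_2^2 \, (1 - r^2).
$$

Using the n - 1 divisor in sd() rather than the maximum-likelihood n divisor only shifts the log-likelihood by a constant, so the maximizing (lambda1, lambda2) pair is unaffected.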
# Identify the optimal lambda values that maximize llambda
optimal_lambda <- results %>%
  filter(llambda == max(llambda, na.rm = TRUE))

print(optimal_lambda)
## # A tibble: 1 × 8
##   lambda1 lambda2 tr_x1      tr_x2         cor std_tr_x1 std_tr_x2 llambda
##     <dbl>   <dbl> <list>     <list>      <dbl>     <dbl>     <dbl>   <dbl>
## 1    1.25  0.0500 <dbl [10]> <dbl [10]> -0.991      5.01     0.604   -10.3
# Visualizing the log-likelihood over the lambda grid
ggplot(results, aes(x = lambda1, y = lambda2, fill = llambda)) +
  geom_tile() +
  labs(title = "Box-Cox Log-Likelihood", x = "Lambda1", y = "Lambda2") +
  scale_fill_viridis_c() +
  theme_minimal()

# Transforming data with optimal lambda values
optimal_lambda1 <- optimal_lambda$lambda1
optimal_lambda2 <- optimal_lambda$lambda2

tr_x1_optimal <- boxcox_transform(x1, optimal_lambda1)
tr_x2_optimal <- boxcox_transform(x2, optimal_lambda2)

# QQ Plot for transformed data
par(mfrow = c(1, 2))  # Setting up the plotting area for two plots

# QQ plot for transformed x1
qqnorm(tr_x1_optimal, main = "QQ Plot for Transformed x1")
qqline(tr_x1_optimal)

# QQ plot for transformed x2
qqnorm(tr_x2_optimal, main = "QQ Plot for Transformed x2")
qqline(tr_x2_optimal)

# Resetting the plotting area
par(mfrow = c(1, 1))

With the jointly optimal lambdas, both Q-Q plots lie reasonably close to their reference lines, so the transformed variables are fairly close to normality.

The joint optimization gives lambda1 = 1.25 for x1 and lambda2 = 0.05 for x2; because lambda1 falls outside (-1, 1), the search grid was widened to (-3, 3).

The individually optimal values (0.38 and 0.95) show that each variable can be brought close to normality on its own, and they are appropriate for marginal transformations. The jointly optimal values (1.25 and 0.05) differ because the joint likelihood also accounts for the relationship between the two variables, whose correlation is very strong and negative (cor ≈ -0.99).