# Load required libraries
library(tidyverse)  # For data manipulation and ggplot2
library(GGally)     # For ggpairs
library(MASS)       # For mvrnorm and kde2d
library(viridis)    # For color palettes

Joint, Marginal, and Conditional Probability Mass Functions (PMF)

First, let’s examine discrete probability distributions using categorical variables.

# Create example dataset with categorical variables
data <- data.frame(
  X = c("A", "A", "B", "A", "B", "C", "C", "C", "A", "B"),
  Y = c("D", "E", "D", "D", "E", "D", "E", "E", "D", "D"))

# Joint PMF calculation
joint_counts <- table(data$X, data$Y)
joint_pmf <- prop.table(joint_counts)
print(joint_pmf)
##    
##       D   E
##   A 0.3 0.1
##   B 0.2 0.1
##   C 0.1 0.2
# Marginal PMFs
marginal_X <- margin.table(joint_pmf, 1)
marginal_Y <- margin.table(joint_pmf, 2)
print(marginal_X)
## 
##   A   B   C 
## 0.4 0.3 0.3
print(marginal_Y)
## 
##   D   E 
## 0.6 0.4
# Conditional PMFs
conditional_pmf_Y_given_X <- prop.table(joint_counts, margin = 1)
conditional_pmf_X_given_Y <- prop.table(joint_counts, margin = 2)
print(conditional_pmf_Y_given_X)
##    
##             D         E
##   A 0.7500000 0.2500000
##   B 0.6666667 0.3333333
##   C 0.3333333 0.6666667
print(conditional_pmf_X_given_Y)
##    
##             D         E
##   A 0.5000000 0.2500000
##   B 0.3333333 0.2500000
##   C 0.1666667 0.5000000

Correlation Analysis Using mtcars Dataset

Examine relationships between continuous variables using the mtcars dataset.

library(GGally)
library(tidyverse)

# Select continuous variables
mtcars_1 <- tibble(mtcars) %>%
  dplyr::select(mpg, disp, hp, drat, wt, qsec)

# Create pairs plot
ggpairs(mtcars_1)

# Calculate covariance and correlation matrices
cov_matrix <- cov(mtcars_1)
cor_matrix <- cor(mtcars_1)
print(cov_matrix)
##              mpg        disp         hp         drat          wt         qsec
## mpg    36.324103  -633.09721 -320.73206   2.19506351  -5.1166847   4.50914919
## disp -633.097208 15360.79983 6721.15867 -47.06401915 107.6842040 -96.05168145
## hp   -320.732056  6721.15867 4700.86694 -16.45110887  44.1926613 -86.77008065
## drat    2.195064   -47.06402  -16.45111   0.28588135  -0.3727207   0.08714073
## wt     -5.116685   107.68420   44.19266  -0.37272073   0.9573790  -0.30548161
## qsec    4.509149   -96.05168  -86.77008   0.08714073  -0.3054816   3.19316613
print(cor_matrix)
##             mpg       disp         hp        drat         wt        qsec
## mpg   1.0000000 -0.8475514 -0.7761684  0.68117191 -0.8676594  0.41868403
## disp -0.8475514  1.0000000  0.7909486 -0.71021393  0.8879799 -0.43369788
## hp   -0.7761684  0.7909486  1.0000000 -0.44875912  0.6587479 -0.70822339
## drat  0.6811719 -0.7102139 -0.4487591  1.00000000 -0.7124406  0.09120476
## wt   -0.8676594  0.8879799  0.6587479 -0.71244065  1.0000000 -0.17471588
## qsec  0.4186840 -0.4336979 -0.7082234  0.09120476 -0.1747159  1.00000000

Density Plots for Continuous Variables

Independent Variables

library(ggplot2)
set.seed(123)
x1 <- rnorm(1000)
x2 <- rnorm(1000)
df <- data.frame(x1 = x1, x2 = x2)

ggplot(df, aes(x = x1, y = x2)) +
  geom_density_2d_filled() +
  labs(title = "2D Density Plot - Independent Variables")

Correlated Variables

library(MASS)
set.seed(123)
corr_matrix <- matrix(c(1, 0.7, 0.7, 1), 2, 2)
data <- mvrnorm(n = 1000, mu = c(0, 0), Sigma = corr_matrix)
df <- data.frame(x1 = data[,1], x2 = data[,2])

ggplot(df, aes(x = x1, y = x2)) +
  geom_density_2d_filled() +
  labs(title = "2D Density Plot - Correlated Variables")

Kernel Density Estimation (KDE)

Univariate KDE

set.seed(123)
data <- rnorm(1000)
density_est <- density(data)

ggplot() +
  geom_histogram(aes(x = data, y = ..density..), bins = 30) +
  geom_line(aes(x = density_est$x, y = density_est$y), color = "red") +
  theme_minimal() +
  labs(title = "Gaussian KDE - Univariate")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Bivariate KDE

set.seed(123)
x <- rnorm(1000)
y <- 0.7 * x + rnorm(1000, 0, 0.5)
kde <- kde2d(x, y, n = 100)

image(kde, col = viridis(50))
points(x, y, pch = ".", col = "white", alpha = 0.1)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "alpha" is not a
## graphical parameter
title("Bivariate Gaussian KDE")

Inverse Transform Sampling

Theoretical Distribution

mu <- 2
sigma <- 3
set.seed(123)
u <- runif(100000)
x <- qnorm(u, mean = mu, sd = sigma)

hist(x, prob = TRUE, breaks = 30)
curve(dnorm(x, mean = mu, sd = sigma), add = TRUE, col = "red")

Sampling from Empirical Distribution

set.seed(123)
real_data <- rgamma(1000, shape=2, scale=2)
kde <- density(real_data)

x_grid <- kde$x
dx <- diff(x_grid)[1]
cdf <- cumsum(kde$y) * dx
cdf <- cdf/max(cdf)

u <- runif(1000)
sampled_data <- approx(cdf, x_grid, u)$y

df <- data.frame(
  value = c(real_data, sampled_data),
  type = rep(c("Original", "Sampled"), each=1000)
)

ggplot(df, aes(x=value, fill=type)) +
  geom_histogram(alpha=0.5, position="identity", bins=30) +
  theme_minimal()

Bivariate Sampling

set.seed(123)
x <- rnorm(1000)
y <- x * 0.5 + rnorm(1000, 0, 0.5)

kde2d_est <- kde2d(x, y, n=50)
x_grid <- kde2d_est$x
y_grid <- kde2d_est$y
Z <- kde2d_est$z

n_samples <- 1000
samples <- matrix(0, nrow=2, ncol=n_samples)

for(i in 1:n_samples) {
  x_pdf <- apply(Z, 2, sum)
  x_cdf <- cumsum(x_pdf)/sum(x_pdf)
  u1 <- runif(1)
  x_sample <- approx(x_cdf, x_grid, u1)$y
  
  idx <- which.min(abs(x_grid - x_sample))
  y_pdf <- Z[,idx]
  y_cdf <- cumsum(y_pdf)/sum(y_pdf)
  u2 <- runif(1)
  y_sample <- approx(y_cdf, y_grid, u2)$y
  
  samples[,i] <- c(x_sample, y_sample)
}

plot(x, y, col=alpha("black", 0.5), main="Joint Distribution")
points(samples[1,], samples[2,], col=alpha("red", 0.5))
legend("topright", c("Original", "Sampled"), col=c("black", "red"), pch=1)