# Load required libraries
library(tidyverse)  # For data manipulation and ggplot2
library(GGally)     # For ggpairs
library(MASS)       # For mvrnorm and kde2d
library(viridis)    # For color palettes

Joint, Marginal, and Conditional Probability Mass Functions (PMF)

First, let’s examine discrete probability distributions using categorical variables.

# Create example dataset with categorical variables
data <- data.frame(
  X = c("A", "A", "B", "A", "B", "C", "C", "C", "A", "B"),
  Y = c("D", "E", "D", "D", "E", "D", "E", "E", "D", "D"))

# Joint PMF calculation
joint_counts <- table(data$X, data$Y)
joint_pmf <- prop.table(joint_counts)
print(joint_pmf)

##    
##       D   E
##   A 0.3 0.1
##   B 0.2 0.1
##   C 0.1 0.2

# Marginal PMFs
marginal_X <- margin.table(joint_pmf, 1)
marginal_Y <- margin.table(joint_pmf, 2)
print(marginal_X)

## 
##   A   B   C 
## 0.4 0.3 0.3

print(marginal_Y)

## 
##   D   E 
## 0.6 0.4

# Conditional PMFs
conditional_pmf_Y_given_X <- prop.table(joint_counts, margin = 1)
conditional_pmf_X_given_Y <- prop.table(joint_counts, margin = 2)
print(conditional_pmf_Y_given_X)

##    
##             D         E
##   A 0.7500000 0.2500000
##   B 0.6666667 0.3333333
##   C 0.3333333 0.6666667

print(conditional_pmf_X_given_Y)

##    
##             D         E
##   A 0.5000000 0.2500000
##   B 0.3333333 0.2500000
##   C 0.1666667 0.5000000

Correlation Analysis Using mtcars Dataset

Examine relationships between continuous variables using the mtcars dataset.

library(GGally)
library(tidyverse)

# Select continuous variables
mtcars_1 <- tibble(mtcars) %>%
  dplyr::select(mpg, disp, hp, drat, wt, qsec)

# Create pairs plot
ggpairs(mtcars_1)

# Calculate covariance and correlation matrices
cov_matrix <- cov(mtcars_1)
cor_matrix <- cor(mtcars_1)
print(cov_matrix)

##              mpg        disp         hp         drat          wt         qsec
## mpg    36.324103  -633.09721 -320.73206   2.19506351  -5.1166847   4.50914919
## disp -633.097208 15360.79983 6721.15867 -47.06401915 107.6842040 -96.05168145
## hp   -320.732056  6721.15867 4700.86694 -16.45110887  44.1926613 -86.77008065
## drat    2.195064   -47.06402  -16.45111   0.28588135  -0.3727207   0.08714073
## wt     -5.116685   107.68420   44.19266  -0.37272073   0.9573790  -0.30548161
## qsec    4.509149   -96.05168  -86.77008   0.08714073  -0.3054816   3.19316613

print(cor_matrix)

##             mpg       disp         hp        drat         wt        qsec
## mpg   1.0000000 -0.8475514 -0.7761684  0.68117191 -0.8676594  0.41868403
## disp -0.8475514  1.0000000  0.7909486 -0.71021393  0.8879799 -0.43369788
## hp   -0.7761684  0.7909486  1.0000000 -0.44875912  0.6587479 -0.70822339
## drat  0.6811719 -0.7102139 -0.4487591  1.00000000 -0.7124406  0.09120476
## wt   -0.8676594  0.8879799  0.6587479 -0.71244065  1.0000000 -0.17471588
## qsec  0.4186840 -0.4336979 -0.7082234  0.09120476 -0.1747159  1.00000000

Density Plots for Continuous Variables

Independent Variables

library(ggplot2)
set.seed(123)
x1 <- rnorm(1000)
x2 <- rnorm(1000)
df <- data.frame(x1 = x1, x2 = x2)

ggplot(df, aes(x = x1, y = x2)) +
  geom_density_2d_filled() +
  labs(title = "2D Density Plot - Independent Variables")

Correlated Variables

library(MASS)
set.seed(123)
corr_matrix <- matrix(c(1, 0.7, 0.7, 1), 2, 2)
data <- mvrnorm(n = 1000, mu = c(0, 0), Sigma = corr_matrix)
df <- data.frame(x1 = data[,1], x2 = data[,2])

ggplot(df, aes(x = x1, y = x2)) +
  geom_density_2d_filled() +
  labs(title = "2D Density Plot - Correlated Variables")

Kernel Density Estimation (KDE)

Univariate KDE

set.seed(123)
data <- rnorm(1000)
density_est <- density(data)

ggplot() +
  geom_histogram(aes(x = data, y = ..density..), bins = 30) +
  geom_line(aes(x = density_est$x, y = density_est$y), color = "red") +
  theme_minimal() +
  labs(title = "Gaussian KDE - Univariate")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Bivariate KDE

set.seed(123)
x <- rnorm(1000)
y <- 0.7 * x + rnorm(1000, 0, 0.5)
kde <- kde2d(x, y, n = 100)

image(kde, col = viridis(50))
points(x, y, pch = ".", col = "white", alpha = 0.1)

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "alpha" is not a
## graphical parameter

title("Bivariate Gaussian KDE")

Inverse Transform Sampling

Theoretical Distribution

mu <- 2
sigma <- 3
set.seed(123)
u <- runif(100000)
x <- qnorm(u, mean = mu, sd = sigma)

hist(x, prob = TRUE, breaks = 30)
curve(dnorm(x, mean = mu, sd = sigma), add = TRUE, col = "red")

Sampling from Empirical Distribution

set.seed(123)
real_data <- rgamma(1000, shape=2, scale=2)
kde <- density(real_data)

x_grid <- kde$x
dx <- diff(x_grid)[1]
cdf <- cumsum(kde$y) * dx
cdf <- cdf/max(cdf)

u <- runif(1000)
sampled_data <- approx(cdf, x_grid, u)$y

df <- data.frame(
  value = c(real_data, sampled_data),
  type = rep(c("Original", "Sampled"), each=1000)
)

ggplot(df, aes(x=value, fill=type)) +
  geom_histogram(alpha=0.5, position="identity", bins=30) +
  theme_minimal()

Bivariate Sampling

set.seed(123)
x <- rnorm(1000)
y <- x * 0.5 + rnorm(1000, 0, 0.5)

kde2d_est <- kde2d(x, y, n=50)
x_grid <- kde2d_est$x
y_grid <- kde2d_est$y
Z <- kde2d_est$z

n_samples <- 1000
samples <- matrix(0, nrow=2, ncol=n_samples)

for(i in 1:n_samples) {
  x_pdf <- apply(Z, 2, sum)
  x_cdf <- cumsum(x_pdf)/sum(x_pdf)
  u1 <- runif(1)
  x_sample <- approx(x_cdf, x_grid, u1)$y
  
  idx <- which.min(abs(x_grid - x_sample))
  y_pdf <- Z[,idx]
  y_cdf <- cumsum(y_pdf)/sum(y_pdf)
  u2 <- runif(1)
  y_sample <- approx(y_cdf, y_grid, u2)$y
  
  samples[,i] <- c(x_sample, y_sample)
}

plot(x, y, col=alpha("black", 0.5), main="Joint Distribution")
points(samples[1,], samples[2,], col=alpha("red", 0.5))
legend("topright", c("Original", "Sampled"), col=c("black", "red"), pch=1)

Review of Multiple Variables

Miao Yu

2025-01-23

Joint, Marginal, and Conditional Probability Mass Functions (PMF)

Correlation Analysis Using mtcars Dataset

Density Plots for Continuous Variables

Independent Variables

Correlated Variables

Kernel Density Estimation (KDE)

Univariate KDE

Bivariate KDE

Inverse Transform Sampling

Theoretical Distribution

Sampling from Empirical Distribution

Bivariate Sampling