# Load required libraries
library(tidyverse) # For data manipulation and ggplot2
library(GGally) # For ggpairs
library(MASS) # For mvrnorm and kde2d
library(viridis) # For color palettes
First, let’s examine discrete probability distributions using categorical variables.
# Create example dataset with categorical variables
data <- data.frame(
X = c("A", "A", "B", "A", "B", "C", "C", "C", "A", "B"),
Y = c("D", "E", "D", "D", "E", "D", "E", "E", "D", "D"))
# Joint PMF calculation
joint_counts <- table(data$X, data$Y)
joint_pmf <- prop.table(joint_counts)
print(joint_pmf)
##
## D E
## A 0.3 0.1
## B 0.2 0.1
## C 0.1 0.2
# Marginal PMFs
marginal_X <- margin.table(joint_pmf, 1)
marginal_Y <- margin.table(joint_pmf, 2)
print(marginal_X)
##
## A B C
## 0.4 0.3 0.3
print(marginal_Y)
##
## D E
## 0.6 0.4
# Conditional PMFs
conditional_pmf_Y_given_X <- prop.table(joint_counts, margin = 1)
conditional_pmf_X_given_Y <- prop.table(joint_counts, margin = 2)
print(conditional_pmf_Y_given_X)
##
## D E
## A 0.7500000 0.2500000
## B 0.6666667 0.3333333
## C 0.3333333 0.6666667
print(conditional_pmf_X_given_Y)
##
## D E
## A 0.5000000 0.2500000
## B 0.3333333 0.2500000
## C 0.1666667 0.5000000
Examine relationships between continuous variables using the mtcars dataset.
library(GGally)
library(tidyverse)
# Select continuous variables
mtcars_1 <- tibble(mtcars) %>%
dplyr::select(mpg, disp, hp, drat, wt, qsec)
# Create pairs plot
ggpairs(mtcars_1)
# Calculate covariance and correlation matrices
cov_matrix <- cov(mtcars_1)
cor_matrix <- cor(mtcars_1)
print(cov_matrix)
## mpg disp hp drat wt qsec
## mpg 36.324103 -633.09721 -320.73206 2.19506351 -5.1166847 4.50914919
## disp -633.097208 15360.79983 6721.15867 -47.06401915 107.6842040 -96.05168145
## hp -320.732056 6721.15867 4700.86694 -16.45110887 44.1926613 -86.77008065
## drat 2.195064 -47.06402 -16.45111 0.28588135 -0.3727207 0.08714073
## wt -5.116685 107.68420 44.19266 -0.37272073 0.9573790 -0.30548161
## qsec 4.509149 -96.05168 -86.77008 0.08714073 -0.3054816 3.19316613
print(cor_matrix)
## mpg disp hp drat wt qsec
## mpg 1.0000000 -0.8475514 -0.7761684 0.68117191 -0.8676594 0.41868403
## disp -0.8475514 1.0000000 0.7909486 -0.71021393 0.8879799 -0.43369788
## hp -0.7761684 0.7909486 1.0000000 -0.44875912 0.6587479 -0.70822339
## drat 0.6811719 -0.7102139 -0.4487591 1.00000000 -0.7124406 0.09120476
## wt -0.8676594 0.8879799 0.6587479 -0.71244065 1.0000000 -0.17471588
## qsec 0.4186840 -0.4336979 -0.7082234 0.09120476 -0.1747159 1.00000000
library(ggplot2)
set.seed(123)
x1 <- rnorm(1000)
x2 <- rnorm(1000)
df <- data.frame(x1 = x1, x2 = x2)
ggplot(df, aes(x = x1, y = x2)) +
geom_density_2d_filled() +
labs(title = "2D Density Plot - Independent Variables")
set.seed(123)
data <- rnorm(1000)
density_est <- density(data)
ggplot() +
geom_histogram(aes(x = data, y = ..density..), bins = 30) +
geom_line(aes(x = density_est$x, y = density_est$y), color = "red") +
theme_minimal() +
labs(title = "Gaussian KDE - Univariate")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
set.seed(123)
x <- rnorm(1000)
y <- 0.7 * x + rnorm(1000, 0, 0.5)
kde <- kde2d(x, y, n = 100)
image(kde, col = viridis(50))
points(x, y, pch = ".", col = "white", alpha = 0.1)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "alpha" is not a
## graphical parameter
title("Bivariate Gaussian KDE")
mu <- 2
sigma <- 3
set.seed(123)
u <- runif(100000)
x <- qnorm(u, mean = mu, sd = sigma)
hist(x, prob = TRUE, breaks = 30)
curve(dnorm(x, mean = mu, sd = sigma), add = TRUE, col = "red")
set.seed(123)
real_data <- rgamma(1000, shape=2, scale=2)
kde <- density(real_data)
x_grid <- kde$x
dx <- diff(x_grid)[1]
cdf <- cumsum(kde$y) * dx
cdf <- cdf/max(cdf)
u <- runif(1000)
sampled_data <- approx(cdf, x_grid, u)$y
df <- data.frame(
value = c(real_data, sampled_data),
type = rep(c("Original", "Sampled"), each=1000)
)
ggplot(df, aes(x=value, fill=type)) +
geom_histogram(alpha=0.5, position="identity", bins=30) +
theme_minimal()
set.seed(123)
x <- rnorm(1000)
y <- x * 0.5 + rnorm(1000, 0, 0.5)
kde2d_est <- kde2d(x, y, n=50)
x_grid <- kde2d_est$x
y_grid <- kde2d_est$y
Z <- kde2d_est$z
n_samples <- 1000
samples <- matrix(0, nrow=2, ncol=n_samples)
for(i in 1:n_samples) {
x_pdf <- apply(Z, 2, sum)
x_cdf <- cumsum(x_pdf)/sum(x_pdf)
u1 <- runif(1)
x_sample <- approx(x_cdf, x_grid, u1)$y
idx <- which.min(abs(x_grid - x_sample))
y_pdf <- Z[,idx]
y_cdf <- cumsum(y_pdf)/sum(y_pdf)
u2 <- runif(1)
y_sample <- approx(y_cdf, y_grid, u2)$y
samples[,i] <- c(x_sample, y_sample)
}
plot(x, y, col=alpha("black", 0.5), main="Joint Distribution")
points(samples[1,], samples[2,], col=alpha("red", 0.5))
legend("topright", c("Original", "Sampled"), col=c("black", "red"), pch=1)