R Markdown

The R code for the Binomial distribution is divided into two main sections to showcase the Probability Mass Function (PMF) and the Cumulative Distribution Function (CDF).

The PMF section uses the built-in dbinom() function in R to calculate the probability of getting an exact number of successes. The code calculates the probability of getting exactly 4 successes in 10 trials, which aligns with the blog post’s explanation of the PMF. The accompanying plot shows this as individual points on a graph, visually representing the discrete probabilities.

The CDF section uses the pbinom() function, which calculates the cumulative probability, that is, the probability of getting at most a certain number of successes. This section calculates the probability of getting 4 or fewer successes. The CDF plot then illustrates this concept as a step function, where the height of each step represents the total accumulated probability up to that point.

# --- Required Libraries ---
# Install these packages if you haven't already
# install.packages("ggplot2")
# install.packages("dplyr")

# Load the required libraries for plotting and data manipulation
library(ggplot2)
library(dplyr)

# --- Parameters for the Binomial Distribution ---
#
# n: how many independent trials are run.
# p: chance of success on any single trial.
#
n <- 10
p <- 0.4

# --- Working with the Binomial Distribution ---

# Probability Mass Function (PMF)
# dbinom() gives P(X = k), the probability of exactly 'k' successes.
# Example: exactly 4 successes in the 10 trials defined above.
k_pmf <- 4
probability_pmf <- dbinom(k_pmf, n, p)
cat(
  "PMF: Probability of exactly", k_pmf,
  "successes is", round(probability_pmf, digits = 4), "\n"
)
## PMF: Probability of exactly 4 successes is 0.2508

# Cumulative Distribution Function (CDF)
# pbinom() gives P(X <= k), the probability of 'k' or fewer successes.
# Example: at most 4 successes in the 10 trials defined above.
k_cdf <- 4
probability_cdf <- pbinom(k_cdf, n, p)
cat(
  "CDF: Probability of at most", k_cdf,
  "successes is", round(probability_cdf, digits = 4), "\n"
)
## CDF: Probability of at most 4 successes is 0.6331
# --- Visualizing the Binomial Distribution ---

# Theoretical PMF evaluated at every possible outcome 0, 1, ..., n.
# These rows supply the dots drawn over the histogram.
theoretical_data <- data.frame(k = 0:n)
theoretical_data$probability <- dbinom(
  theoretical_data$k,
  size = n,
  prob = p
)

# Draw 1000 random values from the same binomial distribution so the
# observed frequencies can be compared with the theoretical probabilities.
# The seed is fixed so that every run produces the same sample.
set.seed(42)
simulated_data <- rbinom(1000, size = n, prob = p)
simulated_df <- data.frame(k = simulated_data)

# Create the combined plot: histogram, dots, and connected lines.
# Reads `simulated_df` (column k) for the histogram and `theoretical_data`
# (columns k, probability) for the overlay; `n` and `p` appear in the title.
ggplot(simulated_df, aes(x = k)) +
  # Histogram of the simulated draws. after_stat() rescales the bin counts
  # to proportions so the bars share a y-scale with the probabilities.
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    binwidth = 1, fill = "#a3e635", color = "#4d7c0f", alpha = 0.7
  ) +
  # Theoretical probabilities drawn as dots on top of the bars
  geom_point(
    data = theoretical_data, aes(x = k, y = probability),
    color = "#ef4444", size = 4
  ) +
  # Lines connecting the theoretical dots to show the trend.
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated as of ggplot2 3.4.0 and triggers a warning.
  geom_line(
    data = theoretical_data, aes(x = k, y = probability),
    color = "#ef4444", linewidth = 1
  ) +
  labs(
    # paste0() joins without a separator; paste() would insert extra
    # spaces and render the title as "(n =  10 , p =  0.4 )".
    title = paste0("Binomial Distribution (n = ", n, ", p = ", p, ")"),
    subtitle = "Theoretical PMF vs. Simulated Data Histogram",
    x = "Number of Successes (k)",
    y = "Probability / Frequency"
  ) +
  theme_minimal() +
  # Center the title and subtitle above the panel
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )

# --- Visualizing the Cumulative Distribution Function (CDF) ---
#
# The plot built from this data shows cumulative probability. The
# theoretical CDF is a step function; the empirical CDF of the simulated
# data is drawn alongside it for comparison.
#

# Theoretical CDF: P(X <= k) for every possible outcome k = 0, 1, ..., n
theoretical_cdf_data <- data.frame(k = 0:n)
theoretical_cdf_data$cumulative_probability <- pbinom(
  theoretical_cdf_data$k,
  size = n,
  prob = p
)

# Create the combined CDF plot: theoretical and empirical step functions,
# each marked with points.
# Reads `simulated_df` (column k) for the empirical CDF and
# `theoretical_cdf_data` (columns k, cumulative_probability) for the
# theoretical one; `n` and `p` appear in the title.
ggplot(simulated_df, aes(x = k)) +
  # Empirical cumulative distribution of the simulated draws.
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated as of ggplot2 3.4.0 and triggers a warning.
  stat_ecdf(geom = "step", color = "#a3e635", linewidth = 1.2) +
  # Points on the empirical CDF. pad = FALSE drops the (-Inf, 0) and
  # (Inf, 1) padding rows, which would otherwise be drawn as stray points
  # at the panel edges.
  stat_ecdf(geom = "point", color = "#a3e635", size = 3, pad = FALSE) +
  # Theoretical CDF as a step function in a contrasting color
  geom_step(
    data = theoretical_cdf_data,
    aes(x = k, y = cumulative_probability),
    color = "#ef4444", linewidth = 1.2
  ) +
  geom_point(
    data = theoretical_cdf_data,
    aes(x = k, y = cumulative_probability),
    color = "#ef4444", size = 4
  ) +
  labs(
    # paste0() joins without a separator; paste() would insert extra
    # spaces and render the title as "(n =  10 , p =  0.4 )".
    title = paste0(
      "Binomial Cumulative Distribution Function (n = ", n,
      ", p = ", p, ")"
    ),
    subtitle = "Theoretical vs. Empirical CDF",
    x = "Number of Successes (k)",
    y = "Cumulative Probability"
  ) +
  theme_minimal() +
  # Center the title and subtitle above the panel
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )