Statistical Distributions for Simulation in R

Author
Affiliations

Dr. Md. Nasif Hossain

Department of Statistics, Pabna University of Science and Technology, Pabna, Bangladesh

Published

May 1, 2026

PUST Campus

Loading Package in R

library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggeffects)

Reproducibility and set.seed()

# Without a seed: different results every time
sample(1:100, 5)
[1] 29 50 24 49  8
sample(1:100, 5)
[1] 39 68 73 99 19
set.seed(2026)   # reset to the same seed
sample(1:100, 5) # same result as above
[1] 93 97 38 45 91
# Recommended practice: one seed at the top of the script
set.seed(2026)

# From this point on, all random calls are reproducible
x <- rnorm(100)
y <- sample(letters, 10)
z <- rbinom(50, size = 1, prob = 0.3)

head(x, 5); y; head(z, 5)
[1]  0.52058907 -1.07969076  0.13923812 -0.08474878 -0.66663962
 [1] "x" "c" "k" "v" "p" "f" "u" "a" "d" "m"
[1] 1 0 0 0 0

Simulating from Statistical Distributions

R distribution function naming convention

Prefix Function Example
r Random samples rnorm(n, mean, sd)
d Density (PDF/PMF) dnorm(x, mean, sd)
p Cumulative probability (CDF) pnorm(q, mean, sd)
q Quantile (inverse CDF) qnorm(p, mean, sd)

The following table summarises which distributions are most useful for linguistic data:

Distribution guide for linguistic data simulation
Distribution R function Typical linguistic use
Normal rnorm() Pitch, formants, log-transformed RT, log word frequency
Log-normal exp(rnorm()) Raw reaction times, response latencies
Binomial rbinom() Accuracy (correct/incorrect), binary grammaticality judgements
Poisson rpois() Error counts, disfluency counts, word occurrences in short texts
Negative binomial rnbinom() Overdispersed counts: word frequencies, collocation counts
Uniform runif() Ages, random positions, presentation order
Beta rbeta() Proportions bounded in (0, 1): accent ratings, similarity scores
Gamma rgamma() Skewed positive continuous: inter-pausal intervals, vowel duration

Normal Distribution

set.seed(2026)

# Simulate log-transformed reaction times
# Mean ~6.4 ≈ exp(6.4) ≈ 600 ms; SD = 0.3 on the log scale
log_rt <- rnorm(n = 500, mean = 6.4, sd = 0.3)
rt_ms  <- exp(log_rt)   # back-transform to milliseconds

data.frame(log_rt = log_rt, rt_ms = rt_ms) |>
  ggplot(aes(x = rt_ms)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 30, fill = "steelblue", color = "white", alpha = 0.8) +
  geom_density(color = "firebrick", linewidth = 1) +
  theme_bw() +
  labs(title = "Simulated reaction times (log-normal distribution)",
       subtitle = "n = 500 | Mean ≈ 600 ms",
       x = "Reaction time (ms)", y = "Density")

cat(sprintf("Mean RT: %.1f ms | SD: %.1f ms | Range: %.0f–%.0f ms\n",
            mean(rt_ms), sd(rt_ms), min(rt_ms), max(rt_ms)))
Mean RT: 637.9 ms | SD: 198.2 ms | Range: 280–1545 ms

Binomial Distribution

set.seed(2026)

# Simulate accuracy in a lexical decision task
# 100 participants, 80 trials each, average accuracy 85%
n_participants <- 100
n_trials       <- 80
accuracy_prob  <- 0.85

n_correct <- rbinom(n = n_participants, size = n_trials, prob = accuracy_prob)
accuracy  <- n_correct / n_trials

data.frame(accuracy = accuracy) |>
  ggplot(aes(x = accuracy)) +
  geom_histogram(binwidth = 0.02, fill = "steelblue", color = "white", alpha = 0.8) +
  geom_vline(xintercept = mean(accuracy), color = "firebrick",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title = "Simulated accuracy in a lexical decision task",
       subtitle = sprintf("n = %d | %d trials | P(correct) = %.2f | Mean = %.3f",
                          n_participants, n_trials, accuracy_prob, mean(accuracy)),
       x = "Accuracy", y = "Count")

Poisson Distribution

set.seed(2026)

# Simulate disfluency counts per minute for 200 speakers
corrections <- rpois(n = 200, lambda = 1.8)

data.frame(corrections = corrections) |>
  ggplot(aes(x = corrections)) +
  geom_bar(fill = "steelblue", color = "white", alpha = 0.8) +
  theme_bw() +
  labs(title = "Simulated disfluency counts per minute (Poisson)",
       subtitle = sprintf("n = 200 | λ = 1.8 | Mean = %.2f | Variance = %.2f",
                          mean(corrections), var(corrections)),
       x = "Disfluencies per minute", y = "Count")

Negative Binomial Distribution

set.seed(2026)

# Compare Poisson vs. Negative Binomial: same mean, different variance
pois_counts <- rpois(n = 500, lambda = 3.0)
nb_counts   <- rnbinom(n = 500, mu = 3.0, size = 0.8)  # size controls overdispersion

cat(sprintf("Poisson:    Mean = %.2f | Variance = %.2f\n",
            mean(pois_counts), var(pois_counts)))
Poisson:    Mean = 2.90 | Variance = 2.73
cat(sprintf("Neg. Binom: Mean = %.2f | Variance = %.2f\n",
            mean(nb_counts), var(nb_counts)))
Neg. Binom: Mean = 2.79 | Variance = 13.41
data.frame(
  count = c(pois_counts, nb_counts),
  dist  = rep(c("Poisson (λ=3)", "Neg. Binomial (μ=3, size=0.8)"), each = 500)
) |>
  ggplot(aes(x = count, fill = dist)) +
  geom_bar(position = "dodge", alpha = 0.8, color = "white") +
  scale_fill_manual(values = c("steelblue", "firebrick")) +
  theme_bw() + theme(legend.position = "top") +
  labs(title = "Poisson vs. Negative Binomial: same mean, very different variance",
       x = "Count", y = "Frequency", fill = "")

Beta Distribution

set.seed(2026)

# Simulate accent ratings (0 = native-like, 1 = heavily accented)
# Beta(2, 5): right-skewed, most ratings near the native-like end
# Beta(5, 2): left-skewed, most ratings toward heavily-accented end
# Beta(2, 2): symmetric, centred at 0.5

ratings_native  <- rbeta(200, shape1 = 2, shape2 = 5)
ratings_l2      <- rbeta(200, shape1 = 5, shape2 = 2)

data.frame(
  rating = c(ratings_native, ratings_l2),
  group  = rep(c("L1 speakers", "Advanced L2 speakers"), each = 200)
) |>
  ggplot(aes(x = rating, fill = group)) +
  geom_histogram(binwidth = 0.05, position = "identity", alpha = 0.6, color = "white") +
  scale_fill_manual(values = c("steelblue", "firebrick")) +
  theme_bw() + theme(legend.position = "top") +
  labs(title = "Simulated accent ratings (Beta distribution)",
       subtitle = "L1: Beta(2,5) — mostly native-like | L2: Beta(5,2) — more accented",
       x = "Accent rating (0 = native-like, 1 = heavily accented)", y = "Count",
       fill = "")

Gamma Distribution

set.seed(2026)

# Simulate inter-pausal intervals in spontaneous speech (ms)
# shape = 2, rate = 0.01 → mean = shape/rate = 200 ms
pauses <- rgamma(n = 500, shape = 2, rate = 0.01)

data.frame(pause_ms = pauses) |>
  ggplot(aes(x = pause_ms)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40, fill = "steelblue", color = "white", alpha = 0.8) +
  geom_density(color = "firebrick", linewidth = 1) +
  theme_bw() +
  labs(title = "Simulated inter-pausal intervals (Gamma distribution)",
       subtitle = sprintf("n = 500 | Mean = %.0f ms | SD = %.0f ms",
                          mean(pauses), sd(pauses)),
       x = "Inter-pausal interval (ms)", y = "Density")

Basic Simulation models in R

Simulate data

set.seed(123)

n <- 1000   # sample size

# Predictor variables
x1 <- rnorm(n, mean = 50, sd = 10)         # continuous predictor (Normal)
x2 <- rbinom(n, size = 1, prob = 0.4)      # binary predictor (Binomial)
x3 <- rpois(n, lambda = 3)                 # count predictor (Poisson)

income <- factor(sample(c("Low", "Medium", "High"),
                    n,
                    replace = TRUE),
             levels = c("Low", "Medium", "High"))

income<-relevel(income, ref = "Low")

df=data.frame(x1,x2,x3,income)

Linear Regression (Normal outcome)

# Simulate outcome (Normal)
y_normal <- 5 + 0.3*x1 - 2*x2 + 1.5*x3 + rnorm(n, 0, 5)

df <- data.frame(
  y_normal = y_normal,
  x1 = x1,
  x2 = x2,
  x3 = x3,
  income = income
)

# Fit linear regression
model_lm <- lm(y_normal ~ x1 + x2 + x3+income,
               data=df)

summary(model_lm)

Call:
lm(formula = y_normal ~ x1 + x2 + x3 + income, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.2500  -3.1279  -0.2598   3.3663  17.0741 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.65952    0.90488   6.254 5.92e-10 ***
x1            0.27823    0.01621  17.162  < 2e-16 ***
x2           -1.82945    0.32669  -5.600 2.77e-08 ***
x3            1.59377    0.09121  17.474  < 2e-16 ***
incomeMedium -0.09679    0.38615  -0.251    0.802    
incomeHigh    0.25520    0.41090   0.621    0.535    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.069 on 994 degrees of freedom
Multiple R-squared:  0.4043,    Adjusted R-squared:  0.4013 
F-statistic: 134.9 on 5 and 994 DF,  p-value: < 2.2e-16
levels(income)
[1] "Low"    "Medium" "High"  

Logistic Regression (Binomial outcome)

# Generate probability using logistic function
logit_p <- -2 + 0.05*x1 - 1*x2 + 0.3*x3
p <- 1 / (1 + exp(-logit_p))

# Simulate binary outcome
y_binomial <- rbinom(n, size = 1, prob = p)

df <- data.frame(
  y_binomial = y_binomial,
  x1 = x1,
  x2 = x2,
  x3 = x3,
  income = income
)

# Fit logistic regression
model_logit <- glm(y_binomial ~ x1 + x2 + x3+income,
                   data = df,
                   family = binomial)

summary(model_logit)

Call:
glm(formula = y_binomial ~ x1 + x2 + x3 + income, family = binomial, 
    data = df)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -2.405542   0.440341  -5.463 4.68e-08 ***
x1            0.054492   0.008081   6.743 1.55e-11 ***
x2           -0.709725   0.153968  -4.610 4.04e-06 ***
x3            0.355721   0.049688   7.159 8.12e-13 ***
incomeMedium  0.139281   0.185517   0.751    0.453    
incomeHigh    0.032829   0.193867   0.169    0.866    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1166.5  on 999  degrees of freedom
Residual deviance: 1032.8  on 994  degrees of freedom
AIC: 1044.8

Number of Fisher Scoring iterations: 4

Prediction of logistic regression

# Prediction of x1 only
pred_1 <- ggpredict(model_logit, terms = c("x1 [all]"))
# Plot for predicted value 
plot(pred_1)

# Prediction of x1 by income 
pred_2 <- ggpredict(model_logit, terms = c("x1 [all]", "income"))
# Plot for predicted value

plot(pred_2)

Poisson Regression (Count outcome)

# Generate lambda (mean count)
lambda <- exp(1 + 0.02*x1 - 0.5*x2 + 0.2*x3)

# Simulate count outcome
y_poisson <- rpois(n, lambda = lambda)

df <- data.frame(
  y_poisson = y_poisson,
  x1 = x1,
  x2 = x2,
  x3 = x3,
  income = income
)

# Fit Poisson regression
model_pois <- glm(y_poisson ~ x1 + x2 + x3+income, 
                  data=df,
                  family = poisson)

summary(model_pois)

Call:
glm(formula = y_poisson ~ x1 + x2 + x3 + income, family = poisson, 
    data = df)

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.0231294  0.0534249  19.151   <2e-16 ***
x1            0.0196522  0.0009267  21.206   <2e-16 ***
x2           -0.5188986  0.0197615 -26.258   <2e-16 ***
x3            0.2003402  0.0046288  43.281   <2e-16 ***
incomeMedium  0.0220325  0.0212851   1.035    0.301    
incomeHigh   -0.0038863  0.0230702  -0.168    0.866    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4202.74  on 999  degrees of freedom
Residual deviance:  961.01  on 994  degrees of freedom
AIC: 5181.6

Number of Fisher Scoring iterations: 4