library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggeffects)Statistical Distributions for Simulation in R
PUST Campus
Loading Package in R
Reproducibility and set.seed()
# Without a seed: different results every time
sample(1:100, 5)[1] 29 50 24 49 8
sample(1:100, 5)[1] 39 68 73 99 19
set.seed(2026) # reset to the same seed
sample(1:100, 5) # same result as above[1] 93 97 38 45 91
# Recommended practice: one seed at the top of the script
set.seed(2026)
# From this point on, all random calls are reproducible
x <- rnorm(100)
y <- sample(letters, 10)
z <- rbinom(50, size = 1, prob = 0.3)
head(x, 5); y; head(z, 5)[1] 0.52058907 -1.07969076 0.13923812 -0.08474878 -0.66663962
[1] "x" "c" "k" "v" "p" "f" "u" "a" "d" "m"
[1] 1 0 0 0 0
Simulating from Statistical Distributions
R distribution function naming convention
| Prefix | Function | Example |
|---|---|---|
r |
Random samples | rnorm(n, mean, sd) |
d |
Density (PDF/PMF) | dnorm(x, mean, sd) |
p |
Cumulative probability (CDF) | pnorm(q, mean, sd) |
q |
Quantile (inverse CDF) | qnorm(p, mean, sd) |
The following table summarises which distributions are most useful for linguistic data:
| Distribution guide for linguistic data simulation | ||
| Distribution | R function | Typical linguistic use |
| Normal | rnorm() |
Pitch, formants, log-transformed RT, log word frequency |
| Log-normal | exp(rnorm()) |
Raw reaction times, response latencies |
| Binomial | rbinom() |
Accuracy (correct/incorrect), binary grammaticality judgements |
| Poisson | rpois() |
Error counts, disfluency counts, word occurrences in short texts |
| Negative binomial | rnbinom() |
Overdispersed counts: word frequencies, collocation counts |
| Uniform | runif() |
Ages, random positions, presentation order |
| Beta | rbeta() |
Proportions bounded in (0, 1): accent ratings, similarity scores |
| Gamma | rgamma() |
Skewed positive continuous: inter-pausal intervals, vowel duration |
Normal Distribution
set.seed(2026)
# Simulate log-transformed reaction times
# Mean ~6.4 ≈ exp(6.4) ≈ 600 ms; SD = 0.3 on the log scale
log_rt <- rnorm(n = 500, mean = 6.4, sd = 0.3)
rt_ms <- exp(log_rt) # back-transform to milliseconds
data.frame(log_rt = log_rt, rt_ms = rt_ms) |>
ggplot(aes(x = rt_ms)) +
geom_histogram(aes(y = after_stat(density)),
bins = 30, fill = "steelblue", color = "white", alpha = 0.8) +
geom_density(color = "firebrick", linewidth = 1) +
theme_bw() +
labs(title = "Simulated reaction times (log-normal distribution)",
subtitle = "n = 500 | Mean ≈ 600 ms",
x = "Reaction time (ms)", y = "Density")cat(sprintf("Mean RT: %.1f ms | SD: %.1f ms | Range: %.0f–%.0f ms\n",
mean(rt_ms), sd(rt_ms), min(rt_ms), max(rt_ms)))Mean RT: 637.9 ms | SD: 198.2 ms | Range: 280–1545 ms
Binomial Distribution
set.seed(2026)
# Simulate accuracy in a lexical decision task
# 100 participants, 80 trials each, average accuracy 85%
n_participants <- 100
n_trials <- 80
accuracy_prob <- 0.85
n_correct <- rbinom(n = n_participants, size = n_trials, prob = accuracy_prob)
accuracy <- n_correct / n_trials
data.frame(accuracy = accuracy) |>
ggplot(aes(x = accuracy)) +
geom_histogram(binwidth = 0.02, fill = "steelblue", color = "white", alpha = 0.8) +
geom_vline(xintercept = mean(accuracy), color = "firebrick",
linetype = "dashed", linewidth = 1) +
theme_bw() +
labs(title = "Simulated accuracy in a lexical decision task",
subtitle = sprintf("n = %d | %d trials | P(correct) = %.2f | Mean = %.3f",
n_participants, n_trials, accuracy_prob, mean(accuracy)),
x = "Accuracy", y = "Count")Poisson Distribution
set.seed(2026)
# Simulate disfluency counts per minute for 200 speakers
corrections <- rpois(n = 200, lambda = 1.8)
data.frame(corrections = corrections) |>
ggplot(aes(x = corrections)) +
geom_bar(fill = "steelblue", color = "white", alpha = 0.8) +
theme_bw() +
labs(title = "Simulated disfluency counts per minute (Poisson)",
subtitle = sprintf("n = 200 | λ = 1.8 | Mean = %.2f | Variance = %.2f",
mean(corrections), var(corrections)),
x = "Disfluencies per minute", y = "Count")Negative Binomial Distribution
set.seed(2026)
# Compare Poisson vs. Negative Binomial: same mean, different variance
pois_counts <- rpois(n = 500, lambda = 3.0)
nb_counts <- rnbinom(n = 500, mu = 3.0, size = 0.8) # size controls overdispersion
cat(sprintf("Poisson: Mean = %.2f | Variance = %.2f\n",
mean(pois_counts), var(pois_counts)))Poisson: Mean = 2.90 | Variance = 2.73
cat(sprintf("Neg. Binom: Mean = %.2f | Variance = %.2f\n",
mean(nb_counts), var(nb_counts)))Neg. Binom: Mean = 2.79 | Variance = 13.41
data.frame(
count = c(pois_counts, nb_counts),
dist = rep(c("Poisson (λ=3)", "Neg. Binomial (μ=3, size=0.8)"), each = 500)
) |>
ggplot(aes(x = count, fill = dist)) +
geom_bar(position = "dodge", alpha = 0.8, color = "white") +
scale_fill_manual(values = c("steelblue", "firebrick")) +
theme_bw() + theme(legend.position = "top") +
labs(title = "Poisson vs. Negative Binomial: same mean, very different variance",
x = "Count", y = "Frequency", fill = "")Beta Distribution
set.seed(2026)
# Simulate accent ratings (0 = native-like, 1 = heavily accented)
# Beta(2, 5): right-skewed, most ratings near the native-like end
# Beta(5, 2): left-skewed, most ratings toward heavily-accented end
# Beta(2, 2): symmetric, centred at 0.5
ratings_native <- rbeta(200, shape1 = 2, shape2 = 5)
ratings_l2 <- rbeta(200, shape1 = 5, shape2 = 2)
data.frame(
rating = c(ratings_native, ratings_l2),
group = rep(c("L1 speakers", "Advanced L2 speakers"), each = 200)
) |>
ggplot(aes(x = rating, fill = group)) +
geom_histogram(binwidth = 0.05, position = "identity", alpha = 0.6, color = "white") +
scale_fill_manual(values = c("steelblue", "firebrick")) +
theme_bw() + theme(legend.position = "top") +
labs(title = "Simulated accent ratings (Beta distribution)",
subtitle = "L1: Beta(2,5) — mostly native-like | L2: Beta(5,2) — more accented",
x = "Accent rating (0 = native-like, 1 = heavily accented)", y = "Count",
fill = "")Gamma Distribution
set.seed(2026)
# Simulate inter-pausal intervals in spontaneous speech (ms)
# shape = 2, rate = 0.01 → mean = shape/rate = 200 ms
pauses <- rgamma(n = 500, shape = 2, rate = 0.01)
data.frame(pause_ms = pauses) |>
ggplot(aes(x = pause_ms)) +
geom_histogram(aes(y = after_stat(density)),
bins = 40, fill = "steelblue", color = "white", alpha = 0.8) +
geom_density(color = "firebrick", linewidth = 1) +
theme_bw() +
labs(title = "Simulated inter-pausal intervals (Gamma distribution)",
subtitle = sprintf("n = 500 | Mean = %.0f ms | SD = %.0f ms",
mean(pauses), sd(pauses)),
x = "Inter-pausal interval (ms)", y = "Density")Basic Simulation models in R
Simulate data
set.seed(123)
n <- 1000 # sample size
# Predictor variables
x1 <- rnorm(n, mean = 50, sd = 10) # continuous predictor (Normal)
x2 <- rbinom(n, size = 1, prob = 0.4) # binary predictor (Binomial)
x3 <- rpois(n, lambda = 3) # count predictor (Poisson)
income <- factor(sample(c("Low", "Medium", "High"),
n,
replace = TRUE),
levels = c("Low", "Medium", "High"))
income<-relevel(income, ref = "Low")
df=data.frame(x1,x2,x3,income)Linear Regression (Normal outcome)
# Simulate outcome (Normal)
y_normal <- 5 + 0.3*x1 - 2*x2 + 1.5*x3 + rnorm(n, 0, 5)
df <- data.frame(
y_normal = y_normal,
x1 = x1,
x2 = x2,
x3 = x3,
income = income
)
# Fit linear regression
model_lm <- lm(y_normal ~ x1 + x2 + x3+income,
data=df)
summary(model_lm)
Call:
lm(formula = y_normal ~ x1 + x2 + x3 + income, data = df)
Residuals:
Min 1Q Median 3Q Max
-15.2500 -3.1279 -0.2598 3.3663 17.0741
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.65952 0.90488 6.254 5.92e-10 ***
x1 0.27823 0.01621 17.162 < 2e-16 ***
x2 -1.82945 0.32669 -5.600 2.77e-08 ***
x3 1.59377 0.09121 17.474 < 2e-16 ***
incomeMedium -0.09679 0.38615 -0.251 0.802
incomeHigh 0.25520 0.41090 0.621 0.535
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.069 on 994 degrees of freedom
Multiple R-squared: 0.4043, Adjusted R-squared: 0.4013
F-statistic: 134.9 on 5 and 994 DF, p-value: < 2.2e-16
levels(income)[1] "Low" "Medium" "High"
Logistic Regression (Binomial outcome)
# Generate probability using logistic function
logit_p <- -2 + 0.05*x1 - 1*x2 + 0.3*x3
p <- 1 / (1 + exp(-logit_p))
# Simulate binary outcome
y_binomial <- rbinom(n, size = 1, prob = p)
df <- data.frame(
y_binomial = y_binomial,
x1 = x1,
x2 = x2,
x3 = x3,
income = income
)
# Fit logistic regression
model_logit <- glm(y_binomial ~ x1 + x2 + x3+income,
data = df,
family = binomial)
summary(model_logit)
Call:
glm(formula = y_binomial ~ x1 + x2 + x3 + income, family = binomial,
data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.405542 0.440341 -5.463 4.68e-08 ***
x1 0.054492 0.008081 6.743 1.55e-11 ***
x2 -0.709725 0.153968 -4.610 4.04e-06 ***
x3 0.355721 0.049688 7.159 8.12e-13 ***
incomeMedium 0.139281 0.185517 0.751 0.453
incomeHigh 0.032829 0.193867 0.169 0.866
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1166.5 on 999 degrees of freedom
Residual deviance: 1032.8 on 994 degrees of freedom
AIC: 1044.8
Number of Fisher Scoring iterations: 4
Prediction of logistic regression
# Prediction of x1 only
pred_1 <- ggpredict(model_logit, terms = c("x1 [all]"))# Plot for predicted value
plot(pred_1)# Prediction of x1 by income
pred_2 <- ggpredict(model_logit, terms = c("x1 [all]", "income"))# Plot for predicted value
plot(pred_2)Poisson Regression (Count outcome)
# Generate lambda (mean count)
lambda <- exp(1 + 0.02*x1 - 0.5*x2 + 0.2*x3)
# Simulate count outcome
y_poisson <- rpois(n, lambda = lambda)
df <- data.frame(
y_poisson = y_poisson,
x1 = x1,
x2 = x2,
x3 = x3,
income = income
)
# Fit Poisson regression
model_pois <- glm(y_poisson ~ x1 + x2 + x3+income,
data=df,
family = poisson)
summary(model_pois)
Call:
glm(formula = y_poisson ~ x1 + x2 + x3 + income, family = poisson,
data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.0231294 0.0534249 19.151 <2e-16 ***
x1 0.0196522 0.0009267 21.206 <2e-16 ***
x2 -0.5188986 0.0197615 -26.258 <2e-16 ***
x3 0.2003402 0.0046288 43.281 <2e-16 ***
incomeMedium 0.0220325 0.0212851 1.035 0.301
incomeHigh -0.0038863 0.0230702 -0.168 0.866
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 4202.74 on 999 degrees of freedom
Residual deviance: 961.01 on 994 degrees of freedom
AIC: 5181.6
Number of Fisher Scoring iterations: 4