Probability distributions
- Probability distribution - Describes the
probability of each possible outcome in a scenario
- Probability = Area
- Total area = 1
- Discrete distributions - Describe probabilities for
discrete outcomes
- Continuous distributions
- Expected value - Mean of a probability
distribution
- Law of large numbers - As the size of your sample
increases, the sample mean will approach the expected value (theoretical
mean)
Uniform distribution
- Uniform distribution - A distribution that has
constant probability due to equally likely occurring events
- \(U(a, b)\)
- \(a\): minimum
- \(b\): maximum
- Discrete uniform distribution
- Continuous uniform distribution
dunif(x, min = 0, max = 1, log = FALSE)
punif(q, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
qunif(p, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
runif(n, min = 0, max = 1)
# Discrete uniform distributions
die <- data.frame(n = c(1, 2, 3, 4, 5, 6))
# Expected value
mean(die$n)
## [1] 3.5
# Sampling from discrete distributions
rolls_10 <- die %>% sample_n(10, replace = TRUE)
g1 <- ggplot(rolls_10, aes(n)) +
geom_histogram(bins = 6) +
labs(title = "Sample size 10")
mean(rolls_10$n)
## [1] 3.9
rolls_1000 <- die %>% sample_n(1000, replace = TRUE)
g2 <- ggplot(rolls_1000, aes(n)) +
geom_histogram(bins = 6) +
labs(title = "Sample size 1000")
mean(rolls_1000$n)
## [1] 3.5
rolls_100000 <- die %>% sample_n(100000, replace = TRUE)
g3 <- ggplot(rolls_100000, aes(n)) +
geom_histogram(bins = 6) +
labs(title = "Sample size 100000")
mean(rolls_100000$n)
## [1] 3.49046
grid.arrange(g1, g2, g3, nrow = 1)

restaurant_groups <- data.frame(
group_id = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
group_size = c(2, 4, 6, 2, 2, 2, 3, 2, 4, 2)
)
# Create a histogram of group_size
ggplot(restaurant_groups, aes(group_size)) +
geom_histogram(bins = 5)

# Create probability distribution
size_distribution <- restaurant_groups %>%
count(group_size) %>%
mutate(probability = n / sum(n))
size_distribution
# Calculate expected group size
expected_val <- sum(size_distribution$group_size * size_distribution$probability)
expected_val
## [1] 2.9
# Calculate probability of picking group of 4 or more
size_distribution %>%
filter(group_size >= 4) %>%
summarise(prob_4_or_more = sum(probability))
# Continuous uniform distributions
# Min and max wait times for back-up that happens every 30 min
min <- 0
max <- 30
data.frame(
# P(wait time ≤ 5)
prob_less_than_5 = punif(5, min, max),
# P(wait time ≥ 5)
prob_greater_than_5 = punif(5, min, max, lower.tail = FALSE),
# P(10 ≤ wait time ≤ 20)
prob_between_10_and_20 = punif(20, min, max) - punif(10, min, max)
)
wait_times <- data.frame(simulation_nb = c(1:100000))
set.seed(334)
# Generate 100000 wait times between 0 and 30 mins, save in time column
wait_times %>%
mutate(time = runif(100000, min, max)) %>%
# Histogram of simulated times
ggplot(aes(time)) +
geom_histogram(bins = 30)

Binomial distribution
- Binomial distribution - Discrete
probability distribution of the number of successes in a sequence of
independent trials
- \(B(n, p)\)
- \(n\) - Total number of trials -
size
- \(p\) - Probability of success -
prob
dbinom(x, size, prob, log = FALSE)
pbinom(q, size, prob, lower.tail = TRUE, log.p = FALSE)
qbinom(p, size, prob, lower.tail = TRUE, log.p = FALSE)
rbinom(n, size, prob)
- Expected value = \(n\) × \(p\)
# Binomial distribution
# rbinom(# of trials, # of coins, # probability of heads/success)
# A single flip
rbinom(1, 1, 0.5)
## [1] 0
# One flip many times
rbinom(8, 1, 0.5)
## [1] 1 0 0 1 0 0 1 1
# Many flips one time
rbinom(1, 8, 0.5)
## [1] 3
# Many flips many times
rbinom(10, 3, 0.5)
## [1] 1 0 1 1 2 1 3 1 2 1
# P(heads = 7)
# dbinom(num heads, num coins, prob of heads)
dbinom(7, 10, 0.5)
## [1] 0.1171875
# P(heads ≤ 7)
pbinom(7, 10, 0.5)
## [1] 0.9453125
# P(heads > 7)
pbinom(7, 10, 0.5, lower.tail = FALSE)
## [1] 0.0546875
1 - pbinom(7, 10, 0.5)
## [1] 0.0546875
# Simulate a single deal
set.seed(10)
rbinom(1, 1, 0.3)
## [1] 0
# Simulate 1 week of 3 deals
set.seed(10)
rbinom(1, 3, 0.3)
## [1] 1
# Simulate 52 weeks of 3 deals
set.seed(10)
deals <- rbinom(52, 3, 0.3)
# Calculate mean deals won per week
mean(deals)
## [1] 0.8076923
# Probability of closing 3 out of 3 deals
dbinom(3, 3, 0.3)
## [1] 0.027
# Probability of closing <= 1 deal out of 3 deals
pbinom(1, 3, 0.3)
## [1] 0.784
# Probability of closing > 1 deal out of 3 deals
pbinom(1, 3, 0.3, lower.tail = FALSE)
## [1] 0.216
data.frame(n = rbinom(10000, 10, .5)) %>%
ggplot(aes(n)) +
geom_histogram(bins = 11)

Normal distribution
- Normal distribution
- Continuous distribution
- Bell curve
- Symmetrical
- Area = 1
- Curve never hits 0
- \(N(\mu, \sigma)\)
- \(\mu\): Mean
- \(\sigma\): Standard deviation
dnorm(x, mean = 0, sd = 1, log = FALSE)
pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
rnorm(n, mean = 0, sd = 1)
- Standard normal distribution - Normal distribution
with mean 0 and a standard deviation of 1
- 68-95-99-point-7 rule
- 68% falls within 1 standard deviation of the mean
- 95% falls within 2 standard deviations
- 99.7% falls within 3 standard deviations
# Normal distribution
# P (Height < 154 cm)
pnorm(154, mean = 161, sd = 7) # Distribution function
## [1] 0.1586553
# P (Height > 154 cm)
pnorm(154, mean = 161, sd = 7, lower.tail = FALSE)
## [1] 0.8413447
# P (154 cm < Height < 157 cm)
pnorm(157, mean = 161, sd = 7) - pnorm(154, mean = 161, sd = 7)
## [1] 0.1251993
# Height that 90% of women shorter than
qnorm(0.9, mean = 161, sd = 7) # Quantile function
## [1] 169.9709
# Height that 90% of women taller than
qnorm(0.9, mean = 161, sd = 7, lower.tail = FALSE)
## [1] 152.0291
# Generate 10 random heights
rnorm(10, mean = 161, sd = 7) # Generates random deviates
## [1] 165.6326 169.2622 162.9009 157.7776 161.8106 154.9148 172.4883 153.1321
## [9] 158.4192 163.4099
# P (Height = 160)
dnorm(160, mean = 161, sd = 7) # Density
## [1] 0.05641316
# Histogram of amount with 10 bins
ggplot(data = amir_deals, aes(amount)) +
geom_histogram(bins = 10)

# Probability of deal < 7500
pnorm(7500, 5000, 2000)
## [1] 0.8943502
# Probability of deal > 1000
pnorm(1000, 5000, 2000, lower.tail = FALSE)
## [1] 0.9772499
# Probability of deal between 3000 and 7000
pnorm(7000, 5000, 2000) - pnorm(3000, 5000, 2000)
## [1] 0.6826895
# Calculate amount that 75% of deals will be more than
qnorm(0.75, 5000, 2000, lower.tail = FALSE)
## [1] 3651.02
new_sales <- data.frame(sale_num = c(1:36))
# Calculate new average amount
new_mean <- 5000 * 1.2
# Calculate new standard deviation
new_sd <- 2000 * 1.3
# Simulate 36 sales
new_sales <- new_sales %>%
mutate(amount = rnorm(36, new_mean, new_sd))
# Create histogram with 10 bins
ggplot(new_sales, aes(amount)) +
geom_histogram(bins = 10)

Poisson distribution
- Poisson processes - Events appear to happen at a
certain rate, but completely at random
- Examples
- Number of animals adopted from an animal shelter per week
- Number of people arriving at a restaurant per hour
- Poisson distribution - Probability of some # of
events occurring over a fixed period of time
- Examples
- Probability of >= 5 animals adopted from an animal shelter per
week
- Probability of 12 people arriving at a restaurant per hour
- Discrete distribution
- The distribution’s peak is always at its \(\lambda\) value
- \(P(\lambda)\)
- \(\lambda\): Average number of
events per time interval
dpois(x, lambda, log = FALSE)
ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
qpois(p, lambda, lower.tail = TRUE, log.p = FALSE)
rpois(n, lambda)
- Expected value: \(\lambda\)
- Poisson distribution tends to a Normal distribution for a large
parameter lambda (>10)
# Poisson distribution
lambdas <- c(1, 2, 4, 5, 8)
ggplot(data = data.frame(x = 0:10)) +
lapply(lambdas, function(l) geom_point(aes(x, dpois(x, lambda = l), col = factor(l)))) +
lapply(lambdas, function(l) geom_line(aes(x, dpois(x, lambda = l), col = factor(l)), linewidth = 1)) +
labs(title = "Poisson Distribution", x = "Number of Events", y = "Probability") +
guides(color = guide_legend(title = "lambda")) +
theme(text = element_text(size=12))

# P (# adoptions in a week = 5)
dpois(5, lambda = 8)
## [1] 0.09160366
# P (# adoptions in a week ≤ 5)
ppois(5, lambda = 8)
## [1] 0.1912361
# P(# adoptions in a week > 5)
ppois(5, lambda = 8, lower.tail = FALSE)
## [1] 0.8087639
ppois(5, lambda = 10, lower.tail = FALSE)
## [1] 0.932914
# Sampling from a Poisson distribution
rpois(10, lambda = 8)
## [1] 9 11 5 9 12 6 9 11 7 11
sample_means <- data.frame(means = replicate(1000, rpois(10, lambda = 8) %>% mean()))
ggplot(sample_means, aes(means)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exponential distribution
- Exponential distribution - Probability of time
between Poisson events
- Examples
- Probability of > 1 day between adoptions
- Probability of < 10 minutes between restaurant arrivals
- Continuous distribution
- \(E(\lambda)\)
- \(\lambda\): rate, the average rate
of arrivals or occurrences of an event in a given time interval
dexp(x, rate = 1, log = FALSE)
pexp(q, rate = 1, lower.tail = TRUE, log.p = FALSE)
qexp(p, rate = 1, lower.tail = TRUE, log.p = FALSE)
rexp(n, rate = 1)
- Expected value: \(1/\lambda\)
- Poisson distribution vs Exponential distribution
- For the Poisson distribution, the expected value represents the
average frequency of events
- Measures frequency in terms of rate or number of events
- For the exponential distribution, the expected value represents the
average time between events
- Measures frequency in terms of time between events
- For example, on average, 3 cars pass through an intersection in a
minute
- Poisson distribution: \(\lambda =
3\) and expected value = 3
- Exponential distribution: \(\lambda =
3\) and expected value = 1/3
# Exponential distribution
lambdas <- c(0.5, 1, 1.5)
ggplot(data = data.frame(x = seq(0, 5, by = 0.1))) +
lapply(lambdas, function(l) geom_line(aes(x, dexp(x, rate = l), col = factor(l)), linewidth = 1)) +
labs(title = "Exponential Distribution", x = "Time Between Events", y = "") +
guides(color = guide_legend(title = "rate")) +
theme(text=element_text(size=12))

# On average, one customer service ticket is created every 2 minutes. λ = 0.5 customer service tickets created each minute
# P(wait < 1 min)
pexp(1, rate = 0.5)
## [1] 0.3934693
# P(wait > 4 min)
pexp(4, rate = 0.5, lower.tail = FALSE)
## [1] 0.1353353
# P(1 min < wait < 4 min)
pexp(4, rate = 0.5) - pexp(1, rate = 0.5)
## [1] 0.4711954
(Student’s) t-distribution
- (Student’s) t-Distribution
- Similar shape as the normal distribution
- Degrees of freedom (df)
- Affects the thickness of the tails
- Lower df - Thicker tails, higher standard deviation
- Higher df - Closer to normal distribution
# (Student's) t-distribution
df <- c(1, 5, 10)
ggplot(data = data.frame(x = seq(-3, 3, by = 0.1))) +
lapply(df, function(l) geom_line(aes(x, dt(x, df = l), col = factor(l)), linewidth = 1)) +
labs(title = "t-Distribution", x = "", y = "") +
guides(color = guide_legend(title = "df")) +
theme(text=element_text(size=12))

Log-normal distribution
- Log-normal distribution
- Variable whose logarithm is normally distributed
- Example
- Length of chess games
- Adult blood pressure
- Number of hospitalizations in the 2003 SARS outbreak
# Log-normal distribution
sd <- c(.5, 1, 1.5)
ggplot(data = data.frame(x = seq(0, 3, by = 0.05))) +
lapply(sd, function(l) geom_line(aes(x, dlnorm(x, sdlog = l), col = factor(l)), linewidth = 1)) +
labs(title = "Log Distribution", x = "", y = "") +
guides(color = guide_legend(title = "sd")) +
theme(text=element_text(size=12))

The central limit theorem
- Central limit theorem (CLT) - The sampling
distribution of a statistic becomes closer to the normal
distribution as the number of trials increases
- Sampling distributions - Distribution of a summary
statistic
- Samples should be random and independent
- Apply to sample mean, median, sd, proportion
- No matter the original distribution being sampled from
- Estimate characteristics of unknown underlying distribution
- More easily estimate characteristics of large populations
- The mean of the sampling distribution is the mean
of the population
- The standard deviation of the sampling distribution
is the standard deviation of the population divided by the square root
of the sample size
- \(\bar{X} \sim N(\mu,
\frac{\sigma}{\sqrt{n}})\)
- \(\bar{X}\) is the sampling
distribution of the sample means
- \(\sim\) means follows the
distribution
- \(N\) is the normal
distribution
- \(\mu\) is the mean of the
population
- \(\sigma\) is the standard
deviation of the population
- \(n\) is the sample size
- Conditions of the central limit theorem
- The sample size is sufficiently large. This condition is usually met
if the sample size is n >= 30
- The samples are independent and identically
distributed (i.i.d.) random variables. This condition is
usually met if the sampling is random
- The population’s distribution has finite variance.
Central limit theorem doesn’t apply to distributions with infinite
variance, such as the Cauchy distribution. Most distributions have
finite variance
# The central limit theorem
die <- c(1, 2, 3, 4, 5, 6)
# Rolling the dice 5 times
sample_of_5 <- sample(die, 5, replace = TRUE)
sample_of_5
## [1] 3 3 5 4 4
mean(sample_of_5)
## [1] 3.8
# Rolling the dice 5 times 10/100/1000 times
replicate(10, sample(die, 5, replace = TRUE) %>% mean())
## [1] 3.2 2.8 3.8 2.4 2.8 4.4 3.2 4.0 2.2 3.8
sample_means_10 <- replicate(10, sample(die, 5, replace = TRUE) %>% mean())
sample_means_100 <- replicate(100, sample(die, 5, replace = TRUE) %>% mean())
sample_means_1000 <- replicate(1000, sample(die, 5, replace = TRUE) %>% mean())
p1 <- ggplot() + aes(sample_means_10) + geom_histogram(bins = 7)
p2 <- ggplot() + aes(sample_means_100) + geom_histogram(bins = 7)
p3 <- ggplot() + aes(sample_means_1000) + geom_histogram(bins = 7)
grid.arrange(p1, p2, p3, ncol=3)

# Estimate expected value of die
mean(sample_means_1000)
## [1] 3.4738
# Standard deviation and the CLT
mean(replicate(1000, sample(die, 5, replace = TRUE) %>% sd()))
## [1] 1.642136
# Proportions and the CLT
sales_team <- c("Amir", "Brian", "Claire", "Damian")
mean(replicate(100000, mean(sample(sales_team, 10, replace = TRUE) == "Claire")))
## [1] 0.250635
# Create a histogram of num_users
g1 <- ggplot(amir_deals, aes(num_users)) +
geom_histogram(bins = 10)
set.seed(104)
# Sample 20 num_users from amir_deals and take mean and repeat 100 times
sample_means <- replicate(100, sample(amir_deals$num_users, size = 20, replace = TRUE) %>% mean())
# Create data frame for plotting
samples <- data.frame(mean = sample_means)
# Histogram of sample means
g2 <- ggplot(samples, aes(mean)) +
geom_histogram(bins = 10)
grid.arrange(g1, g2, nrow = 1)

Correlation
- Correlation - Relationships between two variables
- \(x\): explanatory/independent
variable
- \(y\): response/dependent
variable
- Only accounts for linear relationships
- Correlation shouldn’t be used blindly, always visualize your
data
- Correlation coefficient - Quantifies the linear
relationship between two variables
- [−1, 1]
- Magnitude corresponds to strength of
relationship
- Sign (+ or -) corresponds to direction of
relationship
- Pearson product-moment correlation (r)
- \(r = \sum^{n}_{i=1} \frac{(x_i -
\bar{x})(y_i - \bar{y})}{\sigma_x \times \sigma_y}\)
- \(\bar{x}\): Mean of \(x\)
- \(\sigma_x\): Standard deviation of
\(x\)
- Most common
- Kendall’s tau
- Spearman’s rho
- Transformation to linear - When variables have skewed distributions
- Log transformation:
log(x)
- Square root transformation:
sqrt(x)
- Reciprocal transformation:
1/x
- Combinations of these - e.g.
sqrt(x) and
1/y
- Correlation does not imply causation. x is correlated with y does
not mean x causes y
- Spurious
correlation/causation/association/confounding/confounder/lurking
variable
# Correlation
ggplot(world_happiness, aes(life_exp, happiness_score)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

cor(world_happiness$life_exp, world_happiness$happiness_score, use = "pairwise.complete.obs")
## [1] 0.7802249
g1 <- ggplot(msleep, aes(bodywt, awake)) +
geom_point()
cor(msleep$bodywt, msleep$awake)
## [1] 0.3119801
# Log transformation
g2 <- msleep %>%
mutate(log_bodywt = log(bodywt)) %>%
ggplot(aes(log_bodywt, awake)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
cor(log(msleep$bodywt), msleep$awake)
## [1] 0.5687943
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
