Probability Distributions

https://www.jarad.me/courses/stat587Eng/labs/lab03/lab03.html

https://seankross.com/notes/dpqr/#:~:text=Although%20x%20represents%20the%20independent,the%20normal%20distribution%20with%20dnorm%20.

d: density function p: cumulative density function q: quantile function r: random sampling

library(tidyverse)
library(openintro)
library(palmerpenguins)
library(janitor)
library(interpretCI)
library(BSDA)
library(pwr)

# Consistent Randomization
set.seed(1)

# Remove scientific notation
options(scipen = 999)

# Geom
#dgeom(x, prob, log = FALSE)
#pgeom(q, prob, lower.tail = TRUE, log.p = FALSE)
#qgeom(p, prob, lower.tail = TRUE, log.p = FALSE)
#rgeom(n, prob)

# Expo
#dexp(x, rate = 1, log = FALSE)
#pexp(q, rate = 1, lower.tail = TRUE, log.p = FALSE)
#qexp(p, rate = 1, lower.tail = TRUE, log.p = FALSE)
#rexp(n, rate = 1)

#  Binom
#dbinom(x, size, prob, log = FALSE)
#pbinom(q, size, prob, lower.tail = TRUE, log.p = FALSE)
#qbinom(p, size, prob, lower.tail = TRUE, log.p = FALSE)
#rbinom(n, size, prob)

# Poisson
#dpois(x, lambda, log = FALSE)
#ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
#qpois(p, lambda, lower.tail = TRUE, log.p = FALSE)
#rpois(n, lambda)

# Uniform
#dunif(x, min = 0, max = 1, log = FALSE)
#punif(q, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
#qunif(p, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
#runif(n, min = 0, max = 1)

# Normal
#dnorm(x, mean = 0, sd = 1, log = FALSE)
#pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
#qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
#rnorm(n, mean = 0, sd = 1)

# t
#dt(x, df, ncp, log = FALSE)
#pt(q, df, ncp, lower.tail = TRUE, log.p = FALSE)
#qt(p, df, ncp, lower.tail = TRUE, log.p = FALSE)
#rt(n, df, ncp)

Binomial

Example:

100 tosses of a fair coin.
Head = success / 1
Tails = fail / 0

# Simulate data
coin_toss_1 <- rbinom(100, 1, 0.5)

janitor::tabyl(coin_toss_1) %>% 
  as.data.frame() %>% rename(`heads 1/0` = 1,
                             count = 2)

##   heads 1/0 count percent
## 1         0    52    0.52
## 2         1    48    0.48

# Compute P(45 < X < 55) for X Binomial(100, 0.5)
sum(dbinom(46:54, 100, 0.5))

## [1] 0.6317984

# Exactly 3 heads from 10 coin tosses
dbinom(3, 10, 0.2)

## [1] 0.2013266

# Compute P(1 < X < 50) for X Binomial(100, 0.5)
sum(dbinom(x = 1:50, size = 100, p = 0.5))

## [1] 0.5397946

# Probability of tossing 50 heads or less out of 100
pbinom(50, size = 100, prob = 0.5)

## [1] 0.5397946

# Prob of tossing more than 50 heads out of 100
pbinom(50, size = 100, prob = 0.5, lower.tail = FALSE)

## [1] 0.4602054

success <- 0:100

plot(success, dbinom(success, 
                     size = 100, 
                     prob = 0.5),
     type='h')
title("PMF for a binomial distribution with size = 100 and prob = 0.5")

plot(0:100, pbinom(0:100, size = 100, prob = 0.5), 
     type = "l", 
     main = "CDF for binomial distribution with size = 100 and prob = 0.5")

Geometric (the number of failures in a binomial distribution before first success)

The Geometric Distribution models the number of trials needed to achieve the first success in a sequence of independent Bernoulli trials
The outcome from these is modelled by a binomial distribution, each with the same probability of success.
The PMF gives the probability of achieving the first success on the kth trial.
100 tosses of a fair coin.
Head = success / 1
Tails = fail / 0

# Note that X in each function represents the number of failures before a success

# Example if success / heads occurs on the 3rd toss then x should be 2
dgeom(x = 2, p = 0.5)

## [1] 0.125

# Compute P(3 < X < 10) for X Geometric(0.5) tossing a head for the first time between the 3rd and 10th tosses
sum(dgeom(3:10, 0.5))

## [1] 0.1245117

# Compute P(1 < X < 50) for X Geometric(0.5) tossing a head for the first time between the 1st and 50th tosses
sum(dgeom(x = 1:50, p = 0.5))

## [1] 0.5

# Probability of tossing a head by the 4th toss
dgeom(3, prob = 0.2)

## [1] 0.1024

# Prob of it taking more than 3 tosses to toss a head
pgeom(2, prob = 0.5, lower.tail = FALSE)

## [1] 0.125

success <- 0:20

plot(0:10, dgeom(0:10, 
                     prob = 0.5),
     type='h')
title("PMF for a geometric distribution with prob = 0.5")

plot(0:20, pgeom(0:20, prob = 0.5), 
     type = "l", 
     main = "CDF for a geometric distribution with prob = 0.5")

Poisson

The Poisson Distribution models the number of events that occur in a fixed interval of time
It is often used when the events are rare and independent, and the average rate of occurrence is known.
The PMF gives the probability of observing exactly k events in the interval.

Example:

Count of goals scored by Liverpool per game over 100 games
I have made up a rate of 1.5 goals per game which will be the value of the lambda param

liverpool_goals <- rpois(100, 1.5)

hist(liverpool_goals, 
     breaks = 5,
     main = "Goals scored by Liverpool per game n = 100, Lambda = 1.5",
     xlab = "Goals per game")

# Compute P(3 < X < 10) for X Poisson(1.5) probability of scoring between 3 and 10 goals in a game
sum(dpois(3:10, lambda = 1.5))

## [1] 0.1911526

# Compute P(1 < X < 3) for X Poisson(1.5) probability of scoring between 1 and 3 goals in a game
sum(dpois(x = 1:3, lambda = 1.5))

## [1] 0.7112274

# Probability of scoring exactly 3 goals in a game
dpois(3, lambda = 1.5)

## [1] 0.1255107

# Prob of it taking more than 3 games to score
ppois(3, lambda = 1.5, lower.tail = FALSE)

## [1] 0.06564245

# PMF plot
plot(0:7, dpois(0:7, 
                     lambda = 1.5),
     type='h')
title("PMF for a Poisson distribution with lambda = 1.5")

# CDF Plot
plot(0:7, ppois(0:7, lambda = 1.5), 
     type = "l", 
     main = "CDF for a Poisson distribution with lambda = 1.5")

Exponential (time between events in Poisson process)

The Exponential Distribution models the time between events in a Poisson process
Where events occur continuously and independently at a constant rate.
It is often used to model the time until an event happens.
The PDF describes the probability density of the time to the event.

# Compute P(4 < X < 5) for X Exponential X ~ M(1.5) probability of the first goal happening between 4 and 5 games
1 - pexp(4, rate = 1.5) + 1 - pexp(5, rate = 1.5)

## [1] 0.003031837

# Probability of the first goal happening by the 3rd game
1 - pexp(2, rate = 1.5)

## [1] 0.04978707

# Probability of the first goal happening in the 1st game
1 - pexp(1, rate = 1.5)

## [1] 0.2231302

# Prob of it taking more than 3 games to score
pexp(3, rate = 1.5, lower.tail = FALSE)

## [1] 0.011109

# Prob of it taking more than 3 games to score
pexp(3, rate = 1.5)

## [1] 0.988891

# PMF plot
plot(0:7, dexp(0:7, 
                     rate = 1.5),
     type='h')
title("PMF for a Exponential distribution with rate = 1.5")

# CDF Plot
plot(0:7, pexp(0:7, rate = 1.5), 
     type = "l", 
     main = "CDF for a Exponential distribution with rate = 1.5")

Normal

The Normal Distribution, also known as the Gaussian Distribution, is one of the most widely used distributions.
The PDF describes the probability density of observing a particular value, and it is defined by the mean and standard deviation.

# Simulated normal distribution
normal_dist <- rnorm(100, mean = 5, sd = 2)

# Simulated standard normal distribution
standard_normal_dist <- rnorm(100, mean = 0, sd = 1)

hist(normal_dist, main = "Normal Distribution, mu = 5, sd = 2")

hist(standard_normal_dist, main = "Normal Distribution, mu = 0, sd = 1")

PDF and CDF of a normal distribution

# PDF of normal dist with mean 5 and sd 2
curve(dnorm(x, mean = 5, sd = 2),
      from = -1, to = 11,
      main = "PDF for Normal Distrubution with mu = 5 and sd = 2")

curve(pnorm(x, mean = 5, sd = 2), 
      from = -1, to = 11,
      main = "CDF for Normal Distrubution with mu = 5 and sd = 2",
      ylab = "F(x)")

##### Probabilities of X

dnorm is the density function for a normal distribution of X with mean and sd

E.g for a

mean <- 5
st_dev <- 2


prob_list <- list()

x <- 1:9

for(i in seq_along(x)) {
  
  pnorm <- pnorm(i, mean = mean, sd = st_dev)
  dnorm <- dnorm(i, mean = mean, sd = st_dev)
  
  prob_list[[i]] <-
  
  data.frame(x = x[i],
   p_norm = round(pnorm, 4),
   d_norm = round(dnorm, 3)
   
  )

}

x_normal_probs <- bind_rows(prob_list) 

x_normal_probs

##   x p_norm d_norm
## 1 1 0.0228  0.027
## 2 2 0.0668  0.065
## 3 3 0.1587  0.121
## 4 4 0.3085  0.176
## 5 5 0.5000  0.199
## 6 6 0.6915  0.176
## 7 7 0.8413  0.121
## 8 8 0.9332  0.065
## 9 9 0.9772  0.027

Probability calculations for X on a normal distribution

Again pnorm gives the CDF where lower.tail argument can be used for “more than” questions
For <= 2 or lower than 3, specify X = 3 as the first argument
dnorm gives the density function

mean <- 5
st_dev <- 2

# P(X < 3)
# The area to the left of 3
pnorm(3, mean = mean, sd = st_dev)

## [1] 0.1586553

# P(X > 3)
# The area to the right of 3
pnorm(3, mean = mean, sd = st_dev, lower.tail = FALSE)

## [1] 0.8413447

# P(3 < X < 5)
# The area between 3 and 5
dnorm_range <- function(lower_range, 
                         upper_range,
                         mean,
                         st_dev) {

1 - pnorm(lower_range, mean, st_dev) - pnorm(upper_range, mean, st_dev, lower.tail = FALSE)

}

dnorm_range(3, 5, mean, st_dev)

## [1] 0.3413447