Extra Credit

Simulate choosing a card from a shuffled deck 10 times with replacement to count the occurrences where you get a diamond.

# General set-up for Simulation
set.seed(123)
x <- 1 # of realizations. I will put 1 for now lol.
realization.sim <- rbinom(n = x, size = 10, prob = 13/52) # Prob. of getting a diamond in a standard deck of cards is P(Diamond) = 13/52 beacuse there area total of 13 diamond cards in a standard deck of 52 cards.

Simulate 5, 20, 100, and 200 realizations, then plot a histogram for each (I would recommend using ggplot and playing around with the binwidth for geom_histogram for more flexibility in how your graph appears). In addition, get the mean and variance for each vector.

library(ggplot2)

# Simulation of 05 realizations.
set.seed(123)
x <- 5
real5 <- rbinom(n = x, size = 10, prob = 13/52)
real5_df <- data.frame(real5)


# Dont mind me here I am playing around with the bindwidth for histograms.

# hist_real5_df <- ggplot(real5_df, aes(x = real5)) + geom_histogram(binwidth = 0.5,
#      fill = "#008080", color = "black") + labs(title =  "Histogram of 5 realizations", x = "Occurence of getting a Diamond",
#     y = "Frequency")
#     
# print(hist_real5_df)
# 
# hist_real5_df <- ggplot(real5_df, aes(x = real5)) + geom_histogram(binwidth = 2,
#      fill = "#008080", color = "black") + labs(title =  "Histogram of 5 realizations", x = "Occurence of getting a Diamond",
#     y = "Frequency")
#     
# print(hist_real5_df)
# 
# hist_real5_df <- ggplot(real5_df, aes(x = real5)) + geom_histogram(binwidth = 2,
#      fill = "#008080", color = "black") + labs(title =  "Histogram of 5 realizations", x = "Occurence of getting a Diamond",
#     y = "Frequency")
#     
# print(hist_real5_df)

# Ideally, I would choose a binwidth of 1 because < 1 would appear as gaps and > 1 would appear as dense bins. It also makes sense to me because the occurences only happen from 0 to 10 and is discrete. But I am thinking does it depend on the # of realizations to choose an appropriate binwidth? Because like less realizations would give gaps cause there is a higher probability of not getting a diamond with?

hist_real5_df <- ggplot(real5_df, aes(x = real5)) + geom_histogram(binwidth = 1,
     fill = "#008080", color = "black") + labs(title =  "Histogram of 5 Realizations", x = "Occurence of getting a Diamond",
    y = "Frequency")

print(hist_real5_df)

mean(real5)

## [1] 3.4

var(real5)

## [1] 1.8

# Simulation of 20 realizations.
set.seed(123)
x <- 20
real20 <- rbinom(n = x, size = 10, prob = 13/52)
real20_df <- data.frame(real20)

hist_real20_df <- ggplot(real20_df, aes(x = real20)) + geom_histogram(binwidth = 1,
    fill = "#008080", color = "black") + labs(title = "Histogram of 20 Realizations", x = "Occurence of getting a Diamond",
    y = "Frequency")

print(hist_real20_df)

mean(real20)

## [1] 2.8

var(real20)

## [1] 2.273684

# Simulation of 100 realizations.
set.seed(123)
x <- 100
real100 <- rbinom(n = x, size = 10, prob = 13/52)
real100_df <- data.frame(real100)

hist_real100_df <- ggplot(real100_df, aes(x = real100)) + geom_histogram(binwidth = 1,
    fill = "#008080", color = "black") + labs(title = "Histogram of 100 realizations", x = "Occurence of getting a Diamond",
    y = "Frequency")

print(hist_real100_df)

mean(real100)

## [1] 2.48

var(real100)

## [1] 1.807677

# Simulation of 200 realizations.
set.seed(123)
x <- 200
real200 <- rbinom(n = x, size = 10, prob = 13/52)
real200_df <- data.frame(real200)

hist_real200_df <- ggplot(real200_df, aes(x = real200)) + geom_histogram(binwidth = 1,
    fill = "#008080", color = "black") + labs(title = "Histogram of 200 realizations", x = "Occurence of getting a Diamond",
    y = "Frequency")

print(hist_real200_df)

mean(real200)

## [1] 2.535

var(real200)

## [1] 1.64701

Analyze the results and compare to the theoretical expected value, variance, and pdf.

# Theoretical values
n <- 10 # Number of trials
p <- 13/52 # Probability of getting a diamond
theo_mean <- n * p 
theo_mean

## [1] 2.5

n <- 10 # Number of trials
p <- 13/52 # Probability of getting a diamond
q <- 1 - p # Complement of p (prob. of not getting a diamond)
theo_var <- n * p * q
theo_var

## [1] 1.875

# Probability Density Function
X <- 0:10
X.prob.PDF <- dbinom(x=X, size = 10, prob = 13/52) 

# create a data.frame to input into ggplot
PDF.df <- data.frame(X.prob.PDF)

hist.PDF.df <- ggplot(PDF.df, aes(x = factor(X), y = X.prob.PDF)) + geom_bar(stat = "identity",
    fill = "#008080", color = "black") + labs(title = "PDF", x = "Occurence of getting a Diamond",
    y = "Frequency")

print(hist.PDF.df)

As we increase the realizations, we see the mean decrease closer and closer to the theoretical value starting at 3.4 with 5 realizations to 2.535 with 200 realizations (the theoretical mean is 2.5). However when observe the dynamics of the variance, we see that with the increasing of realizations, variance does not necessarily decrease closer to the theoretical variance. At 5 realizations, the variance is 1.8. Then at 20 realizations the variance jumps to 2.273684. At 100 realizations, the variance jumps back down to 1.807677. Lastly at 200 realizations, the variance jumps down to 1.64701. The theoretical variance is 1.875 and the closest to this variance was actually at 100 realizations. But it is not surprising since we are simulating values, which leads to random spread in data and therefore variance is random as well especially for smaller samples. The growing realizations also approach the PDF. Comparing the realizations, we can see that the 200 realizations simulation is the closest distribution to PDF.

Let’s say for a new driver, the average time it takes to drive from UOG to GPO is 20 minutes with a standard deviation of 3 minutes and X, the time to drive between UOG and GPO, is normally distributed.

What’s the probability that a randomly selected new driver will arrive to GPO from UOG within 14 minutes (P< 14)?

pnorm.drive.less.than.14 <-pnorm(14, mean = 20, 3)  # we do not have to subtract from 1 since pnorm provides left-tailed probabilities.
pnorm.drive.less.than.14

## [1] 0.02275013

pnorm.drive.less.than.14.prc <- pnorm.drive.less.than.14 *100
round(pnorm.drive.less.than.14.prc, 1)

## [1] 2.3

Ans: The probability that a randomly selected new driver will arrive to GPO from UOG within 14 minutes is about 2.3%.

What is the probability that they arrive 15 minutes or later P(>=15)?

pnorm.drive.more.than.15 <- 1 - pnorm(15, mean = 20, 3)

pnorm.drive.more.than.15

## [1] 0.9522096

pnorm.drive.more.than.15.prc <- pnorm.drive.more.than.15 *100
round(pnorm.drive.more.than.15.prc, 1)

## [1] 95.2

Ans: The probability that a randomly selected new driver will arrive to GPO from UOG at or after 15 minutes is about 95.2%.

Simulate 5, 20, 100, and 200 observations corresponding to the time it takes to drive from GPO to UOG, then plot a histogram for each. In addition, get the mean and variance for each vector. Analyse the results and compare to the theoretical expected value, variance, and pdf.

set.seed(1234)
sim.drive.5 <- rnorm(n = 5, mean = 20, sd = 3)
hist(sim.drive.5) # Default histogram

sim.drive.5.df <- data.frame(sim.drive.5)

# Customized
hist.sim.drive.5.df <- ggplot(sim.drive.5.df, aes(x = sim.drive.5)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 05 Observations", x = "Time of Drive(in Minutes)",
    y = "Frequency")

print(hist.sim.drive.5.df)

mean(sim.drive.5)

## [1] 18.94294

var(sim.drive.5)

## [1] 17.49525

set.seed(1234)
sim.drive.20 <- rnorm(n = 20, mean = 20, sd = 3)
hist(sim.drive.20) # Default histogram

sim.drive.20.df <- data.frame(sim.drive.20)

# Customized
hist.sim.drive.20.df <- ggplot(sim.drive.20.df, aes(x = sim.drive.20)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 20 Observations", x = "Time of Drive(in Minutes)",
    y = "Frequency")

print(hist.sim.drive.20.df)

mean(sim.drive.20)

## [1] 19.24801

var(sim.drive.20)

## [1] 9.250253

set.seed(1234)
sim.drive.100 <- rnorm(n = 100, mean = 20, sd = 3)
hist(sim.drive.100) # Default histogram

sim.drive.100.df <- data.frame(sim.drive.100)

# Customized
hist.sim.drive.100.df <- ggplot(sim.drive.100.df, aes(x = sim.drive.100)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 100 Observations", x = "Time of Drive(in Minutes)",
    y = "Frequency")

print(hist.sim.drive.100.df)

mean(sim.drive.100)

## [1] 19.52971

var(sim.drive.100)

## [1] 9.07947

set.seed(1234)
sim.drive.200 <- rnorm(n = 200, mean = 20, sd = 3)
hist(sim.drive.200) # Default histogram

sim.drive.200.df <- data.frame(sim.drive.200)

# Customized
hist.sim.drive.200.df <- ggplot(sim.drive.200.df, aes(x = sim.drive.200)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 200 Observations", x = "Time of Drive(in Minutes)",
    y = "Frequency")

print(hist.sim.drive.200.df)

mean(sim.drive.200)

## [1] 19.82672

var(sim.drive.200)

## [1] 9.375835

# Theoretical values
theo_mean <- 20 # Given in problem.
theo_mean

## [1] 20

theo_stdev <- 3 # Given in problem but need to square to get variance because it is standard deviation.
theo_var <- (theo_stdev)^2
theo_var

## [1] 9

# Probability Density Function
X <- 0:40
X.prob.PDF <- dnorm(x=X,20, 3) 

# create a data.frame to input into ggplot
PDF.df <- data.frame(X.prob.PDF)

hist.PDF.df <- ggplot(PDF.df, aes(x = factor(X), y = X.prob.PDF)) + geom_bar(stat = "identity",
    fill = "#008080", color = "black") + labs(title = "PDF", x = "Time of Drive (in Minutes)",
    y = "Frequency")

print(hist.PDF.df)

The growing observations approach the theoretical value, beginning with 5 observationsand an expected value of 18.94294 and ending at an expected value of 19.82672 at 200 observations (the theoretical expected value is 20). Variance varies as the observations grow due to simulation of observations. The furthest away from the theoretical variance is the simulation with only 5 observations at 17.49525. If we compare the distributions of histograms, we can see that the as observations gets larger, it starts to shape into a normal distribution like the PDF. Both at 100 and 200 observations are the histograms that look, for the most, part normal.

The Poisson distribution is a probability distribution that models the number of events within a fixed time interval. The values taken by a Poisson random variable are discrete and mainly counts. The probability mass function is given as:

\[ f(x) = \frac{\lambda^{x}_{p} e^{-\lambda_{p}}}{x!} \]

where x ∈ 0, 1, … and λp represents the mean number of occurrences. The notation X ∼ POIS(λp) means that X follows a Poisson distribution with parameter λp. (dpois, ppois, qpois, and rpois are the R functions for this distribution).

Joe, our Karabao man at Chamorro Village recieves a number of 4 people per hour, where X, the number of people, follows a Poisson distribution - X ∼ POIS(4). What is the probability that Joe will receive 10 customers in the next 2 hours?

lambda.per.h <- 4
lambda.per.2h <- lambda.per.h * 2
joe.prob.10 <-dpois(x = 10, lambda = lambda.per.2h) # We use dpois() function because we are only trying to find pdf.
joe.prob.10

## [1] 0.09926153

joe.prob.10.pct <- joe.prob.10*100
joe.prob.10.pct

## [1] 9.926153

Ans: The probability that Joe will receive 10 customers in the next 2 hours is approximately 9.93%.

Plot the PMF and CDF (cumulative distribution function) of X ∼ POIS(4) from 0 ≤ x ≤ 8.

# Plot PMF X ∼ POIS(4) from 0 ≤ x ≤ 8.
X <- 0:8

X.prob.PMF <- dpois(x = X, lambda = 4)
print(round(X.prob.PMF, 3))

## [1] 0.018 0.073 0.147 0.195 0.195 0.156 0.104 0.060 0.030

PMF.df <- data.frame(X = X, prob = X.prob.PMF)

# I wasn't sure which plot to use (i.e. scatter), bu I used a line plot because I thought it was the best to see the distribution clearly.
PMF.pois4line <- ggplot(data = PMF.df, aes(x = X, y = prob)) + geom_line(stat = "identity") +
    labs(x = "x", y = "Pr(X=X)") + theme_minimal()

print(PMF.pois4line)

# Plot CDF X ∼ POIS(4) from 0 ≤ x ≤ 8.
X <- 0:8
X.prob.CDF <- ppois(q = X, lambda = 4)
print(round(X.prob.CDF, 3))

## [1] 0.018 0.092 0.238 0.433 0.629 0.785 0.889 0.949 0.979

CDF.df <- data.frame(X = X, prob = X.prob.CDF)

# I wasn't sure which plot to use (i.e. scatter), bu I used a line plot because I thought it was the best to see the distribution clearly.
CDF.pois4line <- ggplot(data = CDF.df, aes(x = X, y = prob)) + geom_line(stat = "identity") +
    labs(x = "x", y = "Pr(X=X)") + theme_minimal()

print(CDF.pois4line)

Simulate the number of people Joe receives for 5, 20, and 100 hours, then plot a histogram for each and get the mean and variance for each vector. Analyze the results and compare to the theoretical expected value, variance, and pdf.

set.seed(1234)
sim.joe.pois.5 <- rpois(n = 5, lambda = 4)
sim.joe.pois.5

## [1] 2 4 4 4 6

hist(sim.joe.pois.5) # Default histogram

sim.joe.pois.5.df <- data.frame(sim.joe.pois.5)

# Customized
hist.sim.joe.pois.5.df <- ggplot(sim.joe.pois.5.df, aes(x = sim.joe.pois.5)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 05 hours", x = "Customers per Hour",
    y = "Frequency")

print(hist.sim.joe.pois.5.df)

mean(sim.joe.pois.5)

## [1] 4

var(sim.joe.pois.5)

## [1] 2

set.seed(1234)
sim.joe.pois.20 <- rpois(n = 20, lambda = 4)
sim.joe.pois.20

##  [1] 2 4 4 4 6 5 0 2 5 4 5 4 3 7 3 6 3 3 2 2

hist(sim.joe.pois.20) # Default histogram

sim.joe.pois.20.df <- data.frame(sim.joe.pois.20)

# Customized
hist.sim.joe.pois.20.df <- ggplot(sim.joe.pois.20.df, aes(x = sim.joe.pois.20)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 20 Hours", x = "Customers per Hour",
    y = "Frequency")

print(hist.sim.joe.pois.20.df)

mean(sim.joe.pois.20)

## [1] 3.7

var(sim.joe.pois.20)

## [1] 2.852632

set.seed(1234)
sim.joe.pois.100 <- rpois(n = 100, lambda = 4)
sim.joe.pois.100

##   [1]  2  4  4  4  6  5  0  2  5  4  5  4  3  7  3  6  3  3  2  2  3  3  2  1  2
##  [26]  6  4  7  6  1  4  3  3  4  2  5  2  3 10  6  4  5  3  4  3  4  5  4  3  5
##  [51]  1  3  5  4  2  4  4  5  2  6  6  1  3  0  3  5  3  4  1  4  2  7  0  5  1
##  [76]  4  3  1  3  5  7  4  2  4  2  7  3  3  2  7  2  7  2  2  2  4  3  1  3  5

hist(sim.joe.pois.100) # Default histogram

sim.joe.pois.100.df <- data.frame(sim.joe.pois.100)

# Customized
hist.sim.joe.pois.100.df <- ggplot(sim.joe.pois.100.df, aes(x = sim.joe.pois.100)) + geom_histogram(binwidth = 1,
    fill = "#66CDAA", color = "black") + labs(title = "Histogram of 100 Hours", x = "Customers per Hour",
    y = "Frequency")

print(hist.sim.joe.pois.100.df)

mean(sim.joe.pois.100)

## [1] 3.6

var(sim.joe.pois.100)

## [1] 3.414141

# Theoretical values
theo_mean <- 4 # Given in problem. Same as lambda.
theo_mean

## [1] 4

theo_var <- 4 # Lambda is also variance for poisson dist. 
theo_var

## [1] 4

As the hours grow, we see the distribution approaches the PMF distribution with 100 hours being the closest depiction of the PMF distribution. However for the expected value and variance it actually strays away from the theoretical mean and variance. I think this is because of outlying data as lambda increases.

MA500 Homework 3: Part 2 (Extra Credit)

Michael Cajigal

2024-09-11

Extra Credit