Problem 1.

Probability Density 1: X~Gamma. Using R, generate a random variable X that has 10,000 random Gamma pdf values. A Gamma pdf is completely described by n (a size parameter) and lambda (a shape parameter). Choose any n greater than 3 and an expected value (lambda) between 2 and 10 (you choose).

# Set seed for reproducibility
set.seed(123)

# Parameter choices: n (the Gamma shape) > 3 and lambda (used here as the
# Gamma scale) between 2 and 10
n <- 4
lambda <- 2

# Generate 10,000 random draws from the Gamma distribution; the first argument
# (n = 10000) is the sample size, while shape = n uses the variable n = 4 above
X <- rgamma(n = 10000, shape = n, scale = lambda)

# Plot the histogram
hist(X, breaks = 30, col = "lightblue")
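
As a quick visual check (a minimal sketch, assuming the same shape/scale parameterization used above), the theoretical Gamma density can be overlaid on a density-scaled histogram:

# Histogram on the density scale with the theoretical Gamma density overlaid
hist(X, breaks = 30, col = "lightblue", freq = FALSE,
     main = "X with theoretical Gamma density", xlab = "X")
curve(dgamma(x, shape = 4, scale = 2), add = TRUE, lwd = 2)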

Probability Density 2: Y~Sum of Exponentials. Then generate 10,000 observations from the sum of n exponential pdfs with rate/shape parameter lambda. The n and lambda must be the same as in the previous case.

# Number of observations to generate
num_observations <- 10000

# Generate observations from the sum of n exponential PDFs.
# Note: rexp() is called with rate = lambda, so each exponential has mean
# 1/lambda = 0.5 and E[Y] = n/lambda = 2. To make Y match X ~ Gamma(shape = n,
# scale = lambda), the exponentials would instead need rate = 1/lambda.
Y <- numeric(num_observations)

for (i in 1:num_observations) {
  exponentials <- rexp(n, rate = lambda)
  Y[i] <- sum(exponentials)
}

# Print the first few observations
print(head(Y))
## [1] 3.4000115 1.9119165 3.2358809 0.5935296 3.6650713 1.5248701
# Plot the histogram
hist(Y, breaks = 30, col = "lightblue")
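
For reference, a sum of n independent Exp(rate = 1/lambda) variables has exactly the Gamma(shape = n, scale = lambda) distribution of X. A vectorized alternative (a sketch only, not the code used for the results above; Y_alt is a hypothetical name, and it uses rate = 1/lambda rather than the rate = lambda used in the loop) would be:

# Sum each row of an num_observations-by-n matrix of exponential draws
Y_alt <- rowSums(matrix(rexp(num_observations * n, rate = 1/lambda),
                        ncol = n))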

Probability Density 3: Z~Exponential. Then generate 10,000 observations from a single exponential pdf with rate/shape parameter lambda.

# Generate 10,000 random observations from a single exponential PDF
# (rate = 0.5 = 1/lambda, so E[Z] = lambda = 2)
Z <- rexp(n = 10000, rate = 0.5)

# Plot the histogram
hist(Z, breaks = 30, col = "lightblue")

1a. Calculate the empirical expected values (means) and variances of all three pdfs.

# X~Gamma empirical expected value (mean) and variance
x_mean <- mean(X)
x_var <- var(X)

# Y~Sum of Exponentials empirical expected value (mean) and variance
y_mean <- mean(Y)
y_var <- var(Y)

# Z~Exponential empirical expected value (mean) and variance
z_mean <- mean(Z)
z_var <- var(Z)
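
For comparison, the theoretical moments under the parameterizations used above are E[X] = n*lambda = 8 and Var(X) = n*lambda^2 = 16 for the Gamma, and E[Z] = 1/0.5 = 2 and Var(Z) = 1/0.5^2 = 4 for the exponential (for Y as simulated with rate = lambda, E[Y] = n/lambda = 2 and Var(Y) = n/lambda^2 = 1). A quick side-by-side check:

# Empirical vs. theoretical moments for X and Z
c(x_mean = x_mean, x_mean_theory = n * lambda, x_var = x_var, x_var_theory = n * lambda^2)
c(z_mean = z_mean, z_mean_theory = 1/0.5, z_var = z_var, z_var_theory = 1/0.5^2)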

1b. Using calculus, calculate the expected value and variance of the Gamma pdf (X).

# Define the integrand for the expected value, E[X] = integral of x * f(x) dx
# (shape = 4 and rate = 0.5 correspond to shape = n and rate = 1/lambda above)
integrand <- function(x) x * dgamma(x, shape = 4, rate = 0.5)

# Perform the integration using 'integrate'
result <- integrate(integrand, lower = 0, upper = Inf)

# Extract the expected value from the integration result
expected_value <- result$value

# Print the expected value
expected_value
## [1] 8
# Define the integrand function for variance
integrand_variance <- function(x) (x - expected_value)^2 * dgamma(x, shape = 4, rate = 0.5)

# Perform the integration for variance using 'integrate'
result_variance <- integrate(integrand_variance, lower = 0, upper = Inf)

# Extract the variance from the integration result
variance <- result_variance$value

# Print the variance
variance
## [1] 16
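
These numerical results match the closed-form Gamma moments: for shape k and rate theta, E[X] = k/theta and Var(X) = k/theta^2, so with k = 4 and theta = 0.5 we get E[X] = 4/0.5 = 8 and Var(X) = 4/0.25 = 16, in agreement with the empirical values from 1a.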

Using the moment generating function for exponentials, calculate the expected value of the single exponential (Z) and the sum of exponentials (Y).

# MGF of an exponential with rate theta: M(t) = theta / (theta - t) for t < theta.
# The expected value is the first derivative of the MGF evaluated at t = 0.

# Single exponential Z (rate theta = 1/lambda = 0.5)
mgf_Z <- expression(theta / (theta - t))

# Differentiate the MGF with respect to t
dM_Z <- deriv(mgf_Z, "t")

# Evaluate at t = 0: the "gradient" attribute is E[Z] = 1/theta = lambda = 2
eval(dM_Z, list(t = 0, theta = 1/lambda))

# Y is a sum of n independent exponentials, so its MGF is the product of the
# individual MGFs: M_Y(t) = (theta_Y / (theta_Y - t))^n, and E[Y] = M_Y'(0) = n / theta_Y.
mgf_Y <- expression((theta_Y / (theta_Y - t))^n)

dM_Y <- deriv(mgf_Y, "t")

# With theta_Y = lambda (the rate actually used to simulate Y above), E[Y] = n/lambda = 2;
# with theta_Y = 1/lambda it would be n*lambda = 8, the mean of the Gamma X.
eval(dM_Y, list(t = 0, theta_Y = lambda, n = n))
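
Differentiating by hand gives the same answers: M'(t) = theta/(theta - t)^2, so M'(0) = 1/theta, i.e. E[Z] = lambda = 2; for the sum, M_Y'(0) = n/theta_Y, which is 2 with the rate used to simulate Y (and would be n*lambda = 8 if the exponentials had rate 1/lambda, matching the Gamma X).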

1c-e. Probability. For pdf Z (the exponential), calculate empirically probabilities a through c: (a) P(Z > lambda | Z > lambda/2), (b) P(Z > 2*lambda | Z > lambda), and (c) P(Z > 3*lambda | Z > lambda). Then evaluate through calculus whether the memoryless property holds.

# (a) Empirical P(Z > lambda | Z > lambda/2)
lambda_half <- lambda/2
count_condition_A <- sum(Z > lambda_half)
count_intersection_A <- sum(Z > lambda & Z > lambda_half)
(empirical_prob_A <- count_intersection_A / count_condition_A)
## [1] 0.5940856

# (b) Empirical P(Z > 2*lambda | Z > lambda)
count_condition_B <- sum(Z > lambda)
count_intersection_B <- sum(Z > 2 * lambda & Z > lambda)

(empirical_prob_B <- count_intersection_B / count_condition_B)
## [1] 0.3592881

# (c) Empirical P(Z > 3*lambda | Z > lambda)
count_condition_C <- sum(Z > lambda)
count_intersection_C <- sum(Z > 3 * lambda & Z > lambda)

(empirical_prob_C <- count_intersection_C / count_condition_C)
## [1] 0.1298665
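
By the memoryless property, P(Z > s + t | Z > s) = P(Z > t), so the theoretical values (assuming Z ~ Exponential with rate 0.5, as generated above) are P(Z > lambda/2) = exp(-0.5), about 0.607, for (a); P(Z > lambda) = exp(-1), about 0.368, for (b); and P(Z > 2*lambda) = exp(-2), about 0.135, for (c). Each is close to the corresponding empirical estimate, so the memoryless property holds. A minimal check:

# Theoretical conditional probabilities implied by memorylessness
c(A = pexp(lambda/2, rate = 0.5, lower.tail = FALSE),
  B = pexp(lambda,   rate = 0.5, lower.tail = FALSE),
  C = pexp(2*lambda, rate = 0.5, lower.tail = FALSE))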

Loosely investigate whether P(Y, Z) = P(Y) P(Z), i.e. whether Y and Z are independent, by building a table with quartiles and evaluating the marginal and joint probabilities.

# Calculate quartiles for Y and Z
quartiles_Y <- quantile(Y, probs = c(0.25, 0.5, 0.75, 1))
quartiles_Z <- quantile(Z, probs = c(0.25, 0.5, 0.75, 1))

# Calculate marginal and joint probabilities for each quartile pair.
# Note how the cutoffs are applied: cell (i, j) stores the joint probability
# P(Y < i-th quartile of Z and Z > j-th quartile of Y); P_Y and P_Z are the
# corresponding marginal probabilities (computed but not stored in the table).
probabilities <- matrix(NA, nrow = length(quartiles_Z), ncol = length(quartiles_Y))

for (i in 1:length(quartiles_Z)) {
  for (j in 1:length(quartiles_Y)) {
    P_Y <- length(Y[Y < quartiles_Z[i]]) / num_observations
    P_Z <- length(Z[Z > quartiles_Y[j]]) / num_observations
    P_YZ <- length(Y[Y < quartiles_Z[i] & Z > quartiles_Y[j]]) / num_observations

    probabilities[i, j] <- P_YZ
  }
}

# Create a data frame with the probabilities
prob_table <- data.frame(probabilities)
colnames(prob_table) <- c("1st Quartile Y", "2nd Quartile Y", "3rd Quartile Y", "4th Quartile Y")
rownames(prob_table) <- c("1st Quartile Z", "2nd Quartile Z", "3rd Quartile Z", "4th Quartile Z")

# Add the sum column
prob_table$Sum <- rowSums(probabilities)

# Print the probabilities table
print(prob_table)
##                1st Quartile Y 2nd Quartile Y 3rd Quartile Y 4th Quartile Y
## 1st Quartile Z         0.0167         0.0124         0.0082         0.0009
## 2nd Quartile Z         0.1562         0.1171         0.0794         0.0074
## 3rd Quartile Z         0.4212         0.3131         0.2168         0.0189
## 4th Quartile Z         0.5269         0.3909         0.2717         0.0230
##                   Sum
## 1st Quartile Z 0.0382
## 2nd Quartile Z 0.3601
## 3rd Quartile Z 0.9700
## 4th Quartile Z 1.2125
# Chi-square test on counts (the probabilities scaled back up by the 10,000
# observations); note that the Sum column is part of the table being tested
chisq.test(prob_table*10000)
## 
##  Pearson's Chi-squared test
## 
## data:  prob_table * 10000
## X-squared = 1.0726, df = 12, p-value = 1
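
As a sanity check on the same question (a minimal sketch, separate from the table above; med_Y and med_Z are illustrative names): since Y and Z were simulated independently, the joint probability of any pair of events should be close to the product of the marginals. For example, at the medians:

# Joint probability vs. product of marginals; both should be near 0.25
med_Y <- median(Y)
med_Z <- median(Z)
c(joint = mean(Y < med_Y & Z > med_Z),
  product = mean(Y < med_Y) * mean(Z > med_Z))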

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

Fisher’s Exact Test computes exact p-values and is typically used for small sample sizes; it can handle contingency tables of any size, but it becomes computationally intensive for larger tables. The Chi-Square Test, on the other hand, is an approximation commonly used for larger sample sizes: it compares the observed frequencies in a contingency table with the expected frequencies under the assumption of independence. Given the large number of observations here, the Chi-Square Test is the more appropriate choice.

# Perform Fisher's Exact Test (fisher.test expects integer counts, so the
# probability table is rounded, which triggers the warning below)
fisher_result <- fisher.test(prob_table)
## Warning in fisher.test(prob_table): 'x' has been rounded to integer: Mean
## relative difference: 0.6137244
print(fisher_result)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  prob_table
## p-value = 1
## alternative hypothesis: two.sided
# Perform Chi-Square Test (run on the probability table rather than counts,
# hence the approximation warning below)
chi_square_result <- chisq.test(prob_table)
## Warning in chisq.test(prob_table): Chi-squared approximation may be incorrect
print(chi_square_result)
## 
##  Pearson's Chi-squared test
## 
## data:  prob_table
## X-squared = 0.00010726, df = 12, p-value = 1