DATA 605_Week9_Discussion Questions

(IP, Central Limit Theorem, pg. 339)

11) Write a computer program to simulate 10,000 Bernoulli trials with probability .3 for success on each trial. Have the program compute the 95 percent confidence interval for the probability of success based on the proportion of successes. Repeat the experiment 100 times and see how many times the true value of .3 is included within the confidence limits.

set.seed(1248)
n = 10000  # number of trials per experiment
p = .3  # probability of success of any trial
experiments <- 100  # number of experiments

# Define function to simulate 10k trials per experiment for 100 experiments
Bernoulli <- function() {
  record <- vector(mode = 'numeric', length = experiments)
  
# Replicate experiment
  for(i in 1:experiments){
    trials <- rbinom(n = n, size = 1, p = p)
  
# Log number of trials that are successes in each trial
    success <- 0
    for(trial in trials){
      if(trial == 1){
        success = success + 1
      }
      
# Calculcate per-experiment success rate over 10k trials and capture in vector  
    }
    record[i] <- success/length(trials)
  }
  return(record)
}

# Conduct 10 experiments and collect results in dataframe
r_vec <- Bernoulli()
r_df <- as.data.frame(r_vec)
head(r_df)

##    r_vec
## 1 0.3003
## 2 0.3115
## 3 0.2930
## 4 0.3044
## 5 0.2986
## 6 0.3004

# Define histogram of experiment results
g <- ggplot(data = r_df) +

# Define chart axes and title
  scale_y_continuous(name = "Count of experiments", breaks = c(5, 10, 15), limits = c(0, 15)) +
  ggtitle("Histogram: 100 experiments of 10k Bernoulli trials each") +
  xlab("Rate of success")

# Plot
g +

# As histogram
  geom_histogram(aes(x = r_vec), binwidth = .001) +
  
# With visual theme  
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

# Calculate lower and upper bounds of 95% confidence interval
lower = p - qnorm(.975) * sqrt((p) * (1 - p) / n)
upper = p + qnorm(.975) * sqrt((p) * (1 - p) / n)

# Assign marker values conditional on whether outside of CI (A) or within CI (B)
r_df$r_CI <- 
  case_when(r_df$r_vec < lower | r_df$r_vec > upper ~ "A",
            r_df$r_vec >= lower & r_df$r_vec <= upper ~ "B")

# Determine how many of the 100 experiments fall within the confidence interval
r_test <- length(which(r_df >= lower & r_df <= upper))
r_test

## [1] 92

# Convert marker values to factor
r_df$r_CI <- as.factor(r_df$r_C)

# Vizualize proportion of experiments within 95% confidence interval
ggplot(data = r_df, fill = r_vec) +

# Define chart axes and title
  scale_y_continuous(name = "Count of experiments", breaks = c(5, 10, 15), limits = c(0, 15)) +
  ggtitle("Histogram: 100 experiments of 10k Bernoulli trials each") +
  xlab("Rate of success") +

# Define histogram
  geom_histogram(aes(x = r_vec, fill = r_CI), binwidth = .001) +

# Visualize CI with hue and reference x-intercerpt lines 
  scale_fill_manual(values = c("gray78", "grey28")) +
  geom_vline(aes(xintercept = lower, color = "red", linetype = "dotted")) +
  geom_vline(aes(xintercept = upper, color = "red", linetype = "dotted")) +
  geom_text(aes(x = .3, label = "92 experiments within .95 CI", y = 15), color = "grey28", text = element_text(size = 8), hjust = .5) +
  geom_text(aes(x = lower, label = "\nlower bound = .291", y = 10), color = "red", angle = 90, text = element_text(size = 8)) +
  geom_text(aes(x = upper, label = "upper bound = .309\n", y = 10), color = "red", angle = 90, text = element_text(size = 8)) +

# Set visual theme  
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

## Warning: Ignoring unknown parameters: text

## Warning: Ignoring unknown parameters: text

## Warning: Ignoring unknown parameters: text

DATA 605_Week9_Discussion Questions

Jeremy O’Brien

October 22, 2018

(IP, Central Limit Theorem, pg. 339)