[Computational Statistics] Practice Assignment

# Set chunk options
knitr::opts_chunk$set(echo = T, comment = NA, warning = F, message = F)

# Set document options
options(scipen = 999)

# List necessary packages
packages <- c('dplyr', 'plotly')  # dplyr for data manipulation | plotly for visualization

# Install packages if not existing in local R
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

# Load libraries
invisible(lapply(packages, library, character.only = TRUE))

Exercise 1. Consider the pseudo code for the coin tossing experiment on slide 62 (a computer simulation experiment that is set up to approximate the probability of obtaining exactly k times head when tossing a fair coin n times). Implement this code as a function in the statistical software R.

####################
### Pseudo code
####################
# Coin (nr_tosses, nr_head, S)
#
# Input:
# S: integer defining the number of experiments
# nr_tosses: integer defining the number of coin tosses
# nr_head: integer defining the number of heads
# 
# Output: proportion of trials with exactly nr_head times heads
#
# Set out_vec a vector of length S
# For s in 1:S 
# Sample nr_tosses from (“heads”,”tails”) with replacement
# Count H, the number of occurrences of heads
# If H == nr_head then 
# Out_vec[s] = 1
#  Else 
#           Out_vec[s] = 0
# Return sum(Out_vec)/ S
####################

coin_tossing <- function(number_coins, number_heads, number_tossing, seed = 42) {
  
  ########################################
  # Argument description
  # *********************
  # Number of coin tosses: number_coins 
  # Number of heads: number_heads 
  # Number of experiments: number_tossing
  ########################################
  
  # Set seed
  set.seed(seed)
  
  # Create initial vector of tosses
  tosses <- rep(0, number_tossing)
  
  # Toss the coin and count the heads
  tosses <- sapply(tosses, function(i) {sum(rbinom(n = number_coins, size = 1, prob = 0.5))})
  #print(tosses)
  
  # Check if condition is satisfied
  check <- case_when(tosses == number_heads ~ 1, TRUE ~ 0)
  #print(check)
  
  # Calculate probability of output satisfying the condition
  output <- sum(check)/number_tossing
  #print(output)
  
  # Return output of function
  return(output)
  
}

#coin_tossing(n = 2, k = 1, number_tossing = 10)
coin_tossing(number_coins = 1000, 
             number_heads = 521, 
             number_tossing = 10e3) # S should be smaller than 10e6 or the process will take so long

[1] 0.011

Exercise 2. Consider the Monty Hall problem discussed during the lecture.

1. Write pseudo code for a Monty Hall computer simulation experiment

Funtion: monty_hall(number_doors, switch_decision, number_games)

Input:
  number_doors: number of doors in the game
  switch_decision: logical value as decision to switch or not
  number_games: number of games played

Output: probability of winning (a proportion of times of choosing winning door over number of games played)

Steps: 
  Initiate with the vector of zeroes (games_played) with length = number_games
  Repeat the following for each element i of games_played:
    Set series of doors (1:number_doors)
    Sample the original chosen door 
    Sample the winning door
    If the player switches the door (switch_decision = TRUE), check condition of winning and mark value for games_played[i]: 
      If the original chosen door is the winning door, then mark 0, else mark 1
    Else if the player does not switch, check condition of winning: 
      If the original chosen door is the winning door, then mark 1, else mark 0
    Calculate probability of winning = sum(games_played)/number_games

2. Implement the Monty Hall simulation experiment in R. Do not forget to include a seed to initialize the pseudo random number generator.

monty_hall <- function(number_doors = 3, switch_decision, number_games, seed = 12) {
  
  ########################################
  # Argument description
  # *********************
  # number_doors: number of doors in the game, default as 3
  # switch_decision: decision to switch or not 
  # number_games: number of games played
  ########################################
  
  # Set seed
  set.seed(seed)
  
  # Create initial vector of games played
  games_played <- rep(0, number_games)
  #cat('games_played =', games_played, '\n')
  
  # Set the doors
  doors <- 1:number_doors
  
  # Sample the chosen door
  chosen_door <- sapply(games_played, function(i) {sample(x = doors, size = 1)})
  #cat('chosen_door =', chosen_door, '\n')
  
  # Sample the winning door
  winning_door <- sapply(games_played, function(i) {sample(x = doors, size = 1)})
  #cat('winning_door =', winning_door, '\n')
  
  # Check if condition is satisfied
  if (switch_decision) {
    check <- case_when(chosen_door == winning_door ~ 0, TRUE ~ 1)
    
  } else {
    check <- case_when(chosen_door == winning_door ~ 1, TRUE ~ 0)

  }
  #cat('switch =', switch_decision, '- check =', check, '\n')
  
  # Calculate probability of outputs satisfying the condition
  output <- sum(check)/number_games
  #cat('sum(check) =', sum(check), '- number_games =', number_games, '\n')
  
  # Return output of function
  return(output)
  
}

monty_hall(switch_decision = T, number_games = 100)

[1] 0.65

monty_hall(switch_decision = F, number_games = 100)

[1] 0.35

3. Report the results of your simulation experiment. What do you conclude about the best strategy to follow (switch /stay /does not matter)? Make use of Tables and/or Figures to present your results.

games <- 1:100
switch_results <- sapply(games, function(n) {monty_hall(number_doors = 3, 
                                                        switch_decision = TRUE, 
                                                        number_games = n)})
stay_results <- sapply(games, function(n) {monty_hall(number_doors = 3, 
                                                        switch_decision = FALSE, 
                                                        number_games = n)})

results <- data.frame(n = games, switch = switch_results, stay = stay_results)

line_plot <- results %>% 
  plot_ly(type = 'scatter', mode = 'lines',
          x = ~n, 
          y = ~switch, name = 'Switch', line = list(color = '#ff9800')) %>%
  add_trace(y = ~stay, name = 'Stay', line = list(color = '#27aee1')) %>%
  layout(title = 'Winning probability',
         xaxis = list(title = 'Number of games played', showgrid = T),
         yaxis = list(title = 'Probability', showgrid = T)
  )

line_plot

Conclusion: The best strategy is to switch doors.

Exercise 3. [old exam question] Consider a variant of the famous three-doors (or, Monty Hall) problem with four instead of three doors. The variant of the problem is as follows. A player in a television game show has to choose among four closed doors. The host of the show tells him that behind one of these four doors, there is a big prize (e.g., a car) while behind the three other doors there is no prize (or a goat if you prefer). The player chooses one of the four doors to open. Before opening that door, the host of the show who knows what is behind the doors, opens another door (behind which there is no prize). Then, the player can choose to switch to another door or to stay.

What is the probability of winning the big prize for a player who chooses the strategy to switch?

monty_hall_variant2 <- function(number_doors = 4, switch_decision = T, number_games, seed = 123) {
  
  ########################################
  # Argument description
  # *********************
  # number_doors: number of doors in the game, default as 4
  # switch_decision: decision to switch or not, default as TRUE 
  # number_games: number of games played
  ########################################
  
  # Set seed
  set.seed(seed)
  
  # Create initial vector of games played
  games_played <- rep(0, number_games)
  #cat('games_played =', games_played, '\n')
  
  # Set the doors
  doors <- 1:number_doors
  
  # Sample the chosen door
  chosen_door <- sapply(games_played, function(i) {sample(x = doors, size = 1)})
  #cat('chosen_door =', chosen_door, '\n')
  
  # Sample the winning door
  winning_door <- sapply(games_played, function(i) {sample(x = doors, size = 1)})
  #cat('winning_door =', winning_door, '\n')
  
  # Sample the new door (switched)
  switch_door <- sapply(chosen_door, function(i) {sample(x = doors[-i], size = 1)})
  #cat('switch_door =', switch_door, '\n')
  
  # Check if condition is satisfied
  if (switch_decision) {
    check <- case_when(switch_door == winning_door ~ 1, TRUE ~ 0)
    
  } else {
    check <- case_when(chosen_door == winning_door ~ 1, TRUE ~ 0)

  }
  #cat('switch =', switch_decision, '- check =', check, '\n')
  
  # Calculate probability of outputs satisfying the condition
  output <- sum(check)/number_games
  #cat('sum(check) =', sum(check), '- number_games =', number_games, '\n')
  
  # Return output of function
  return(output)
  
}

monty_hall_variant2(number_games = 5)

[1] 0.6

monty_hall_variant2(number_games = 10)

[1] 0.4

monty_hall_variant2(number_games = 100)

[1] 0.17

monty_hall_variant2(number_games = 1000)

[1] 0.254

monty_hall_variant2(number_games = 1000)

[1] 0.254

monty_hall_variant2(number_games = 5000)

[1] 0.2518

games <- 1:200
switch_results <- sapply(games, function(n) monty_hall_variant2(number_games = n))

results <- data.frame(n = games, switch = switch_results)

line_plot <- results %>% 
  plot_ly(type = 'scatter', mode = 'lines',
          x = ~n, 
          y = ~switch, name = 'Switch', line = list(color = '#009933')) %>%
  layout(title = 'Winning probability',
         xaxis = list(title = 'Number of games played', showgrid = T),
         yaxis = list(title = 'Probability', showgrid = T)
  )

line_plot

Exercise 4. Consider the Birthday problem mentioned during the lecture (What is the probability that at least two people in this virtual room have the same birthday). Assume all days of the year are equally likely for a child to be born.

1. Implement a simulation experiment in R to generate birthdays for a group of n people. Do not forget to include a seed to initialize the pseudo random number generator.

birth_day_exp <- function(n, number_groups, seed = 51) {
  
  ########################################
  # Argument description
  # *********************
  # n: number of people in the group
  # number_groups: number of groups to check
  ########################################
  
  # Set seed
  set.seed(seed)
  
  # Create initial vector of group
  group <- rep(0, number_groups)
  #cat('group =', group, '\n')
  
  # Set the series of birthdays
  birth_days <- 1:365
  #cat('birth_days =', birth_days, '\n')
  
  # Sample the birthday and count the unique birthdays
  sampling <- sapply(group, function(i) length(unique(sample(x = birth_days, size = n, replace = T))) )
  #cat('sampling =', sampling, '\n')
  
  # Check if condition is satisfied
  check <- case_when(sampling < n ~ 1, TRUE ~ 0)
  #cat('check =', check, '\n')
  
  # Calculate probability of outputs satisfying the condition
  output <- sum(check)/number_groups
  #cat('sum(check) =', sum(check), '- number_groups =', number_groups, '\n')
  
  # Return output of function
  return(output)
  
}

birth_day_exp(n = 15, number_groups = 2000)

[1] 0.2535

birth_day_exp(n = 15, number_groups = 4000)

[1] 0.262

birth_day_exp(n = 40, number_groups = 4000)

[1] 0.8905

2. Use your birthday simulation R function to estimate the probability that two or more students of the Statistical computing class have the same Birthday for various class sizes. Make use of Tables and/or Figures to present your results.

class_sizes <- 5:200
same_birthday <- sapply(class_sizes, function(n) birth_day_exp(n = n, number_groups = 1000))

results <- data.frame(n = class_sizes, same = same_birthday)

line_plot <- results %>% 
  plot_ly(type = 'scatter', mode = 'lines',
          x = ~n, 
          y = ~same, name = 'Same Birthday', line = list(color = '#009933')) %>%
  layout(title = 'Probability of at least 2 students having the same birthday',
         xaxis = list(title = 'Class size', showgrid = T),
         yaxis = list(title = 'Probability', showgrid = T)
  )

line_plot

3. Give the analytical solution to the Birthday problem assuming n = 40 and compare the analytical result to the result obtained from your Monte Carlo simulation experiment.

Probability of no couple of people having the same birthday:

\[ \dfrac{365 \times 364 \times 363... \times 326}{365^{40}}=0.1087682 \]

Probability of at least 2 people having the same birthday:

\[ 1 - 0.1087682 = 0.8912318 \]

Comparison: The results obtained from analytical solution and simulation are the same (\(\approx 0.89\)).