library(tidyverse)
library(stringr)
library(ggdist)
library(broom)

Simulating a single die roll

#Simulating single die roll
sample(x = 1:6, size = 1)
## [1] 5
# Simulating two dice rolls
sample(x = 1:6, size = 2, replace = TRUE)
## [1] 1 1
#Use the sample function to simulate 100,000 dice rolls, storing the result in a vector called rolls. What proportion of rolls were each number?
#Simulating 100,000 rolls
nrolls <- 100000
rolls <- sample(x = 1:6, size = nrolls, replace = TRUE)


#What proportion of rolls were each number?
rollsTibble<-tibble(value=rolls)
rollsTibble %>% count(value) %>% mutate(perc=n/nrolls)
## # A tibble: 6 × 3
##   value     n  perc
##   <int> <int> <dbl>
## 1     1 16823 0.168
## 2     2 16497 0.165
## 3     3 16685 0.167
## 4     4 16437 0.164
## 5     5 16670 0.167
## 6     6 16888 0.169
#We can compute the lengths and values of runs of equal values in a vector using the rle() (run length encoding) function. A run consists of identical and contiguous values in a vector, which in this case represents consecutive die rolls of the same value.

runs<-rle(rolls)
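
To see what rle() returns, here is a toy example (this short vector is not part of the exercise):

# rle() collapses consecutive repeats into (length, value) pairs:
# here lengths 2, 1, 3, 1 and values 2, 5, 1, 6
rle(c(2, 2, 5, 1, 1, 1, 6))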

#Using the rle() function, what are the most consecutive rolls with the same die value?
runsTibble <- tibble(lengths=runs$lengths,
                   values=runs$values)
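
The longest run can be read off directly (this line is not in the original output):

max(runs$lengths)  # 7 for this simulation, matching the run-length table below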

#What is the total number of runs of each length?
freqRuns<-runsTibble %>% count(lengths)
freqRuns
## # A tibble: 7 × 2
##   lengths     n
##     <int> <int>
## 1       1 69396
## 2       2 11550
## 3       3  2002
## 4       4   286
## 5       5    58
## 6       6     6
## 7       7     4
#What is the ratio of the number of runs of length 2 compared to the number of runs of length 1? 

freqRuns$n[2]/freqRuns$n[1]
## [1] 0.1664361
#What about the number of runs of length 3 compared to 2?
freqRuns$n[3]/freqRuns$n[2]
## [1] 0.1733333

What are these ratios close to? They are close to 1/6: after any roll, the chance that the next roll shows the same value and extends the run is 1/6.
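
As a quick check (not part of the original exercise), all consecutive ratios can be computed at once:

# Each ratio of successive run counts should be close to 1/6 (the first entry is NA)
freqRuns %>% mutate(ratio = n / lag(n))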

Sampling fruits

#Randomly select 10 values from the fruit vector 🍇 🍉 🍓 🥝 🍒 that is available from the stringr package, and run your code several times. Does sample() use sampling with or without replacement by default?

sample(fruit, size=10)
##  [1] "purple mangosteen" "apricot"           "orange"           
##  [4] "jackfruit"         "canary melon"      "pear"             
##  [7] "mandarine"         "gooseberry"        "star fruit"       
## [10] "guava"
#Randomly select 10 values from the fruit vector, sampling with replacement. Resample 10 fruits until you obtain at least one duplicate fruit within a sample.

sample(fruit, size=10, replace=TRUE)
##  [1] "fig"          "blood orange" "plum"         "jackfruit"    "peach"       
##  [6] "coconut"      "kumquat"      "star fruit"   "rambutan"     "apple"
#Setting seed
#Set the seed value to 1989, and then sample 13 values from the fruit vector to reproduce the results below.
set.seed(1989)
sample(fruit, size=13)
##  [1] "salal berry"       "purple mangosteen" "durian"           
##  [4] "rambutan"          "breadfruit"        "currant"          
##  [7] "tamarillo"         "blackberry"        "raisin"           
## [10] "dragonfruit"       "chili pepper"      "avocado"          
## [13] "mango"

Does sample() use sampling with or without replacement by default? By default, sample() samples without replacement, which is why no fruit repeats within a single draw until replace = TRUE is set.
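
A quick way to convince yourself (a sketch, not part of the exercise) is to draw every fruit at once and check for duplicates:

# Without replacement, drawing all the fruits is just a reshuffle, so each name appears once
any(duplicated(sample(fruit, size = length(fruit))))  # FALSE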

Sampling from distributions

The exponential distribution, for example, is often used to model time-to-event data.

#Generate 100,000 values from a standard normal distribution, and calculate an estimate of the median (50th percentile) of the standard normal distribution using the quantile() function
set.seed(1994)
simValues<-rnorm(n=100000, mean=0, sd=1)

quantile(simValues, probs=0.50)
##         50% 
## -0.00310657
#Also calculate an estimate of the 97.5th percentile of the standard normal distribution using the quantile() function. Note that the exact value of the 97.5th percentile can be found using qnorm(p = 0.975).

quantile(simValues, probs=0.975)
##    97.5% 
## 1.962128
#You can get the exact percentile using
qnorm(p=0.975)
## [1] 1.959964

The uniform, exponential, and Weibull distributions can all be used to model waiting times or times to events of interest in various contexts.
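
For illustration (the parameter values here are arbitrary, not from the exercise), exponential and Weibull waiting times can be simulated with rexp() and rweibull():

# Exponential waiting times with a mean of 10 minutes
expTimes <- rexp(n = 100000, rate = 1/10)
# Weibull waiting times with shape 2 and scale 10
weibTimes <- rweibull(n = 100000, shape = 2, scale = 10)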

#Use Monte Carlo methods to answer the following:
#if Spongebob is waiting at the bus stop, what is his expected waiting time until the next bus comes?
wtimes<-tibble(spongeTimes=runif(n=100000, min=0, max=20))

mean(wtimes$spongeTimes)
## [1] 9.985058
#What is the median waiting time until the next bus comes for Spongebob?

median(wtimes$spongeTimes)
## [1] 9.964044
#Visualize spongebob's waiting times
wtimes %>% ggplot(aes(x=spongeTimes))+
  geom_histogram(color="#b26e2d",
                 fill = "#fff464")+
  labs(title = "Spongebob's waiting times",
       x="Time (minutes)", 
       y="Frequency")+
  theme_bw()

#What is Spongebob’s total expected waiting time until he catches a bus back to the Krusty Krab with Patrick?
wtimes <- wtimes %>% mutate(patrickTimes = runif(n = 100000, min = 0, max = 20),
                            totalTime = patrickTimes + 10)
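
To answer the expected-waiting-time question directly, one option (not shown in the original output) is to average the simulated totals; under the definition above, totalTime is Uniform(10, 30) with a theoretical mean of 20 minutes:

mean(wtimes$totalTime)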

#Visualize Patrick's waiting times
wtimes %>% ggplot(aes(x=patrickTimes))+
  geom_histogram(color="#f5a3a5",
                 fill = "#a988bd")+
  labs(title = "Patrics's waiting times",
       x="Time (minutes)", 
       y="Frequency")+
  theme_bw()

#What is Spongebob’s median total waiting time until he catches a bus back to the Krusty Krab with Patrick?
#Visualize spongebob's total waiting times
wtimes %>% ggplot(aes(x=totalTime))+
  geom_histogram(color="#b26e2d",
                 fill = "#fff464")+
  labs(title = "Spongebob's totalxwaiting times",
       x="Time (minutes)", 
       y="Frequency")+
  theme_bw()

#Estimating median
wtimes %>% pull(totalTime) %>% median
## [1] 19.98667
wtimes %>% pull(totalTime) %>% quantile(probs=0.50)
##      50% 
## 19.98667
#Construct and interpret a 95% confidence interval for the total waiting time in minutes for Spongebob until a bus comes for him and Patrick to take to the Krusty Krab.
lowerBound<-wtimes %>% pull(totalTime) %>% quantile(probs=0.025)
upperBound<-wtimes %>% pull(totalTime) %>% quantile(probs=0.975)

We are 95% confident that, in total, Spongebob will wait between roughly 10.5 and 29.5 minutes until he and Patrick are on the bus heading to the Krusty Krab (the exact endpoints vary slightly with the simulated sample).
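
As a sanity check (not in the original write-up), the exact quantiles of the Uniform(10, 30) total waiting time can be computed directly:

# Exact 2.5th and 97.5th percentiles of Uniform(10, 30): 10.5 and 29.5
qunif(c(0.025, 0.975), min = 10, max = 30)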

Permutation and randomization tests

# Simulating participant blood-pressure changes
set.seed(1994)
n1 <- 8
n2 <- 8
trialData <- tibble(Participant = 1:(n1 + n2),
                    Treatment = rep(c("Drug", "Placebo"), times = c(n1, n2)),
                    Change = c(rcauchy(n = n1, location = -3.5, scale = 5), 
                               rcauchy(n = n2, location = 0, scale = 1)))

# Two-sample t-test
tResult <- t.test(x = trialData$Change[which(trialData$Treatment == "Drug")], 
              y = trialData$Change[which(trialData$Treatment == "Placebo")],
              alternative = "less")

We can visualize the data using raincloud plots.

# Plotting rain cloud plots showing differences between groups
trialData %>% ggplot(aes(x = Treatment, y = Change)) +
  ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) + 
    geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
                    color = "black") +
    coord_flip() +
    labs(y = "Change in systolic blood-pressure (mmHg)",
         title = "Comparison of blood-pressure changes") + 
  theme_bw() +
  theme(legend.position = "none")

# Generating all permutations
perms <- combn(1:(n1 + n2), m = n1)

# Calculating number of permutations
nperms <- choose(n1 + n2, n1)
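
For intuition about what combn() and choose() are doing, here is a toy example with 4 participants split into groups of 2 (not part of the exercise):

combn(4, m = 2)   # a 2 x 6 matrix: each column is one way to pick the first group
choose(4, 2)      # 6, the number of columns above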

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- t.test(x = trialData$Change[perms[, p]], 
                      y = trialData$Change[-c(perms[, p])])$statistic
}


#Create a histogram displaying the null distribution obtained for the permutation test. What is the distribution centered at?
tibble(t=permTs) %>% ggplot(aes(x=t))+
  geom_histogram(color="white")+ 
  geom_vline(xintercept=quantile(permTs, probs=0.05), 
color="red", linetype="dotted")+
  geom_vline(xintercept=tResult$statistic, 
color="blue", linetype="solid")+
  labs(title = "Permutation distribution",
       y="Frequency")+
  theme_bw()

# Calculate the p-value for the permutation test. Here it is the proportion of permutation
# test statistics less than or equal to our observed t-test statistic; it is close to the
# t-test's own p-value.
permPvalue<-mean(permTs<=tResult$statistic)
permPvalue
## [1] 0.04048174
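
To make the comparison in the comment above explicit, the classical t-test p-value can be printed alongside (value not reproduced here):

tResult$p.value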

Since the p-value of 0.0405 is less than 0.05, we reject the null hypothesis; that is, we have evidence at the 5% significance level that the proposed drug lowers blood pressure more on average than the placebo.

How many total ways are there to arrange the participants between the two groups? choose(16, 8) = 12,870.

What is the distribution centered at? The distribution is centered at 0, as expected under the null hypothesis of no difference between the groups.

Do we reject or fail to reject the null at the 5% significance level? REJECT

Nonparametric Bootstrapping

# Simulating data from distribution
set.seed(1989)
n <- 30
observed <- rchisq(n = n, df = 4) + rnorm(n = n, mean = 0, sd = 1)

#Create a raincloud plot to visualize the simulated data. Describe the data in terms of its number of peaks (e.g., unimodal, bimodal, etc.) and its symmetry.
tibble(Value=observed) %>% ggplot(aes(y=Value)) +
  ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) + 
    geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
                    color = "black") +
    coord_flip() +
    labs(y = "Value",
         title = "Simulated values from a non-normal distribution") + 
  theme_bw() +
  theme(legend.position = "none")

#What is the sample size for this data set?
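
The sample size can be confirmed directly (not shown in the original output):

length(observed)  # 30, matching the n set above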

#What is the sample median for this data?
median(observed)
## [1] 3.448628
# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = n, ncol = B)

# Sampling with replacement B times
for(b in 1:B) {
  boots[, b] <- observed[sample(1:n, size = n, replace = TRUE)]
}
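
As an aside (not part of the original), the same resampling can be written without an explicit loop; a sketch:

# boots <- replicate(B, sample(observed, size = n, replace = TRUE))
# replicate() returns an n-by-B matrix, equivalent to the loop above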


#Using the generated bootstrap samples, create a bootstrap distribution of sample medians, and visualize this distribution using a histogram.

#Instantiating vector for bootstrap medians
bootMedians<-vector(length=B)

#Calculating the median for each of the resamples
for(b in 1:B) {
  bootMedians[b]<-median(boots[,b])
}

#Creating a histogram
tibble(Median=bootMedians) %>% ggplot(aes(x=Median))+
  geom_histogram(color="white")+ 
  labs(title = "Distribution of nonparametric bootstrap medians",
       y="Frequency")+
  theme_bw()

#Nonparametric estimate of the SE of the sample median
  SEestimate<-sd(bootMedians)
  SEestimate
## [1] 0.9040296
#Use the bootstrap samples to obtain a nonparametric 95% confidence interval for the population median.
  lowerBoundMed<-quantile(bootMedians, probs=0.025)
  upperBoundMed<-quantile(bootMedians, probs=0.975)

We are 95% confident that the true population median is between 2.09 and 5.07.

#Creating a histogram
tibble(Median=bootMedians) %>% ggplot(aes(x=Median))+
  geom_histogram(color="white")+
  geom_vline(xintercept=lowerBoundMed, 
color="dodgerblue", linetype="solid")+
  geom_vline(xintercept=upperBoundMed, 
color="dodgerblue", linetype="solid")+
  labs(title = "Distribution of nonparametric bootstrap medians",
       y="Frequency")+
  theme_bw()

Parametric Bootstrap

#Generate B=10,000 samples each of size 30 from a normal distribution using the estimated mean and standard deviation from the observed data set, and visualize this distribution using a histogram

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
paramboots <- matrix(NA, nrow = n, ncol = B)
xBar<-mean(observed)
s<-sd(observed)

# Simulating a normal set of 30 values, B times
for(b in 1:B) {
  paramboots[, b] <- rnorm(n = n, mean = xBar, sd = s)
}

#Instantiating vector for bootstrap medians
bootparamMedians<-vector(length=B)

#Calculating median for each simulated data set
for(b in 1:B) {
  bootparamMedians[b]<-median(paramboots[,b])
}
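
As with the nonparametric case, the median loop could be replaced by a one-liner; a sketch (not part of the original):

# bootparamMedians <- apply(paramboots, 2, median)  # column-wise medians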

#Obtain a parametric bootstrap estimate of the standard error of the sample median.
  SEparamestimate<-sd(bootparamMedians)
  SEparamestimate
#Obtain a parametric bootstrap 95% confidence interval for the sample median.
  lowerBoundparamMed<-quantile(bootparamMedians, probs=0.025)
  upperBoundparamMed<-quantile(bootparamMedians, probs=0.975)
  
#Creating a histogram
tibble(Median=bootparamMedians) %>% ggplot(aes(x=Median))+
  geom_histogram(color="white")+
  geom_vline(xintercept=lowerBoundparamMed, 
color="dodgerblue", linetype="solid")+
  geom_vline(xintercept=upperBoundparamMed, 
color="dodgerblue", linetype="solid")+
  labs(title = "Distribution of Parametric bootstrap medians",
       y="Frequency")+
  theme_bw()

We are 95% confident that the true population median is between 2.23 and 6.21. Note that this parametric interval relies on a normality assumption that the raincloud plot of the observed (skewed) data calls into question.