Setup

Loading all the required libraries

library(tidyverse)
library(stringr)
library(ggdist)
library(broom)

Simulating a fair six-sided die single roll:

sample(x = 1 : 6, size = 1)

## [1] 1

Simulating a fair six-sided die two rolls:

sample(x = 1 : 6, size = 2, replace = TRUE)

## [1] 6 5

simulating a large number of rolls

# Use the sample function to simulate 100000 dice rolls, storing the result in a vector called rolls. What proportion of rolls were each number?

Rolls <- sample(x = 1 :6, size = 100000, replace = TRUE)


Runs <- rle(Rolls)


runsTidy <- tibble(lengths = Runs$lengths,
                   values = Runs$values)

rollsTidy <- tibble(Rolls = Rolls)

rollsTidy %>% count(Rolls)

## # A tibble: 6 × 2
##   Rolls     n
##   <int> <int>
## 1     1 16575
## 2     2 16937
## 3     3 16716
## 4     4 16599
## 5     5 16629
## 6     6 16544

# Using the rle() function, what are the most consecutive rolls with the same die value?
# What is the total number of runs of each length?
runsTidy %>% count(lengths)

## # A tibble: 6 × 2
##   lengths     n
##     <int> <int>
## 1       1 69752
## 2       2 11525
## 3       3  1848
## 4       4   329
## 5       5    58
## 6       6     8

What is the total number of runs of each length?

runCounts <- runsTidy %>% count(lengths)
runCounts

## # A tibble: 6 × 2
##   lengths     n
##     <int> <int>
## 1       1 69752
## 2       2 11525
## 3       3  1848
## 4       4   329
## 5       5    58
## 6       6     8

What is the ratio of the number of runs of length 2 compared to the number of runs of length 1? What about the number of runs of length 3 compared to 2?

runCounts$n[2] / runCounts$n[1]

## [1] 0.1652282

runCounts$n[3] / runCounts$n[2]

## [1] 0.1603471

Sampling from other vectors in R

# Randomly select 10 values from the fruit vector that is available from the stringr package, and run your code several times. Does sample() use sampling with or without replacement by default?

selectFruits <- sample(fruit , size = 10)
selectFruits

##  [1] "dragonfruit" "date"        "apricot"     "tamarillo"   "eggplant"   
##  [6] "bilberry"    "salal berry" "currant"     "nut"         "watermelon"

selectFruits2 <- sample(fruit, size = 10, replace = TRUE)
selectFruits2

##  [1] "avocado"    "cherry"     "cherry"     "redcurrant" "elderberry"
##  [6] "clementine" "bilberry"   "raspberry"  "plum"       "raspberry"

#  Set the seed value to 1989, and then sample 13 values from the fruit vector to reproduce the results below.
set.seed(1989)
selectFruits3 <- sample(fruit, size = 13)
selectFruits3

##  [1] "salal berry"       "purple mangosteen" "durian"           
##  [4] "rambutan"          "breadfruit"        "currant"          
##  [7] "tamarillo"         "blackberry"        "raisin"           
## [10] "dragonfruit"       "chili pepper"      "avocado"          
## [13] "mango"

Generate 100,000 values from a standard normal distribution, and calculate an estimate of the median (50th percentile) of the standard normal distribution using the quantile() function.

set.seed(1994)
simData <- tibble(Value = rnorm(n = 100000, mean = 0, sd = 1))

quantile(simData$Value, probs = 0.50)

##         50% 
## -0.00310657

quantile(simData$Value, probs = 0.975)

##    97.5% 
## 1.962128

qnorm(p = 0.975)

## [1] 1.959964

Spongbob Bus example

Waitingtimes <- tibble(Spongebob = runif(n = 10000, min = 0, max = 20), Person = "Spongebob")

Waitingtimes%>% ggplot(aes(x = Spongebob)) + geom_histogram(color = "#b26e2d", fill = "#fff464") +
  labs(title = "Uniform distribution: spongebob's waiting time", y = "frequency") + theme_bw()

mean(Waitingtimes$Spongebob)

## [1] 10.00163

Waitingtimes %>% pull(Spongebob) %>% mean()

## [1] 10.00163

Waitingtimes %>% pull(Spongebob) %>% median()

## [1] 10.00601

What is Spongebob’s total expected waiting time until he catches a bus back to the Krusty Krab with Patrick?

patrickWaits <- tibble(Patrick = runif(n = 10000, min = 0, max = 30))

patrickWaits %>% ggplot(aes(x = Patrick)) + geom_histogram(color = "#d2dc23", fill = "#f5a3a5") +
  labs(title = "Uniform distribution: patricks waiting time", y = "frequency") + theme_bw()

# Total time = (Patrick wait time at bus stop + 10 minutes + spongebob bus wait time)

allTimes <-  Waitingtimes %>% bind_cols(patrickWaits) %>% 
  mutate(Total = Patrick + 10 + Spongebob)

allTimes %>% pull(Total) %>% mean()

## [1] 35.07758

allTimes %>% pull(Total) %>% median()

## [1] 35.05496

allTimes %>% 
  ggplot(aes(x = Total)) +
  geom_histogram(color = "#b26e2d", fill = "#fff464") +
  labs(title = "Uniform distribution: spongebob's total waiting time", y = "frequency") + theme_bw()

lowerBound <- allTimes %>% pull(Total) %>% quantile(probs = 0.025)
upperBound <- allTimes %>% pull(Total) %>% quantile(probs = 0.975)

We are 95% confident that Spongebob, in total, will wait between 15.4 and 54.66 untill him and partick are heading to Krusty Krabs

Permutation & randomization tests

# Simulating participant blood- pressure changes

set.seed(1994)
n1 <- 8
n2 <- 8 
trialData <- tibble(Participant = 1:(n1+n2), 
                    Treatment = rep(c("Drug", "Placebo"), each = c(n1,n2)),
                    Change = c(rcauchy(n = n1, location = -3.5, scale = 5),
                               rcauchy(n = n2, location = 0, scale = 1)))

# Two-sample t-test
tResult <- t.test(x = trialData$Change[which(trialData$Treatment == "Drug")],
                  y = trialData$Change[which(trialData$Treatment == "Placebo")],
                  alternative = "less")

# Plotting rain cloud plots showing differences between groups
trialData %>% ggplot(aes(x=Treatment, y = Change)) +
  ggdist:: stat_halfeye(adjust = .5, width = 2* .3, width = c(0.5,1)) +
   geom_boxplot(width = .3, outlier.shape = NA) + 
  ggdist::stat_dots(slide = "left", dotsize=6, justification = 1.05, binwidth = .1,
                    color = "black") +
  coord_flip()+
  labs(y = "Change in systolic blood-pressure (mmHg)",
       title = "Comparision of blood-pressure changes")+
  theme_bw()+
  theme(legend.position = "none")

# Generating all permutations
perms <- combn(1:(n1+n2), m = n1)

#Calculating number of permutations
nperms <- choose(n1+n2, n1)

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- t.test(x= trialData$Change[perms[, p]],
                      y = trialData$Change[-c(perms[,p])])$statistic
}

# Create a histogram displaying the null distribution obtained for thee permutation test. What is the distribution centered at?
tibble(t = permTs) %>% ggplot(aes(x=t))+
  geom_histogram(color="white") +
  geom_vline(xintercept = quantile(permTs, probs = 0.05),
             color = "red", linetype = "dotted") +
  geom_vline(xintercept = tResult$statistic,
             color = "blue", linetype = "solid") +
  labs(title = "Permutation null distribution",
       y = "Frequency") +
  theme_bw()

# Calculate the p-value for the permutation test. In this case, the p-value can be calculated as the proportion of permutation test statistics less than or equal to our observed t-test statistic.

permPvalue <- mean(permTs <- tResult$statistic)
permPvalue

## [1] -1.853983

We reject the values.

Bootstrap

# Simulating data from distribution
set.seed(1989)
n <- 30
observed <- rchisq(n = n, df = 4) + rnorm(n = n, mean = 0, sd = 1)

# Create a raincloud plot to visualize the simulated data. Describe the data in terms of its number of peaks (e.g., unimodal, bimodal, etc.) and its symmetry.

tibble(Value = observed) %>% ggplot(aes(y = Value)) +
  ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) + 
    geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
                    color = "black") +
    coord_flip() +
    labs(y = "Value",
         title = "Simulate values for a non-normal distribution") + 
  theme_bw() +
  theme(legend.position = "none")

median(observed)

## [1] 3.448628

It is no symmetric bimodal distribution of data.

# Sample median
median(observed)

## [1] 3.448628

## [1] 3.448628
# Number of bootstrap samples
B<-10000

# Instantiating matrix for bootstrap samples
boots <- matrix (NA, nrow=n, ncol=B)

#Sampling with replacement B times
for(b in 1:B) {
  boots[, b] <- observed[sample(1:n, size= n, replace = TRUE)]
}

#using the generated bootsrap samples, create a bootstrap distribution of sample medians, and visualize this distribution using a histogram.

#Instantiating vector for bootstrap medians
bootMedians <- vector(length= B)

# Sampling with replacement B times
for (b in 1:B) {
boots[, b] <- observed [sample(1:n, size = n, replace = TRUE)]
}
#Using the generated bootstrap samples, create a bootstrap distribution of sample medians, visualize this distribution using a histogram.

# Instantiating vector for bootstrap medians
bootMedians <- vector (length = B)

# Sampling with replacement B times
for (b in 1:B) {
bootMedians [b] <- median (boots [, b])
}
# Creating a histogram
tibble (Median = bootMedians) %>% ggplot(aes (x = Median)) +
  geom_histogram(color="white")+
  labs(title="Distribution of bootstrap medians",
     У="Frequency")+
  theme_bw()

# Nonparametric estimate of the SE of the sample median
SEestimate <- sd (bootMedians)
SEestimate

## [1] 0.9075281

# Use the bootstrap samples to obtain a nonparametric 95% confidence interval for the population median.
lowerBoundMed <- quantile(bootMedians, probs = 0.025) 
upperBoundMed <- quantile (bootMedians, probs = 0.975)

We are 95% confident that the true median is between 2.09 and 5.07.

# Creating a histogram
tibble (Median = bootMedians) %>% ggplot(aes(x = Median))+
  geom_histogram(color = "white") +
  geom_vline(xintercept = lowerBoundMed, 
              color = "dodgerblue", linetype="solid")+
  geom_vline(xintercept = upperBoundMed,
             color ="dodgerblue" ,linetype="solid") +
labs(title ="Distribution of nonparametric bootstrap medians",
     у ="Frequency")+
  theme_bw()

# Generate B=10,000 samples each of size 30 from a normal distribution using the estimated mean and standard deviation from the observed datset, and visualize this distribution using a histogram.

# Number of bootstrap samples
B < - 10000

## [1] FALSE

#Instantiating matrix for bootstrap samples
paramBoots <- matrix(NA, nrow = n, ncol = B)
XBar <- mean(observed)
s <- sd(observed)

# Simulating a normal set of n values, B times

for(b in 1:B){
  paramBoots[, b] <- rnorm(n = n, mean = XBar, sd = s)
}

# Instantiating vector for bootstrap medians
bootParamMedians <- vector(length = B)

#Calculating median for each simulated data set
for(b in 1:B) {
bootParamMedians[b] <- median(paramBoots[, b])
}

# Creating a histogram
tibble (Median = bootParamMedians) %>% ggplot(aes(x = Median))+
  geom_histogram(color = "white")+
  labs(title= "Distribution of parametric bootstrap nedians",
       y="Frequency") +
  theme_bw()

# Nonparametric estimate of the SE of the sample median
SEparamEstimate <- sd(bootParamMedians)
SEparamEstimate

## [1] 1.02091

# Use the bootstrap samples to obtain a nonparametric 95% confidence interval for the population median.
lowerBoundParamMed <- quantile(bootParamMedians, probs = 0.025) 
upperBoundParamMed <- quantile(bootParamMedians, probs = 0.975)

# Creating a histogram
tibble (Median = bootParamMedians) %>% ggplot(aes(x = Median))+
  geom_histogram(color = "white") +
  geom_vline(xintercept = lowerBoundParamMed, 
              color = "dodgerblue", linetype="solid")+
  geom_vline(xintercept = upperBoundParamMed,
             color ="dodgerblue" ,linetype="solid") +
  labs(title ="Distribution of parametric bootstrap medians",
       у ="Frequency")+
  theme_bw()

Activity-11

Sujith Prakash Parsa

2022-11-22