Packages needed for activity

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringr)
library(ggdist)
library(broom)

Simulating rolls of a six sided die

rolls <- sample( x = 1:6, size = 100000, replace = TRUE)
runs <- rle(rolls)

runsTidy <- tibble(lengths = runs$lengths,
                   values = runs$values)

#proportions of rolls
rollsTidy <-tibble(roll = rolls)

rollsTidy %>% count(roll)
## # A tibble: 6 × 2
##    roll     n
##   <int> <int>
## 1     1 16497
## 2     2 16782
## 3     3 16732
## 4     4 16660
## 5     5 16564
## 6     6 16765
#What are the most consecutive rolls with the same die value

runsCount <- runsTidy %>% count(lengths)

#what is the ratio of the number of runs of length 2 compared to runs with lengths 1

runsCount$n[2]/runsCount$n[1]
## [1] 0.1662978

Ratios are close to 1/6

sampling fruits

#sampling 10 fruit
sample(fruit, size = 10)
##  [1] "raspberry"    "tamarillo"    "nut"          "mango"        "grape"       
##  [6] "kumquat"      "olive"        "kiwi fruit"   "pamelo"       "passionfruit"
#sampling with replacement
sample(fruit, size = 10, replace = TRUE)
##  [1] "clementine" "coconut"    "physalis"   "tangerine"  "pineapple" 
##  [6] "ugli fruit" "star fruit" "orange"     "blueberry"  "watermelon"
#sampling with seed = 1989
set.seed(1989)
sample(fruit, size = 13)
##  [1] "salal berry"       "purple mangosteen" "durian"           
##  [4] "rambutan"          "breadfruit"        "currant"          
##  [7] "tamarillo"         "blackberry"        "raisin"           
## [10] "dragonfruit"       "chili pepper"      "avocado"          
## [13] "mango"
set.seed(1995)
sinValues <- rnorm(n=100000, mean = 0, sd = 1)
quantile(sinValues, probs = .5)
##          50% 
## -0.004720696
#value of 97.5th percentile
quantile(sinValues, probs = .975)
##    97.5% 
## 1.958377
wtimes<- tibble(bus1 = runif(n = 100000, min = 0, max = 20))

mean(wtimes$bus1)
## [1] 9.983236
median(wtimes$bus1)
## [1] 9.983259
#visualize the dist
wtimes %>% ggplot(aes(x = bus1)) +
  geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
  labs(title = "wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes <- wtimes %>% mutate(patrickTimes = runif(n = 100000, min = 0, max = 30), totalTime = patrickTimes + 10 + bus1)

#patricks waiting time
wtimes %>% ggplot(aes(x = patrickTimes)) +
  geom_histogram(bins = 20, color = "black", fill = "#f5a3a5") +
  labs(title = "Patricks wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes %>% ggplot(aes(x = totalTime)) +
  geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
  labs(title = "total wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes %>% pull(totalTime) %>% median()
## [1] 34.96549
lowerbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .025)
upperbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .975)

we are 95% confident that spongebobs total waiting time will be between 15.43 and 54.4 minutes till he is on the bus to Krusty Krabs with Patrick.

#permutation

# Simulating participant blood-pressure changes
set.seed(1994)
n1 <- 8
n2 <- 8
trialData <- tibble(Participant = 1:(n1 + n2),
                    Treatment = rep(c("Drug", "Placebo"), each = c(n1, n2)),
                    Change = c(rcauchy(n = n1, location = -3.5, scale = 5), 
                               rcauchy(n = n2, location = 0, scale = 1)))

# Two-sample t-test
tResult <- t.test(x = trialData$Change[which(trialData$Treatment == "Drug")], 
              y = trialData$Change[which(trialData$Treatment == "Placebo")],
              alternative = "less")
# Plotting rain cloud plots showing differences between groups
trialData %>% ggplot(aes(x = Treatment, y = Change)) +
  ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) + 
    geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
                    color = "black") +
    coord_flip() +
    labs(y = "Change in systolic blood-pressure (mmHg)",
         title = "Comparison of blood-pressure changes") + 
  theme_bw() +
  theme(legend.position = "none")

# Generating all permutations
perms <- combn(1:(n1 + n2), m = n1)

# Calculating number of permutations
nperms <- choose(n1 + n2, n1)

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- t.test(x = trialData$Change[perms[, p]], 
                      y = trialData$Change[-c(perms[, p])])$statistic
}

#Create a histogram displaying the null distribution obtained for the permutation test. What is the distribution centered at?
tibble(t = permTs) %>% ggplot(aes(x = t)) +
  geom_histogram(color = "white") +
  geom_vline(xintercept = quantile(permTs, probs = 0.05), color = "red", linetype = "dotted") +
  geom_vline(xintercept = tResult$statistic, color = "blue", linetype = "solid") +
  labs(title = "permitation null distribution",
       y = "frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

permPvalue <- mean(permTs <= tResult$statistic)

There are 12870 was to arrange the participants between the two groups

since the the p-value of 0.0404817 is less than .05 we reject the null hypothisis

##bootstapping

# Simulating data from distribution
set.seed(1989)
n <- 30
observed <- rchisq(n = n, df = 4) + rnorm(n = n, mean = 0, sd = 1)
#Create a raincloud plot to visualize the simulated data. Describe the data in terms of its number of peaks (e.g., unimodal, bimodal, etc.) and its symmetry.

tibble(Value = observed) %>% ggplot(aes(y = Value)) +
  ggdist::stat_halfeye(ajust = .5, width = 2*.3, .width = c(0.5, 1)) +
  geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1, color = "black") +
  coord_flip() +
  labs(y = "Value",
       title = "simulated values from non-normal distribution") +
  theme_bw() +
  theme(legend.position = "none")

#sample median 
median(observed)
## [1] 3.448628
# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = n, ncol = B)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- observed[sample(1:n, size = n, replace = TRUE)]
}

#Using the generated bootstrap samples, create a bootstrap distribution of sample medians, and visualize this distribution using a histogram
# Instantiating matrix for bootstrap samples
bootsMedian <- vector(length = 8)

# Sampling with replacement B times
for(b in 1:B) {
bootsMedian[b] <- median(boots[, b])
}

tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
  geom_histogram(color = "white") +
  labs(title = "bootstrap distribution",
       y = "frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#nonparamatric estimate of the SE of the sample median

SEestimate <- sd(bootsMedian)
SEestimate
## [1] 0.9040296
#95% confidance interval
lowerBound <- quantile(bootsMedian, probs = .025)
upperBound <- quantile(bootsMedian, probs = .975)

We are 95% confidant that the true median is between 2.0945482 and 5.0719181

tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
  geom_histogram(color = "white") +
  geom_vline(xintercept = lowerBound, color = "blue", linetype = "solid") +
  geom_vline(xintercept = upperBound, color = "blue", linetype = "solid") +
  labs(title = "bootstrap distribution",
       y = "frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##parametric bootstrap

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
paraBoots <- matrix(NA, nrow = n, ncol = B)
xBar <- mean(observed)
s <- sd(observed)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- rnorm(n = n, mean = xBar, sd = s)
}

bootParaMedian <- vector(length = 8)

for(b in 1:B) {
bootParaMedian[b] <- median(paraBoots[, b])
}

#codes not working 

#tibble(Median = bootParaMedian) %>% ggplot(aes(x = Median)) +
  #geom_histogram(color = "white") +
  #labs(title = "non parametric bootstrap distribution",y = "frequency") +
  #theme_bw()

#SEestimatePara <- sd(bootParaMedian)
#SEestimatePara
#95% confidance interval
#lowerBoundPara <- quantile(bootParaMedian, probs = .025)
#upperBoundPara <- quantile(bootParaMedian, probs = .975)