Packages needed for activity
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
library(ggdist)
library(broom)
Simulating rolls of a six sided die
rolls <- sample( x = 1:6, size = 100000, replace = TRUE)
runs <- rle(rolls)
runsTidy <- tibble(lengths = runs$lengths,
values = runs$values)
#proportions of rolls
rollsTidy <-tibble(roll = rolls)
rollsTidy %>% count(roll)
## # A tibble: 6 × 2
## roll n
## <int> <int>
## 1 1 16497
## 2 2 16782
## 3 3 16732
## 4 4 16660
## 5 5 16564
## 6 6 16765
#What are the most consecutive rolls with the same die value
runsCount <- runsTidy %>% count(lengths)
#what is the ratio of the number of runs of length 2 compared to runs with lengths 1
runsCount$n[2]/runsCount$n[1]
## [1] 0.1662978
Ratios are close to 1/6
#sampling 10 fruit
sample(fruit, size = 10)
## [1] "raspberry" "tamarillo" "nut" "mango" "grape"
## [6] "kumquat" "olive" "kiwi fruit" "pamelo" "passionfruit"
#sampling with replacement
sample(fruit, size = 10, replace = TRUE)
## [1] "clementine" "coconut" "physalis" "tangerine" "pineapple"
## [6] "ugli fruit" "star fruit" "orange" "blueberry" "watermelon"
#sampling with seed = 1989
set.seed(1989)
sample(fruit, size = 13)
## [1] "salal berry" "purple mangosteen" "durian"
## [4] "rambutan" "breadfruit" "currant"
## [7] "tamarillo" "blackberry" "raisin"
## [10] "dragonfruit" "chili pepper" "avocado"
## [13] "mango"
set.seed(1995)
sinValues <- rnorm(n=100000, mean = 0, sd = 1)
quantile(sinValues, probs = .5)
## 50%
## -0.004720696
#value of 97.5th percentile
quantile(sinValues, probs = .975)
## 97.5%
## 1.958377
wtimes<- tibble(bus1 = runif(n = 100000, min = 0, max = 20))
mean(wtimes$bus1)
## [1] 9.983236
median(wtimes$bus1)
## [1] 9.983259
#visualize the dist
wtimes %>% ggplot(aes(x = bus1)) +
geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
labs(title = "wait time for the bus",
x = "Time in minutes",
y = "Frequency") +
theme_bw()
wtimes <- wtimes %>% mutate(patrickTimes = runif(n = 100000, min = 0, max = 30), totalTime = patrickTimes + 10 + bus1)
#patricks waiting time
wtimes %>% ggplot(aes(x = patrickTimes)) +
geom_histogram(bins = 20, color = "black", fill = "#f5a3a5") +
labs(title = "Patricks wait time for the bus",
x = "Time in minutes",
y = "Frequency") +
theme_bw()
wtimes %>% ggplot(aes(x = totalTime)) +
geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
labs(title = "total wait time for the bus",
x = "Time in minutes",
y = "Frequency") +
theme_bw()
wtimes %>% pull(totalTime) %>% median()
## [1] 34.96549
lowerbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .025)
upperbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .975)
we are 95% confident that spongebobs total waiting time will be between 15.43 and 54.4 minutes till he is on the bus to Krusty Krabs with Patrick.
#permutation
# Simulating participant blood-pressure changes
set.seed(1994)
n1 <- 8
n2 <- 8
trialData <- tibble(Participant = 1:(n1 + n2),
Treatment = rep(c("Drug", "Placebo"), each = c(n1, n2)),
Change = c(rcauchy(n = n1, location = -3.5, scale = 5),
rcauchy(n = n2, location = 0, scale = 1)))
# Two-sample t-test
tResult <- t.test(x = trialData$Change[which(trialData$Treatment == "Drug")],
y = trialData$Change[which(trialData$Treatment == "Placebo")],
alternative = "less")
# Plotting rain cloud plots showing differences between groups
trialData %>% ggplot(aes(x = Treatment, y = Change)) +
ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) +
geom_boxplot(width = .3, outlier.shape = NA) +
ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
color = "black") +
coord_flip() +
labs(y = "Change in systolic blood-pressure (mmHg)",
title = "Comparison of blood-pressure changes") +
theme_bw() +
theme(legend.position = "none")
# Generating all permutations
perms <- combn(1:(n1 + n2), m = n1)
# Calculating number of permutations
nperms <- choose(n1 + n2, n1)
# Instantiating vector for test statistics
permTs <- vector(length = nperms)
# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
permTs[p] <- t.test(x = trialData$Change[perms[, p]],
y = trialData$Change[-c(perms[, p])])$statistic
}
#Create a histogram displaying the null distribution obtained for the permutation test. What is the distribution centered at?
tibble(t = permTs) %>% ggplot(aes(x = t)) +
geom_histogram(color = "white") +
geom_vline(xintercept = quantile(permTs, probs = 0.05), color = "red", linetype = "dotted") +
geom_vline(xintercept = tResult$statistic, color = "blue", linetype = "solid") +
labs(title = "permitation null distribution",
y = "frequency") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
permPvalue <- mean(permTs <= tResult$statistic)
There are 12870 was to arrange the participants between the two groups
since the the p-value of 0.0404817 is less than .05 we reject the null hypothisis
##bootstapping
# Simulating data from distribution
set.seed(1989)
n <- 30
observed <- rchisq(n = n, df = 4) + rnorm(n = n, mean = 0, sd = 1)
#Create a raincloud plot to visualize the simulated data. Describe the data in terms of its number of peaks (e.g., unimodal, bimodal, etc.) and its symmetry.
tibble(Value = observed) %>% ggplot(aes(y = Value)) +
ggdist::stat_halfeye(ajust = .5, width = 2*.3, .width = c(0.5, 1)) +
geom_boxplot(width = .3, outlier.shape = NA) +
ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1, color = "black") +
coord_flip() +
labs(y = "Value",
title = "simulated values from non-normal distribution") +
theme_bw() +
theme(legend.position = "none")
#sample median
median(observed)
## [1] 3.448628
# Number of bootstrap samples
B <- 10000
# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = n, ncol = B)
# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- observed[sample(1:n, size = n, replace = TRUE)]
}
#Using the generated bootstrap samples, create a bootstrap distribution of sample medians, and visualize this distribution using a histogram
# Instantiating matrix for bootstrap samples
bootsMedian <- vector(length = 8)
# Sampling with replacement B times
for(b in 1:B) {
bootsMedian[b] <- median(boots[, b])
}
tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
geom_histogram(color = "white") +
labs(title = "bootstrap distribution",
y = "frequency") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#nonparamatric estimate of the SE of the sample median
SEestimate <- sd(bootsMedian)
SEestimate
## [1] 0.9040296
#95% confidance interval
lowerBound <- quantile(bootsMedian, probs = .025)
upperBound <- quantile(bootsMedian, probs = .975)
We are 95% confidant that the true median is between 2.0945482 and 5.0719181
tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
geom_histogram(color = "white") +
geom_vline(xintercept = lowerBound, color = "blue", linetype = "solid") +
geom_vline(xintercept = upperBound, color = "blue", linetype = "solid") +
labs(title = "bootstrap distribution",
y = "frequency") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##parametric bootstrap
# Number of bootstrap samples
B <- 10000
# Instantiating matrix for bootstrap samples
paraBoots <- matrix(NA, nrow = n, ncol = B)
xBar <- mean(observed)
s <- sd(observed)
# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- rnorm(n = n, mean = xBar, sd = s)
}
bootParaMedian <- vector(length = 8)
for(b in 1:B) {
bootParaMedian[b] <- median(paraBoots[, b])
}
#codes not working
#tibble(Median = bootParaMedian) %>% ggplot(aes(x = Median)) +
#geom_histogram(color = "white") +
#labs(title = "non parametric bootstrap distribution",y = "frequency") +
#theme_bw()
#SEestimatePara <- sd(bootParaMedian)
#SEestimatePara
#95% confidance interval
#lowerBoundPara <- quantile(bootParaMedian, probs = .025)
#upperBoundPara <- quantile(bootParaMedian, probs = .975)