Activity 11

sampling fruits

#sampling 10 fruit
sample(fruit, size = 10)

##  [1] "raspberry"    "tamarillo"    "nut"          "mango"        "grape"       
##  [6] "kumquat"      "olive"        "kiwi fruit"   "pamelo"       "passionfruit"

#sampling with replacement
sample(fruit, size = 10, replace = TRUE)

##  [1] "clementine" "coconut"    "physalis"   "tangerine"  "pineapple" 
##  [6] "ugli fruit" "star fruit" "orange"     "blueberry"  "watermelon"

#sampling with seed = 1989
set.seed(1989)
sample(fruit, size = 13)

##  [1] "salal berry"       "purple mangosteen" "durian"           
##  [4] "rambutan"          "breadfruit"        "currant"          
##  [7] "tamarillo"         "blackberry"        "raisin"           
## [10] "dragonfruit"       "chili pepper"      "avocado"          
## [13] "mango"

set.seed(1995)
sinValues <- rnorm(n=100000, mean = 0, sd = 1)
quantile(sinValues, probs = .5)

##          50% 
## -0.004720696

#value of 97.5th percentile
quantile(sinValues, probs = .975)

##    97.5% 
## 1.958377

wtimes<- tibble(bus1 = runif(n = 100000, min = 0, max = 20))

mean(wtimes$bus1)

## [1] 9.983236

median(wtimes$bus1)

## [1] 9.983259

#visualize the dist
wtimes %>% ggplot(aes(x = bus1)) +
  geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
  labs(title = "wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes <- wtimes %>% mutate(patrickTimes = runif(n = 100000, min = 0, max = 30), totalTime = patrickTimes + 10 + bus1)

#patricks waiting time
wtimes %>% ggplot(aes(x = patrickTimes)) +
  geom_histogram(bins = 20, color = "black", fill = "#f5a3a5") +
  labs(title = "Patricks wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes %>% ggplot(aes(x = totalTime)) +
  geom_histogram(bins = 20, color = "black", fill = "#fff56c") +
  labs(title = "total wait time for the bus",
       x = "Time in minutes",
       y = "Frequency") +
  theme_bw()

wtimes %>% pull(totalTime) %>% median()

## [1] 34.96549

lowerbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .025)
upperbound <- wtimes %>% pull(totalTime) %>% quantile(probs = .975)

we are 95% confident that spongebobs total waiting time will be between 15.43 and 54.4 minutes till he is on the bus to Krusty Krabs with Patrick.

#permutation

# Simulating participant blood-pressure changes
set.seed(1994)
n1 <- 8
n2 <- 8
trialData <- tibble(Participant = 1:(n1 + n2),
                    Treatment = rep(c("Drug", "Placebo"), each = c(n1, n2)),
                    Change = c(rcauchy(n = n1, location = -3.5, scale = 5), 
                               rcauchy(n = n2, location = 0, scale = 1)))

# Two-sample t-test
tResult <- t.test(x = trialData$Change[which(trialData$Treatment == "Drug")], 
              y = trialData$Change[which(trialData$Treatment == "Placebo")],
              alternative = "less")

# Plotting rain cloud plots showing differences between groups
trialData %>% ggplot(aes(x = Treatment, y = Change)) +
  ggdist::stat_halfeye(adjust = .5, width = 2*.3, .width = c(0.5, 1)) + 
    geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1,
                    color = "black") +
    coord_flip() +
    labs(y = "Change in systolic blood-pressure (mmHg)",
         title = "Comparison of blood-pressure changes") + 
  theme_bw() +
  theme(legend.position = "none")

# Generating all permutations
perms <- combn(1:(n1 + n2), m = n1)

# Calculating number of permutations
nperms <- choose(n1 + n2, n1)

# Instantiating vector for test statistics
permTs <- vector(length = nperms)

# Calculating t-test statistic for each permutation
for(p in 1:nperms) {
  permTs[p] <- t.test(x = trialData$Change[perms[, p]], 
                      y = trialData$Change[-c(perms[, p])])$statistic
}

#Create a histogram displaying the null distribution obtained for the permutation test. What is the distribution centered at?
tibble(t = permTs) %>% ggplot(aes(x = t)) +
  geom_histogram(color = "white") +
  geom_vline(xintercept = quantile(permTs, probs = 0.05), color = "red", linetype = "dotted") +
  geom_vline(xintercept = tResult$statistic, color = "blue", linetype = "solid") +
  labs(title = "permitation null distribution",
       y = "frequency") +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

permPvalue <- mean(permTs <= tResult$statistic)

There are 12870 was to arrange the participants between the two groups

since the the p-value of 0.0404817 is less than .05 we reject the null hypothisis

##bootstapping

# Simulating data from distribution
set.seed(1989)
n <- 30
observed <- rchisq(n = n, df = 4) + rnorm(n = n, mean = 0, sd = 1)
#Create a raincloud plot to visualize the simulated data. Describe the data in terms of its number of peaks (e.g., unimodal, bimodal, etc.) and its symmetry.

tibble(Value = observed) %>% ggplot(aes(y = Value)) +
  ggdist::stat_halfeye(ajust = .5, width = 2*.3, .width = c(0.5, 1)) +
  geom_boxplot(width = .3, outlier.shape = NA) +
  ggdist::stat_dots(side = "left", dotsize = 6, justification = 1.05, binwidth = .1, color = "black") +
  coord_flip() +
  labs(y = "Value",
       title = "simulated values from non-normal distribution") +
  theme_bw() +
  theme(legend.position = "none")

#sample median 
median(observed)

## [1] 3.448628

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
boots <- matrix(NA, nrow = n, ncol = B)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- observed[sample(1:n, size = n, replace = TRUE)]
}

#Using the generated bootstrap samples, create a bootstrap distribution of sample medians, and visualize this distribution using a histogram
# Instantiating matrix for bootstrap samples
bootsMedian <- vector(length = 8)

# Sampling with replacement B times
for(b in 1:B) {
bootsMedian[b] <- median(boots[, b])
}

tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
  geom_histogram(color = "white") +
  labs(title = "bootstrap distribution",
       y = "frequency") +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#nonparamatric estimate of the SE of the sample median

SEestimate <- sd(bootsMedian)
SEestimate

## [1] 0.9040296

#95% confidance interval
lowerBound <- quantile(bootsMedian, probs = .025)
upperBound <- quantile(bootsMedian, probs = .975)

We are 95% confidant that the true median is between 2.0945482 and 5.0719181

tibble(Median = bootsMedian) %>% ggplot(aes(x = Median)) +
  geom_histogram(color = "white") +
  geom_vline(xintercept = lowerBound, color = "blue", linetype = "solid") +
  geom_vline(xintercept = upperBound, color = "blue", linetype = "solid") +
  labs(title = "bootstrap distribution",
       y = "frequency") +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##parametric bootstrap

# Number of bootstrap samples
B <- 10000

# Instantiating matrix for bootstrap samples
paraBoots <- matrix(NA, nrow = n, ncol = B)
xBar <- mean(observed)
s <- sd(observed)

# Sampling with replacement B times
for(b in 1:B) {
boots[, b] <- rnorm(n = n, mean = xBar, sd = s)
}

bootParaMedian <- vector(length = 8)

for(b in 1:B) {
bootParaMedian[b] <- median(paraBoots[, b])
}

#codes not working 

#tibble(Median = bootParaMedian) %>% ggplot(aes(x = Median)) +
  #geom_histogram(color = "white") +
  #labs(title = "non parametric bootstrap distribution",y = "frequency") +
  #theme_bw()

#SEestimatePara <- sd(bootParaMedian)
#SEestimatePara
#95% confidance interval
#lowerBoundPara <- quantile(bootParaMedian, probs = .025)
#upperBoundPara <- quantile(bootParaMedian, probs = .975)

Activity 11

Sarah Stuck

2022-11-22

sampling fruits