Rewriting the earlier randtest function so that it can take a tibble as input. The tibble needs to contain the data we want to do the simulation on, with the right values in the Treatment column, so this is not infinitely flexible - it is just better than previous because:
# randtest parameters:
# tbbl: a tibble that must contain a "Treatment" column
# and a "Count" column
# reps: the number of simulations to run with the provided data (optional)
#
randtest <- function(tbbl, reps=10000) {
if (!("Treatment" %in% colnames(tbbl)) ||
!("Count" %in% colnames(tbbl))) {
print("Error: tibble does not contain expected columns: Treatment, Count")
return(0)
}
# Determine Control, Treated lengths and form the data vector
ctl_len <- filter(tbbl, Treatment == "Ctl") %>% count() %>% pull()
trt_len <- filter(tbbl, Treatment == "Trt") %>% count() %>% pull()
data <- arrange(tbbl, Treatment) %>% pull(Count)
# Calculate the measured difference in means of Control and Treated group
measured_mean_diff <-
tbbl %>%
group_by(Treatment) %>%
summarize(Mean = mean(Count)) %>%
summarize(Mean_Diff = Mean[1] - Mean[2]) %>%
pull()
# Initialize the results vector
results <- numeric(reps)
for (i in 1:reps) {
# Generate a random ordering of the actual data
simtemp <- sample(data)
# Calculate and store the difference in the means
# (control counts are the first n values, treatment counts are the rest)
results[i] <- mean(simtemp[1:ctl_len]) - mean(simtemp[(ctl_len + 1):(ctl_len + trt_len)])
}
# Convert results to a true/false array indicating for
# which trial the mean difference is greater than the
# actual measured mean difference
# (is more extreme than the measured difference)
results_truth <-
results >= measured_mean_diff
# Summing over a true/false array - R coerces TRUE to 1
# and FALSE to 0 to be able to mathematically sum!
# Count the number of such trials and divide by the
# total number of trials to get the probability (p-value).
greater_p <- sum(results_truth)/reps
# Return to caller a list containing the greater_p value (dbl),
# the measured difference in means, and the results vector
return(list(p_value = greater_p, measured = measured_mean_diff, data = results))
}
Read in the mice data and tidy it up in one swoop, using what we learned before.
c1_mice <- read_csv('./C1 Mice.csv') %>%
gather(key = Subgroup, value = Count) %>%
separate(Subgroup, c("Gender", "Treatment"), sep = '-')
## Parsed with column specification:
## cols(
## `Female-Trt` = col_double(),
## `Female-Ctl` = col_double(),
## `Male-Trt` = col_double(),
## `Male-Ctl` = col_double()
## )
Run the randtest simulation on each Gender subgroup
males <- randtest(filter(c1_mice, Gender == "Male"))
females <- randtest(filter(c1_mice, Gender == "Female"))
Plot the Males results
ggplot(mapping = aes(x = males$data)) +
geom_histogram(center = 0, binwidth = 2, fill = "blue") +
geom_vline(xintercept = males$measured) +
xlab("Mean Differences") +
ggtitle(paste0("Male Mice Simulation: p-value is ", males$p_value))
Plot the Females results
ggplot(mapping = aes(x = females$data)) +
geom_histogram(center = 0, binwidth = 2, fill = "darkgreen") +
geom_vline(xintercept = females$measured) +
xlab("Mean Differences") +
ggtitle(paste0("Female Mice Simulation: p-value is ", females$p_value))