library(ggplot2)
library(reshape2) 
#' @name assign_vector
#' @param data A vector of data to test for normality.
#' @param n An integer indicating the number of Shapiro-Wilk tests to perform for each sample size. Default is 1000.
#' @return A data frame of p-values in "tall" format, one row per test.
assign_vector <- function(data, n = 1000) {
  # replicate the call to shapiro.test n times to build up a vector of p-values
  p.5 <- replicate(n=n, expr=shapiro.test(sample(data, 5, replace=TRUE))$p.value)
  p.10 <- replicate(n=n, expr=shapiro.test(sample(data, 10, replace=TRUE))$p.value)
  p.1000 <- replicate(n=n, expr=shapiro.test(sample(data, 1000, replace=TRUE))$p.value)
  # Combine the data into a data frame,
  # one column for each number of samples tested.
  p.df <- cbind(p.5, p.10, p.1000)
  p.df <- as.data.frame(p.df)
  colnames(p.df) <- c("5 samples","10 samples","1000 samples")
  # Put the data in "tall" format, one column for number of samples
  # and one column for the p-value.
  p.df.m <- melt(p.df)
  # Make sure the levels are sorted correctly.
  p.df.m <- transform(p.df.m, variable = factor(variable, levels = c("5 samples","10 samples","1000 samples")))
  return(p.df.m)  
}
n.rand <- 100000
n.test <- 10000
my.data <- rnorm(n.rand)
p.df.m <- assign_vector(my.data, n = n.test)
## No id variables; using all as measure variables
# Plot the p-values
ggplot(p.df.m, aes(x = value)) + 
  geom_histogram(binwidth = 1/10) + 
  facet_grid(facets=variable ~ ., scales="free_y") + 
  xlim(0,1) +
  ylab("Count of p-values") +
  xlab("p-values") +
  theme(text = element_text(size = 13))

# Compare the normal distribution to the t-distribution (df = 20). What do you notice?
ggplot(NULL, aes(x=x, colour = distribution)) + 
  stat_function(fun=dnorm, data = data.frame(x = c(-6,6), distribution = factor(1))) + 
  stat_function(fun=dt, args = list( df = 20), data = data.frame(x = c(-6,6), distribution = factor(2)), linetype = "dashed") + 
  scale_colour_manual(values = c("blue","red"), labels = c("Normal","T-Distribution"))

my.data <- rt(n.rand, df = 20)
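# Presumably the same p-value experiment is re-run on the t-distributed data at
# this point; a minimal sketch following the pattern used for the normal data
# above (assign_vector and the plotting call are unchanged):
p.df.m <- assign_vector(my.data, n = n.test)
ggplot(p.df.m, aes(x = value)) + 
  geom_histogram(binwidth = 1/10) + 
  facet_grid(facets = variable ~ ., scales = "free_y") + 
  xlim(0, 1) +
  ylab("Count of p-values") +
  xlab("p-values") +
  theme(text = element_text(size = 13))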

Testing the tails

my.data <- rt(n.rand, df = 20)
my.data.2 <- rnorm(n.rand)
# Trim off the tails
my.data <- my.data[which(my.data < 3 & my.data > -3)]
# Add in tails from the other distribution
my.data <- c(my.data, my.data.2[which(my.data.2 < -3 | my.data.2 > 3)])
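# The mixed data (t-distributed center with normal tails) would presumably be
# tested here before my.data is reassigned below; a sketch using the same pattern:
p.df.m <- assign_vector(my.data, n = n.test)
# Plot p.df.m exactly as above (histogram of p-values, faceted by sample size).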
my.data <- rnorm(n.rand)
my.data.2 <- rt(n.rand, df = 20)
# Trim off the tails
my.data <- my.data[which(my.data < 3 & my.data > -3)]
# Add in tails from the other distribution
my.data <- c(my.data, my.data.2[which(my.data.2 < -3 | my.data.2 > 3)])
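# Likewise, the normal data with t-distributed tails would presumably be tested
# here before moving on to the lognormal case:
p.df.m <- assign_vector(my.data, n = n.test)
# Plot p.df.m as above.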
my.data <- rlnorm(n.rand, 0, 0.4)
p.df.m <- assign_vector(my.data, n = n.test)
## No id variables; using all as measure variables
ggplot(p.df.m, aes(x = value)) +
 geom_histogram(binwidth = 1/10) +
 facet_grid(facets=variable ~ ., scales="free_y") +
 xlim(-.1,1) +
 ylab("Count of p-values") +
 xlab("p-values") +
 theme(text = element_text(size = 13))
## Warning: Removed 6 rows containing missing values (geom_bar).

my.data <- rlnorm(n.rand, 0, 0.4)
hist(my.data)

p.df.m <- assign_vector(my.data, n = n.test)
## No id variables; using all as measure variables
ggplot(p.df.m, aes(x = value)) +
 geom_histogram(binwidth = 1/10) +
 facet_grid(facets=variable ~ ., scales="free_y") +
 xlim(-.1,1) +
 ylab("Count of p-values") +
 xlab("p-values") +
 theme(text = element_text(size = 16))
## Warning: Removed 6 rows containing missing values (geom_bar).

Discussion

After evaluating the results from the experiment, I noticed that with small sample sizes the data looks normally distributed (the Shapiro-Wilk test rarely rejects), and that the test is very sensitive to what goes on in the tails. The Shapiro-Wilk test still passed the skewed data, and the data before we cut off the tails; however, once we cut off the tails, or use large sample sizes, we get many more failed normality tests. The main thing I learned was that the data actually looks less normal when you trim the tails with larger sample sizes, but looks normally distributed with smaller sample sizes. Here's the thing: in the tutorial, the author mentioned that we shouldn't get too hung up on whether our data is normally distributed, yet she emphasized the importance of trimming the tails and cleaning the data. Maybe she was emphasizing this for larger sample sizes, because even when we trimmed the tails, the data was skewed? I'll admit that sounded like the author had a bit of cognitive dissonance going on, or maybe I missed the point.