Load library

library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Investigation of statistical power when log-transforming looking time

# MatVect <- c(1,.38,.38,1)*.298^2 
# Sigma <- matrix(MatVect,2,2)
# mu <- c(1,1) 
set.seed(1)
N_SIM <- 10000
M_NORMAL <- 10
SD_NORMAL <- 1
OUTLIER_PARAMETER <- 2.5

Main simulation function

https://en.wikipedia.org/wiki/Log-normal_distribution

simulate_paired_t <- function (df) {
  # draw data
  if (df$base_distribution == "lognormal") {
    lt_1 <- rlnorm(n = df$n_subs, 
                 meanlog = log((M_NORMAL^2) / sqrt(SD_NORMAL^2 + M_NORMAL^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))
    lt_2 <- rlnorm(n = df$n_subs, 
                 meanlog = log(((M_NORMAL + df$true_effect)^2) / sqrt(SD_NORMAL^2 + (M_NORMAL + df$true_effect)^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2))))
    
  } else if (df$base_distribution == "lognormal_outlier") {
    lt_1 <- rlnorm(n = df$n_subs, 
                 meanlog = log((M_NORMAL^2) / sqrt(SD_NORMAL^2 + M_NORMAL^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))
    outlier <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
  lt_1[outlier] <- (abs(lt_1[outlier]) + 3*sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))*sign(lt_1[outlier])
    
    lt_2 <- rlnorm(n = df$n_subs, 
                 meanlog = log(((M_NORMAL + df$true_effect)^2) / sqrt(SD_NORMAL^2 + (M_NORMAL + df$true_effect)^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2))))
    outlier2 <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
  lt_2[outlier2] <- (abs(lt_2[outlier2]) + 3*sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2)))) * sign(lt_2[outlier2])

  } else if (df$base_distribution == "normal") {
    lt_1 <- rnorm(n = df$n_subs, 
                 mean = M_NORMAL, sd = SD_NORMAL) 
    
    lt_2 <- rnorm(n = df$n_subs, 
                 mean = M_NORMAL + df$true_effect, sd = SD_NORMAL) 
  
  } else if (df$base_distribution == "normal_outlier") {
     lt_1 <- rnorm(n = df$n_subs, 
                 mean = M_NORMAL, sd = SD_NORMAL) 
    outlier <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
  lt_1[outlier] <- (abs(lt_1[outlier]) + 3*SD_NORMAL) * sign(lt_1[outlier])
    
    lt_2 <- rnorm(n = df$n_subs, 
                 mean = M_NORMAL + df$true_effect, sd = SD_NORMAL) 
    outlier2 <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
  lt_2[outlier2] <- (abs(lt_2[outlier2]) + 3*SD_NORMAL) * sign(lt_2[outlier2])
    # 90% of distribution is rnorm as above
    # 10% of distribtion is outliers - 4x the standard deviation, mean of zero (i.e, 2 outliers)
    }
  
  # transform
  # trim outliers
  if (df$trim_before_transform == "trim first") {
    if (df$trim_outliers == "trimmed") {
      m_lt <- mean(c(lt_1, lt_2))
      sd_lt <- sd(c(lt_1, lt_2))
      
      lt_1 <- lt_1[abs(lt_1) < m_lt + OUTLIER_PARAMETER * sd_lt]
      lt_2 <- lt_2[abs(lt_2) < m_lt + OUTLIER_PARAMETER * sd_lt]
    }
    
    if (df$log_transform == "log transform") {
      lt_1 <- log(lt_1)
      lt_2 <- log(lt_2)
    }
    
    if (df$log_transform == "untransformed") {
      lt_1 <- lt_1
      lt_2 <- lt_2
    }
  } else {
     if (df$trim_before_transform == "transform first") {
       if (df$log_transform == "log transform") {
         lt_1 <- log(lt_1)
         lt_2 <- log(lt_2)
       }
      
       if (df$trim_outliers == "trimmed") {
      m_lt <- mean(c(lt_1, lt_2))
      sd_lt <- sd(c(lt_1, lt_2))
      
      lt_1 <- lt_1[abs(lt_1) < m_lt + OUTLIER_PARAMETER * sd_lt]
      lt_2 <- lt_2[abs(lt_2) < m_lt + OUTLIER_PARAMETER * sd_lt]
       }
     } else if (df$trim_before_transform == "untransformed") {
      if (df$trim_outlier == "trimmed") {
        m_lt <- mean(c(lt_1, lt_2))
      sd_lt <- sd(c(lt_1, lt_2))
      
      lt_1 <- lt_1[abs(lt_1) < m_lt + OUTLIER_PARAMETER * sd_lt]
      lt_2 <- lt_2[abs(lt_2) < m_lt + OUTLIER_PARAMETER * sd_lt]
      }
     }
  }
    
  
  t_test <- t.test(lt_1, lt_2, paired = FALSE)
  
  df$n_included <- length(c(lt_1, lt_2))
  df$p_val <- t_test$p.value
  
  return(df)
}

Conduct simulations

sims <- expand.grid(n = 1:N_SIM, 
                    n_subs = 20, 
                    base_distribution = c("lognormal","normal", "normal_outlier", "lognormal_outlier"),
                    log_transform = c("log transform", "untransformed"), 
                    trim_outliers = c("trimmed", "untrimmed"),
                    trim_before_transform = c("trim first", "transform first"), 
                    true_effect = c(0, .2, .5, .8))
sims <- sims %>%
  mutate(idx = 1:n()) %>%
  split(.$idx) %>%
  map_df(simulate_paired_t)

Visualise

Distribution without outlier

sim_summary_no_outlier <- sims %>%
  filter(base_distribution %in% c("lognormal", "normal")) %>% 
  group_by(base_distribution, log_transform, 
           trim_before_transform, trim_outliers, true_effect) %>%
  summarise(p_lt_05 = mean(p_val < .05)) 

ggplot(sim_summary_no_outlier, 
       aes(x = true_effect, y = p_lt_05, 
           col = trim_outliers, lty = base_distribution)) + 
  geom_line() + 
  facet_grid(log_transform ~ trim_before_transform) + 
  geom_hline(yintercept = .05, lty = 2, col = "black") 

Distribution with outlier

sim_summary_outlier <- sims %>%
  filter(base_distribution %in% c("lognormal_outlier", "normal_outlier")) %>% 
  group_by(base_distribution, log_transform, 
           trim_before_transform, trim_outliers, true_effect) %>%
  summarise(p_lt_05 = mean(p_val < .05)) 
  
ggplot(sim_summary_outlier, 
       aes(x = true_effect, y = p_lt_05, 
           col = trim_outliers, lty = base_distribution)) + 
  geom_line() + 
  facet_grid(log_transform ~ trim_before_transform) + 
  geom_hline(yintercept = .05, lty = 2, col = "black") 

False positive rate

fp_sim_summary <- sims %>%
  filter(true_effect == 0) %>%
  group_by(base_distribution, log_transform, 
           trim_before_transform, trim_outliers, true_effect) %>%
  summarise(p_lt_05 = mean(p_val < .05)) 

ggplot(fp_sim_summary, 
       aes(x = base_distribution, y = p_lt_05, 
           fill = trim_outliers)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  facet_grid(log_transform ~ trim_before_transform) + 
  geom_hline(yintercept = .05, lty = 2, col = "black") + 
  coord_flip()

Exploring whether linearity issues will lead to wrong predictions

# MatVect <- c(1,.38,.38,1)*.298^2 
# Sigma <- matrix(MatVect,2,2)
# mu <- c(1,1) 
set.seed(2)
N_SIM <- 10000
M_NORMAL <- 10
SD_NORMAL <- 1

Main simulation function

simulate_pred_unpaired_t <- function (df) {
  if (df$base_distribution == "lognormal"){
   # draw data
    lt_1 <- rlnorm(n = df$n_subs, 
                   meanlog = log((M_NORMAL^2) / sqrt(SD_NORMAL^2 + M_NORMAL^2)), 
                   sdlog = sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))
    lt_2 <- rlnorm(n = df$n_subs, 
                 meanlog = log(((M_NORMAL + df$true_effect)^2) / sqrt(SD_NORMAL^2 + (M_NORMAL + df$true_effect)^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2))))
    
  } else if (df$base_distribution == "lognormal_outlier") {
     lt_1 <- rlnorm(n = df$n_subs, 
                 meanlog = log((M_NORMAL^2) / sqrt(SD_NORMAL^2 + M_NORMAL^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))
    outlier <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
    lt_1[outlier] <- (abs(lt_1[outlier]) + 3*sqrt(log(1 + (SD_NORMAL^2 / M_NORMAL^2))))*sign(lt_1[outlier])
    
    lt_2 <- rlnorm(n = df$n_subs, 
                 meanlog = log(((M_NORMAL + df$true_effect)^2) / sqrt(SD_NORMAL^2 + (M_NORMAL + df$true_effect)^2)), 
                 sdlog = sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2))))
    outlier2 <- sample(1:df$n_subs, df$n_subs*0.1, replace=FALSE)
  lt_2[outlier2] <- (abs(lt_2[outlier2]) + 3*sqrt(log(1 + (SD_NORMAL^2 / (M_NORMAL + df$true_effect)^2)))) * sign(lt_2[outlier2])

# calculate means and sd for predicted values    
  } 
  
if (df$log_transform == "untransform"){
    predict_value_un <- predict(lm((lt_2 - lt_1) ~ 1))
    df$predict_mean <- mean(predict_value_un)
    df$predict_sd <- sd(predict_value_un, na.rm = T)   
                            
  } else if (df$log_transform == "log transform"){
    predict_value_log <- predict(lm((log(lt_2)-log(lt_1)) ~ 1))
    df$predict_mean <- mean(exp(predict_value_log)) # need to anti-log the mean difference
    df$predict_sd <- sd(predict_value_log, na.rm = T)
    
  } 
  
  return(df)
}

Conduct simulations

sims_pred <- expand.grid(n = 1:N_SIM, 
                         n_subs = 20,
                         base_distribution = c("lognormal", "lognormal_outlier"),
                        log_transform = c("untransform", "log transform"), 
                        true_effect = c(0, .2, .5, .8))

sims_pred <- sims_pred %>%
  mutate(idx = 1:n()) %>%
  split(.$idx) %>%
  map_df(simulate_pred_unpaired_t)

Visualise

sim_summary_pred <- sims_pred %>%
  group_by(true_effect, log_transform, base_distribution) %>%
  summarise(prediction = mean(predict_mean)) 

ggplot(sim_summary_pred, 
       aes(x = true_effect, y = prediction, col = log_transform)) + 
  geom_line() +
  geom_abline(slope = 1, lty = 2, col = "black")+
  facet_wrap(. ~ base_distribution)