Hurricane News Pre-Test Results

# first column read as string
videos_df <- read.csv("data/tikapi/samples/sample2download_102724_with_stats_summary.csv", colClasses = c(id = "character"))
outcome_selfcollect <- read.csv("data/tikapi/stats/hurricane_news/overall_trendpop_stats_selfcollect.csv", colClasses = c(id = "character"))
follower_history <- read.csv("data/tikapi/stats/hurricane_news/overall_follower_history_trendpop.csv")

Preprocess

# remove punctuations
videos_df$containsSummary <- gsub("[[:punct:]]", "", videos_df$containsSummary)
videos_df$containsSummary <- gsub(" ", "", videos_df$containsSummary)
videos_df$containsSummary <- ifelse(videos_df$containsSummary == "Yes", 1, 0)
videos_df$pre_post <- ifelse(videos_df$created_date <= "2024-10-15", "pre", "post")
videos_df$pre_post <- factor(videos_df$pre_post, levels = c("pre", "post"))
control_creators <- c("itzzzalexxx1", "news5128", "highk3k41", "andrewmarkowitz96", "traviswingate", "xero_chaos", "news.everyday1")
videos_df$treatment <- ifelse(videos_df$profile_unique_id == "cbseveningnews", "control", 
                                    ifelse(videos_df$profile_unique_id %in% control_creators, "control", "treated"))
videos_df$treatment <- factor(videos_df$treatment, levels = c("control", "treated"))
response_creators <- c("chefsreviews", "kingtretainment", "mjwhomes")
videos_df$responded <- ifelse(videos_df$profile_unique_id %in% response_creators, 1, 0)
influencer_no_video <- videos_df %>% 
  group_by(profile_unique_id, pre_post) %>% 
  summarise(num_videos = n()) %>%
  spread(pre_post, num_videos) %>%
  filter(is.na(pre) | is.na(post)) %>%
  pull(profile_unique_id)

df_long <- videos_df %>% 
  filter(!profile_unique_id %in% influencer_no_video)

treated_creators <- df_long %>% 
  filter(treatment == "treated") %>% 
  pull(profile_unique_id)

control_creators <- df_long %>% 
  filter(treatment == "control") %>% 
  pull(profile_unique_id)
outcome_sc <- outcome_selfcollect %>% 
  filter(created_date %in% c("2024-10-21", "2024-11-04")) %>% 
  dplyr::select(c("id", "profile_unique_id", "followers_latest", "average_views_latest", "average_likes_latest", "average_comments_latest", "created_date"))
outcome_sc$followers_latest <- as.numeric(outcome_sc$followers_latest)
outcome_sc$average_views_latest <- as.numeric(outcome_sc$average_views_latest)
outcome_sc$average_likes_latest <- as.numeric(outcome_sc$average_likes_latest)
outcome_sc$average_comments_latest <- as.numeric(outcome_sc$average_comments_latest)

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "followers_latest") %>% 
  mutate(follower_growth = `2024-11-04` - `2024-10-21`,
         follower_growth_perc = follower_growth / `2024-10-21`) %>%
  rename("follower_level_post0" = `2024-10-21`,
         "follower_level_post1" = `2024-11-04`) -> outcome_sc_follower

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_views_latest") %>% 
  mutate(views_growth = `2024-11-04` - `2024-10-21`,
         views_growth_perc = views_growth / `2024-10-21`) %>%
  rename("views_level_post0" = `2024-10-21`,
         "views_level_post1" = `2024-11-04`) -> outcome_sc_views

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_likes_latest") %>% 
  mutate(likes_growth = `2024-11-04` - `2024-10-21`,
         likes_growth_perc = likes_growth / `2024-10-21`) %>%
  rename("likes_level_post0" = `2024-10-21`,
         "likes_level_post1" = `2024-11-04`) -> outcome_sc_likes

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_comments_latest") %>% 
  mutate(comments_growth = `2024-11-04` - `2024-10-21`,
         comments_growth_perc = comments_growth / `2024-10-21`) %>%
  rename("comments_level_post0" = `2024-10-21`,
         "comments_level_post1" = `2024-11-04`) -> outcome_sc_comments

# merge all
outcome_sc <- merge(outcome_sc_follower, outcome_sc_views, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_likes, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_comments, by = "profile_unique_id")
outcome_sc$treatment <- ifelse(outcome_sc$profile_unique_id %in% treated_creators, "treated", "control")

Descriptive Statistics

Outcome Statistics

Here we look at the summary statistics for the outcome variables: follower_growth, views_growth, likes_growth, and comments_growth.

The growth is calculated as the difference between the latest value collected on 11-04 and the latest value collected on 10-21. The intervention was done on 10-15.

It should be noted that for views, likes, and comments, this is a proxy as Trendpop as a statistic called “average_latest” for these three variables, so I’m using the difference between this average_latest collected on 10-21 and the average_latest collected on 11-04.

# summary stats on follower_growth and follower_growth_perc
outcome_sc %>% 
  summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
            sd_follow_growth = sd(follower_growth, na.rm = TRUE),
            `avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
            `sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE)) %>%
  pander()
avg_follow_growth sd_follow_growth avg_follow_growth_% sd_follow_growth_%
451.6 859.8 0.1241 0.2215
# summary stats on views_growth and views_growth_perc
outcome_sc %>% 
  summarise(avg_view_growth = mean(views_growth, na.rm = TRUE),
            sd_view_growth = sd(views_growth, na.rm = TRUE),
            `avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
            `sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE)) %>%
  pander()
avg_view_growth sd_view_growth avg_view_growth_% sd_view_growth_%
1226 42614 0.1296 0.9676
# summary stats on likes_growth and likes_growth_perc
outcome_sc %>% 
  summarise(avg_like_growth = mean(likes_growth, na.rm = TRUE),
            sd_like_growth = sd(likes_growth, na.rm = TRUE),
            `avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
            `sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE)) %>%
  pander()
avg_like_growth sd_like_growth avg_like_growth_% sd_like_growth_%
953.7 5576 0.02213 0.5707
# summary stats on comments_growth and comments_growth_perc
outcome_sc %>% 
  summarise(avg_com_growth = mean(comments_growth, na.rm = TRUE),
            sd_com_growth = sd(comments_growth, na.rm = TRUE),
            `avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
            `sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
  pander()
avg_com_growth sd_com_growth avg_com_growth_% sd_com_growth_%
-9.08 35.63 -0.07724 0.4694

By Treatment

outcome_sc %>% group_by(treatment) %>%
  summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
            sd_follow_growth = sd(follower_growth, na.rm = TRUE),
            `avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
            `sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE),
            avg_view_growth = mean(views_growth, na.rm = TRUE),
            sd_view_growth = sd(views_growth, na.rm = TRUE),
            `avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
            `sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE),
            avg_like_growth = mean(likes_growth, na.rm = TRUE),
            sd_like_growth = sd(likes_growth, na.rm = TRUE),
            `avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
            `sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE),
            avg_com_growth = mean(comments_growth, na.rm = TRUE),
            sd_com_growth = sd(comments_growth, na.rm = TRUE),
            `avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
            `sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
  pander()
Table continues below
treatment avg_follow_growth sd_follow_growth avg_follow_growth_%
control 91.33 132.5 0.1537
treated 654.3 1026 0.1075
Table continues below
sd_follow_growth_% avg_view_growth sd_view_growth avg_view_growth_%
0.2828 23327 61395 0.4312
0.1872 -11206 20910 -0.04009
Table continues below
sd_view_growth_% avg_like_growth sd_like_growth avg_like_growth_%
0.9953 3108 9209 0.335
0.9404 -257.9 423.4 -0.1539
Table continues below
sd_like_growth_% avg_com_growth sd_com_growth avg_com_growth_%
0.5606 -17.89 55.27 -0.07024
0.5115 -4.125 18.15 -0.08074
sd_com_growth_%
0.2713
0.5509

Focal Creator

outcome_sc %>% filter(profile_unique_id == "cbseveningnews") %>%
  dplyr::select(follower_growth, follower_growth_perc, views_growth, views_growth_perc, likes_growth, likes_growth_perc, comments_growth, comments_growth_perc) %>%
  pander()
Table continues below
follower_growth follower_growth_perc views_growth views_growth_perc
0 0 185754 0.2489
likes_growth likes_growth_perc comments_growth comments_growth_perc
27663 0.7405 -165 -0.1078

Feature Statistics

We look at the mean and standard deviation for the feature: containsSummary.

df_long %>% 
  summarise(overall_mean = mean(containsSummary, na.rm = TRUE),
            overall_sd = sd(containsSummary, na.rm = TRUE)) %>%
  pander()
overall_mean overall_sd
0.4747 0.4997
containsSummary_mean <- mean(df_long$containsSummary, na.rm = TRUE)
containsSummary_sd <- sd(df_long$containsSummary, na.rm = TRUE)

By Treatment and Pre-Post

df_long %>% 
  group_by(treatment, pre_post) %>% 
  summarise(mean = mean(containsSummary, na.rm = TRUE),
            sd = sd(containsSummary, na.rm = TRUE),
            se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
            n = n()) %>%
  pander()
treatment pre_post mean sd se n
control pre 0.6074 0.4901 0.04219 135
control post 0.6735 0.4714 0.04761 98
treated pre 0.4017 0.4909 0.0262 351
treated post 0.4175 0.4943 0.03444 206

Focal Creator

We look at the feature values for the focal creator.

df_long %>% filter(profile_unique_id == "cbseveningnews") %>%
  group_by(pre_post) %>%
  summarise(mean = mean(containsSummary, na.rm = TRUE),
            sd = sd(containsSummary, na.rm = TRUE),
            se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
            n = n()) %>%
  pander()
pre_post mean sd se n
pre 0.907 0.2939 0.04482 43
post 0.7222 0.4543 0.07571 36

Preliminary Regression

Panel Regression

We first estimate whether the feature containsSummary is associated with the outcome variables views. This is done using the entire dataset instead of separating into the pre and post periods.

lm(views~containsSummary, df_long) %>% summary()
## 
## Call:
## lm(formula = views ~ containsSummary, data = df_long)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
##  -316494  -309345   -96542   -83849 13483362 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        97299      42619   2.283 0.022695 *  
## containsSummary   219339      61858   3.546 0.000414 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 868200 on 788 degrees of freedom
## Multiple R-squared:  0.0157, Adjusted R-squared:  0.01446 
## F-statistic: 12.57 on 1 and 788 DF,  p-value: 0.0004145

We then estimate whether we were able to move the non-viral creators to be more like the focal creator by measuring whether contains summary increased for the treated creators.

panderOptions('round',4)
# round to 3 digits
model_p_t <- feols(containsSummary ~ pre_post | profile_unique_id, 
   data = df_long %>% filter(treatment == "treated"))

model_p_t %>% 
  summary() 
## OLS estimation, Dep. Var.: containsSummary
## Observations: 557
## Fixed-effects: profile_unique_id: 16
## Standard-errors: Clustered (profile_unique_id) 
##              Estimate Std. Error  t value Pr(>|t|) 
## pre_postpost 0.013726   0.060035 0.228629  0.82224 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.427024     Adj. R2: 0.222401
##                  Within R2: 2.318e-4

We run an interaction regression here: containsSummary ~ pre_post * treatment

model_p <- feols(containsSummary ~ pre_post * treatment,
   data = df_long)

model_p %>% summary()
## OLS estimation, Dep. Var.: containsSummary
## Observations: 790
## Standard-errors: IID 
##                                Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)                    0.607407   0.042114 14.422774  < 2.2e-16 ***
## pre_postpost                   0.066062   0.064938  1.017315 3.0932e-01    
## treatmenttreated              -0.205698   0.049556 -4.150823 3.6748e-05 ***
## pre_postpost:treatmenttreated -0.050296   0.077855 -0.646018 5.1846e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.488085   Adj. R2: 0.040995

Cross-Sectional

We aggregate to the creator level given our outcome variables are at the creator level.

generate_wide_data <- function(df_long){
  # create wide data by aggregating
  df_wide <- df_long %>% 
    group_by(profile_unique_id, pre_post, treatment, responded) %>% 
    summarise(mean_summary = mean(containsSummary, na.rm = TRUE),
              num_video = n())
  
  post_data <- df_wide %>% 
    filter(pre_post == "post") %>% 
    ungroup() %>%
    dplyr::select(-pre_post)
  
  pre_data <- df_wide %>%
    filter(pre_post == "pre") %>% 
    ungroup() %>%
    dplyr::select(-pre_post)
  
  df_wide <- merge(pre_data, post_data, 
                   by = c("profile_unique_id", "treatment", "responded"), 
                   suffixes = c("_pre", "_post"), all.x = TRUE, all.y = TRUE)
  # round to 3 digits
  df_wide <- df_wide %>% mutate_if(is.numeric, ~round(., 3))
  return (df_wide)
    
}

df_wide  <- generate_wide_data(df_long)
df_wide <- merge(df_wide, outcome_sc, by = c("profile_unique_id", "treatment"))

Average Contains Summary ~ Treatment

df_wide$mean_summary_diff <- df_wide$mean_summary_post - df_wide$mean_summary_pre
lm(mean_summary_diff ~ treatment + num_video_pre + num_video_post + follower_level_post0, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0675 0.1241 -0.544 0.5935
treatmenttreated -0.0227 0.1151 -0.1968 0.8463
num_video_pre -0.0023 0.0067 -0.3458 0.7337
num_video_post 0.0098 0.0073 1.336 0.1991
follower_level_post0 0 0 -1.599 0.1282
Fitting linear model: mean_summary_diff ~ treatment + num_video_pre + num_video_post + follower_level_post0
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 0.224 0.2068 0.0201

{Outcome} ~ Average Contains Summary

Follower Growth

follower_growth_perc_mean <- mean(df_wide$follower_growth_perc, na.rm = TRUE)
follower_growth_perc_sd <- sd(df_wide$follower_growth_perc, na.rm = TRUE)
lm(follower_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 533.3 195.7 2.725 0.013
mean_summary_diff 671.7 877 0.7659 0.4527
Fitting linear model: follower_growth ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 909.5 0.0285 -0.0201
lm(follower_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1452 0.0482 3.013 0.0069
mean_summary_diff 0.3642 0.216 1.687 0.1072
Fitting linear model: follower_growth_perc ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 0.224 0.1245 0.0807

Views Growth

lm(views_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -796.9 9674 -0.0824 0.9352
mean_summary_diff -52910 43360 -1.22 0.2366
Fitting linear model: views_growth ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 44965 0.0693 0.0228
lm(views_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0186 0.1727 -0.1075 0.9155
mean_summary_diff -1.039 0.774 -1.343 0.1944
Fitting linear model: views_growth_perc ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 0.8027 0.0827 0.0368

Likes Growth

lm(likes_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 944.6 1291 0.7316 0.4729
mean_summary_diff -4616 5787 -0.7976 0.4345
Fitting linear model: likes_growth ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 6002 0.0308 -0.0176
lm(likes_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0087 0.1339 0.065 0.9488
mean_summary_diff -0.1677 0.6 -0.2796 0.7827
Fitting linear model: likes_growth_perc ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 0.6222 0.0039 -0.0459

Comments Growth

lm(comments_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -10.02 8.344 -1.201 0.2438
mean_summary_diff 9.978 37.4 0.2668 0.7923
Fitting linear model: comments_growth ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 38.78 0.0035 -0.0463
lm(comments_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0762 0.1043 -0.7309 0.4733
mean_summary_diff -0.3959 0.4674 -0.8471 0.407
Fitting linear model: comments_growth_perc ~ mean_summary_diff
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
22 0.4847 0.0346 -0.0136

Power Analysis (Simulation)

Intervention

Find non-top focal creators (5K – 250/500K followers) who recently received a viral video success - Point out to non-viral creators recently success and ask them to move towards - 2x2 Scenarios: a control group for non-viral creators and a control for viral creators

Note: all creators should be within the 5K – 250/500K followers’ range in this experiment - Rationale: we won’t be able to influence top-creators (i.e. >= 250/500K followers) - Need to decide the upper bound based on the creators we get

Outcome to measure

Data Tables

Non-viral creators

Columns

Focal creator table

Columns

Note: for now, we simulate data using only containsSummary feature and follower growth percentage outcome.

# average number of videos per creator group by pre post
# divided by 2 because of pre and post
avg_num_videos <- round(nrow(df_long) / length(unique(df_long$profile_unique_id)) / 2)

Simulation Procedure

  1. Generate non-viral creators data
    • Assign focal creators
    • Assign treatment
    • Generate features: binomial distribution for containsSummary with mean equals 0.4746835 from pilot data. Those in the treatment group will have a higher probability of containsSummary (0.1 as of now). This is generated as panel data first then aggregated to non-viral creator level to calculate the difference between containsSummary Pre and containsSummary Post.
    • Generate outcomes: normal distribution for follower growth percentage with mean equals 0.1342599 and standard deviation equals to 0.2335862 from pilot data. Those in the treatment group will have a fixed effect size of 0.1.
    • Note that for each treated focal creator, we assign half of the non-viral creators to the treatment group.
  2. Generate focal creators data
    • Assign treatment
    • Generate features: ignored for now.
    • Generate outcomes: normal distribution for follower growth percentage with mean equals 0.1342599 and standard deviation equals to 0.2335862 from pilot data. Those in the assigned group will have a fixed effect size of 0.005 times the number of non-viral creators in the treatment group. 0.005 means that if the non-focal creator has 100,000 followers, the focal creator will have 100,000 * 0.005 = 500 fewer new followers from each non-viral creator in the treatment group.
  3. Run Regressions and record p-value for Treatment coefficient
    • Run regression for non-viral creators
      • Difference in containsSummary_diff ~ Treatment
      • Follower growth percentage ~ Treatment
      • Follower growth percentage ~ Treatment + containsSummary_diff
    • Run regression for focal creators
      • Follower growth percentage ~ Treatment

Entire procedure is run 500 times to calculate the power of the test.

power_analysis <- function(n_nonviral, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size, n_run = 500){
  pval_diffs <- c()
  pval_outcomes <- c()
  pval_mediators <- c()
  pval_outcome_focal <- c()
  
  pval_diffs_long <- c()
  pval_outcomes_long <- c()
  pval_mediators_long <- c()
  
  for (i in 1:n_run){
    ####################################
    ######## treatment assignment ######
    ####################################
    treated_focal_creators <- sample(1:n_focal, n_focal/2, replace = FALSE)
    n_nonviral_per_focal <- n_nonviral / (n_focal / 2)
    # print(paste0("n_nonviral_per_focal: ", n_nonviral_per_focal))
    assigned_focal_creator = sample(rep(treated_focal_creators, each = n_nonviral_per_focal), n_nonviral, replace = FALSE)
    # each assigned focal creator should have half non_viral creators in treatment

    # check if this is an integer
    if (n_nonviral_per_focal %% 1 != 0){
      stop("n_nonviral should be divisible by n_focal")
    }
    treated_nonviral_creators <- c()
    for (i in treated_focal_creators){
      treated_nonviral_creators <- c(treated_nonviral_creators, sample(which(assigned_focal_creator == i), n_nonviral_per_focal/2, replace = FALSE))
    }
    
    ####################################
    ######## non-viral creators ########
    ####################################
    nonviral_creators <- data.frame(
      id = 1:n_nonviral,
      # even distribution
      assigned_focal_creator = assigned_focal_creator,
      follower_growth_perc = rnorm(n_nonviral, follower_growth_perc_mean, follower_growth_perc_sd)
    )
    nonviral_creators$treatment <- ifelse(nonviral_creators$id %in% treated_nonviral_creators, "treatment", "control")
    nonviral_creators$follower_growth_perc <- ifelse(nonviral_creators$treatment == "treatment", 
                                                     nonviral_creators$follower_growth_perc + follower_growth_effect_size, 
                                                     nonviral_creators$follower_growth_perc)
    
    
    ### generate long data
    nonviral_creators_videos_pre <- rpois(n_nonviral, avg_num_videos)
    nonviral_creators_videos_post <- rpois(n_nonviral, avg_num_videos)
    
    nonviral_creators_long_pre <- data.frame(
      # repeat each id by nonviral_creators_videos_pre times
      id = rep(1:n_nonviral, nonviral_creators_videos_pre),
      containsSummary = rbinom(sum(nonviral_creators_videos_pre), 1, containsSummary_mean),
      pre_post = "pre"
    )
    nonviral_creators_long_post <- data.frame(
      # repeat each id by nonviral_creators_videos_post times
      id = rep(1:n_nonviral, nonviral_creators_videos_post),
      pre_post = "post"
    )
    nonviral_creators_long_post$containsSummary = ifelse(nonviral_creators_long_post$id %in% treated_nonviral_creators,
                                                          rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean + feature_effect_size),
                                                          rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean))
    nonviral_creators_long <- rbind(nonviral_creators_long_pre, nonviral_creators_long_post)
    
  
    # merge
    nonviral_creators_long <- merge(nonviral_creators, nonviral_creators_long, by = "id")

    
    ### generate wide data
    nonviral_creators_wide <- nonviral_creators_long %>% 
      group_by(id, treatment, assigned_focal_creator, follower_growth_perc, pre_post) %>% 
      summarise(containsSummary = mean(containsSummary, na.rm = TRUE))
    
    nonviral_creators_wide <- reshape2::dcast(nonviral_creators_wide, 
                                              id + treatment + assigned_focal_creator + follower_growth_perc ~ pre_post, 
                                              value.var = "containsSummary")
    nonviral_creators_wide$containsSummary_diff <- nonviral_creators_wide$post - nonviral_creators_wide$pre
    
    nonviral_creators$treatment <- factor(nonviral_creators$treatment, levels = c("control", "treatment"))
    
    ### run regression
    model_diff <- feols(containsSummary_diff ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
    model_outcome <- feols(follower_growth_perc ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
    model_mediator <- feols(follower_growth_perc ~ treatment + containsSummary_diff | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
  
    pval_diffs <- c(pval_diffs, model_diff$coeftable[1, 4] %>% as.numeric())
    pval_outcomes <- c(pval_outcomes, model_outcome$coeftable[1, 4] %>% as.numeric())
    pval_mediators <- c(pval_mediators, model_mediator$coeftable[1, 4] %>% as.numeric())
    
    # model_diff_long <- feols(containsSummary ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
    # model_outcome_long <- feols(follower_growth_perc ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
    # model_mediator_long <- feols(follower_growth_perc ~ treatment + containsSummary | id + assigned_focal_creator, data = nonviral_creators_long)
    # 
    # pval_diffs_long <- c(pval_diffs_long, model_diff_long$coeftable[1, 4] %>% as.numeric())
    # pval_outcomes_long <- c(pval_outcomes_long, model_outcome_long$coeftable[1, 4] %>% as.numeric())
    # pval_mediators_long <- c(pval_mediators_long, model_mediator_long$coeftable[1, 4] %>% as.numeric())
    
    ################################
    ######## focal creators ########
    ################################
    focal_creators <- data.frame(
      id = 1:n_focal,
      # containsSummary_pre = rbinom(n_focal, 1, containsSummary_mean),
      # containsSummary_post = rbinom(n_focal, 1, containsSummary_mean),
      follower_growth_perc = rnorm(n_focal, follower_growth_perc_mean, follower_growth_perc_sd),
      follower_level = runif(n_focal, 5e4, 2.5e5) %>% round()
    )
    focal_creators$treatment <- ifelse(focal_creators$id %in% treated_focal_creators, "assigned", "notassigned")
    focal_creators$follower_growth_perc <- ifelse(focal_creators$treatment == "assigned", 
                                                  focal_creators$follower_growth_perc + follower_decline_effect_size * n_nonviral_per_focal/2, 
                                                  focal_creators$follower_growth_perc)
    
    focal_creators$treatment <- factor(focal_creators$treatment, levels = c("notassigned", "assigned"))
    model_outcome_focal <- lm(follower_growth_perc ~ treatment, data = focal_creators)
    pval_outcome_focal <- c(pval_outcome_focal, summary(model_outcome_focal)$coefficients[2, 4])
    
  }
  
  power_diffs <- mean(pval_diffs <= 0.05, na.rm = T)
  power_outcomes <- mean(pval_outcomes <= 0.05, na.rm = T)
  power_mediators <- mean(pval_mediators <= 0.05, na.rm = T)
  power_outcome_focal <- mean(pval_outcome_focal <= 0.05, na.rm = T)
  return (list(power_diffs = power_diffs, power_outcomes = power_outcomes, power_mediators = power_mediators, power_outcome_focal = power_outcome_focal))

}
# calculate time
start_time <- Sys.time()
feature_effect_size <- 0.1
# assume fixed effect size across all creators
follower_growth_effect_size <- 0.1

# on average how each non-viral creator affects focal creators
follower_decline_effect_size <- 0.005

n_nonvirals_to_test <- seq(20, 100, 20) # this is per treated focal creator
n_focals_to_test <- seq(20, 80, 20)

powers <- data.frame(matrix(nrow = length(n_nonvirals_to_test) * length(n_focals_to_test), ncol = 6))
colnames(powers) <- c("n_nonviral_per_treated_focal", "n_focal", "power_diffs", "power_outcomes", "power_mediators", "power_outcome_focal")

i <- 1
for (n_nonviral in n_nonvirals_to_test){
  for (n_focal in n_focals_to_test){
    power_list <- power_analysis(n_nonviral * n_focal / 2, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size)
    powers[i, ] <- c(n_nonviral, n_focal, unlist(power_list))
    i <- i + 1
  }
}
end_time <- Sys.time()
print(paste0("Time taken to run simulation: ", end_time - start_time))
## [1] "Time taken to run simulation: 8.92246528466543"

Note that as of right now, we randomize half of the focal creators to the control group and half of the focal creators to the treatment group

For example, if n_focal = 80, 40 are randomized into the control group.

# plot powers
melt(powers, id.vars = c("n_nonviral_per_treated_focal", "n_focal")) %>%
  # rename variable
  mutate(variable = case_when(
    variable == "power_diffs" ~ "Difference in containsSummary",
    variable == "power_outcomes" ~ "Follower Growth Percentage",
    variable == "power_mediators" ~ "Follower Growth Percentage &\ncontainsSummary",
    variable == "power_outcome_focal" ~ "Focal Creator Follower Growth Percentage"
  )) %>%
  ggplot(aes(x = n_nonviral_per_treated_focal, y = value, color = factor(n_focal))) +
  geom_line() +
  geom_point(size = 1) +
  # horizontal line at 0.8
  geom_hline(yintercept = 0.8, linetype = "dashed") +
  facet_wrap(~variable, scales = "free") +
  # set yscale to be 0 to 1 for all
  scale_y_continuous(limits = c(0, 1)) +
  labs(title = "Power Analysis",
       subtitle = "Treatment ~ Variables (with variables name shown on each facet)",
       x = "Number of Non-Viral Creators per Treated Focal Creator",
       y = "Power") +
  theme_minimal()