Hurricane News Pre-Test Results

# first column read as string
videos_df <- read.csv("data/tikapi/samples/sample2download_102724_with_stats_summary.csv", colClasses = c(id = "character"))
outcome_selfcollect <- read.csv("data/tikapi/stats/hurricane_news/overall_trendpop_stats_selfcollect.csv", colClasses = c(id = "character"))
follower_history <- read.csv("data/tikapi/stats/hurricane_news/overall_follower_history_trendpop.csv")

Preprocess

# remove punctuations
videos_df$containsSummary <- gsub("[[:punct:]]", "", videos_df$containsSummary)
videos_df$containsSummary <- gsub(" ", "", videos_df$containsSummary)
videos_df$containsSummary <- ifelse(videos_df$containsSummary == "Yes", 1, 0)
videos_df$pre_post <- ifelse(videos_df$created_date <= "2024-10-15", "pre", "post")
videos_df$pre_post <- factor(videos_df$pre_post, levels = c("pre", "post"))
control_creators <- c("itzzzalexxx1", "news5128", "highk3k41", "andrewmarkowitz96", "traviswingate", "xero_chaos", "news.everyday1")
videos_df$treatment <- ifelse(videos_df$profile_unique_id == "cbseveningnews", "control", 
                                    ifelse(videos_df$profile_unique_id %in% control_creators, "control", "treated"))
videos_df$treatment <- factor(videos_df$treatment, levels = c("control", "treated"))
response_creators <- c("chefsreviews", "kingtretainment", "mjwhomes")
videos_df$responded <- ifelse(videos_df$profile_unique_id %in% response_creators, 1, 0)

influencer_no_video <- videos_df %>% 
  group_by(profile_unique_id, pre_post) %>% 
  summarise(num_videos = n()) %>%
  spread(pre_post, num_videos) %>%
  filter(is.na(pre) | is.na(post)) %>%
  pull(profile_unique_id)

df_long <- videos_df %>% 
  filter(!profile_unique_id %in% influencer_no_video)

treated_creators <- df_long %>% 
  filter(treatment == "treated") %>% 
  pull(profile_unique_id)

control_creators <- df_long %>% 
  filter(treatment == "control") %>% 
  pull(profile_unique_id)

outcome_sc <- outcome_selfcollect %>% 
  filter(created_date %in% c("2024-10-21", "2024-11-04")) %>% 
  dplyr::select(c("id", "profile_unique_id", "followers_latest", "average_views_latest", "average_likes_latest", "average_comments_latest", "created_date"))
outcome_sc$followers_latest <- as.numeric(outcome_sc$followers_latest)
outcome_sc$average_views_latest <- as.numeric(outcome_sc$average_views_latest)
outcome_sc$average_likes_latest <- as.numeric(outcome_sc$average_likes_latest)
outcome_sc$average_comments_latest <- as.numeric(outcome_sc$average_comments_latest)

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "followers_latest") %>% 
  mutate(follower_growth = `2024-11-04` - `2024-10-21`,
         follower_growth_perc = follower_growth / `2024-10-21`) %>%
  rename("follower_level_post0" = `2024-10-21`,
         "follower_level_post1" = `2024-11-04`) -> outcome_sc_follower

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_views_latest") %>% 
  mutate(views_growth = `2024-11-04` - `2024-10-21`,
         views_growth_perc = views_growth / `2024-10-21`) %>%
  rename("views_level_post0" = `2024-10-21`,
         "views_level_post1" = `2024-11-04`) -> outcome_sc_views

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_likes_latest") %>% 
  mutate(likes_growth = `2024-11-04` - `2024-10-21`,
         likes_growth_perc = likes_growth / `2024-10-21`) %>%
  rename("likes_level_post0" = `2024-10-21`,
         "likes_level_post1" = `2024-11-04`) -> outcome_sc_likes

reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_comments_latest") %>% 
  mutate(comments_growth = `2024-11-04` - `2024-10-21`,
         comments_growth_perc = comments_growth / `2024-10-21`) %>%
  rename("comments_level_post0" = `2024-10-21`,
         "comments_level_post1" = `2024-11-04`) -> outcome_sc_comments

# merge all
outcome_sc <- merge(outcome_sc_follower, outcome_sc_views, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_likes, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_comments, by = "profile_unique_id")
outcome_sc$treatment <- ifelse(outcome_sc$profile_unique_id %in% treated_creators, "treated", "control")

Descriptive Statistics

Outcome Statistics

Here we look at the summary statistics for the outcome variables: follower_growth, views_growth, likes_growth, and comments_growth.

The growth is calculated as the difference between the latest value collected on 11-04 and the latest value collected on 10-21. The intervention was done on 10-15.

It should be noted that for views, likes, and comments, this is a proxy as Trendpop as a statistic called “average_latest” for these three variables, so I’m using the difference between this average_latest collected on 10-21 and the average_latest collected on 11-04.

# summary stats on follower_growth and follower_growth_perc
outcome_sc %>% 
  summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
            sd_follow_growth = sd(follower_growth, na.rm = TRUE),
            `avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
            `sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE)) %>%
  pander()

avg_follow_growth	sd_follow_growth	avg_follow_growth_%	sd_follow_growth_%
451.6	859.8	0.1241	0.2215

# summary stats on views_growth and views_growth_perc
outcome_sc %>% 
  summarise(avg_view_growth = mean(views_growth, na.rm = TRUE),
            sd_view_growth = sd(views_growth, na.rm = TRUE),
            `avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
            `sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE)) %>%
  pander()

avg_view_growth	sd_view_growth	avg_view_growth_%	sd_view_growth_%
1226	42614	0.1296	0.9676

# summary stats on likes_growth and likes_growth_perc
outcome_sc %>% 
  summarise(avg_like_growth = mean(likes_growth, na.rm = TRUE),
            sd_like_growth = sd(likes_growth, na.rm = TRUE),
            `avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
            `sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE)) %>%
  pander()

avg_like_growth	sd_like_growth	avg_like_growth_%	sd_like_growth_%
953.7	5576	0.02213	0.5707

# summary stats on comments_growth and comments_growth_perc
outcome_sc %>% 
  summarise(avg_com_growth = mean(comments_growth, na.rm = TRUE),
            sd_com_growth = sd(comments_growth, na.rm = TRUE),
            `avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
            `sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
  pander()

avg_com_growth	sd_com_growth	avg_com_growth_%	sd_com_growth_%
-9.08	35.63	-0.07724	0.4694

By Treatment

outcome_sc %>% group_by(treatment) %>%
  summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
            sd_follow_growth = sd(follower_growth, na.rm = TRUE),
            `avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
            `sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE),
            avg_view_growth = mean(views_growth, na.rm = TRUE),
            sd_view_growth = sd(views_growth, na.rm = TRUE),
            `avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
            `sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE),
            avg_like_growth = mean(likes_growth, na.rm = TRUE),
            sd_like_growth = sd(likes_growth, na.rm = TRUE),
            `avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
            `sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE),
            avg_com_growth = mean(comments_growth, na.rm = TRUE),
            sd_com_growth = sd(comments_growth, na.rm = TRUE),
            `avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
            `sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
  pander()

Table continues below
treatment	avg_follow_growth	sd_follow_growth	avg_follow_growth_%
control	91.33	132.5	0.1537
treated	654.3	1026	0.1075

Table continues below
sd_follow_growth_%	avg_view_growth	sd_view_growth	avg_view_growth_%
0.2828	23327	61395	0.4312
0.1872	-11206	20910	-0.04009

Table continues below
sd_view_growth_%	avg_like_growth	sd_like_growth	avg_like_growth_%
0.9953	3108	9209	0.335
0.9404	-257.9	423.4	-0.1539

Table continues below
sd_like_growth_%	avg_com_growth	sd_com_growth	avg_com_growth_%
0.5606	-17.89	55.27	-0.07024
0.5115	-4.125	18.15	-0.08074

sd_com_growth_%
0.2713
0.5509

Focal Creator

outcome_sc %>% filter(profile_unique_id == "cbseveningnews") %>%
  dplyr::select(follower_growth, follower_growth_perc, views_growth, views_growth_perc, likes_growth, likes_growth_perc, comments_growth, comments_growth_perc) %>%
  pander()

Table continues below
follower_growth	follower_growth_perc	views_growth	views_growth_perc
0	0	185754	0.2489

likes_growth	likes_growth_perc	comments_growth	comments_growth_perc
27663	0.7405	-165	-0.1078

Feature Statistics

We look at the mean and standard deviation for the feature: containsSummary.

df_long %>% 
  summarise(overall_mean = mean(containsSummary, na.rm = TRUE),
            overall_sd = sd(containsSummary, na.rm = TRUE)) %>%
  pander()

overall_mean	overall_sd
0.4747	0.4997

containsSummary_mean <- mean(df_long$containsSummary, na.rm = TRUE)
containsSummary_sd <- sd(df_long$containsSummary, na.rm = TRUE)

By Treatment and Pre-Post

df_long %>% 
  group_by(treatment, pre_post) %>% 
  summarise(mean = mean(containsSummary, na.rm = TRUE),
            sd = sd(containsSummary, na.rm = TRUE),
            se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
            n = n()) %>%
  pander()

treatment	pre_post	mean	sd	se	n
control	pre	0.6074	0.4901	0.04219	135
control	post	0.6735	0.4714	0.04761	98
treated	pre	0.4017	0.4909	0.0262	351
treated	post	0.4175	0.4943	0.03444	206

Focal Creator

We look at the feature values for the focal creator.

df_long %>% filter(profile_unique_id == "cbseveningnews") %>%
  group_by(pre_post) %>%
  summarise(mean = mean(containsSummary, na.rm = TRUE),
            sd = sd(containsSummary, na.rm = TRUE),
            se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
            n = n()) %>%
  pander()

pre_post	mean	sd	se	n
pre	0.907	0.2939	0.04482	43
post	0.7222	0.4543	0.07571	36

Preliminary Regression

Panel Regression

We first estimate whether the feature containsSummary is associated with the outcome variables views. This is done using the entire dataset instead of separating into the pre and post periods.

lm(views~containsSummary, df_long) %>% summary()

## 
## Call:
## lm(formula = views ~ containsSummary, data = df_long)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
##  -316494  -309345   -96542   -83849 13483362 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        97299      42619   2.283 0.022695 *  
## containsSummary   219339      61858   3.546 0.000414 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 868200 on 788 degrees of freedom
## Multiple R-squared:  0.0157, Adjusted R-squared:  0.01446 
## F-statistic: 12.57 on 1 and 788 DF,  p-value: 0.0004145

We then estimate whether we were able to move the non-viral creators to be more like the focal creator by measuring whether contains summary increased for the treated creators.

panderOptions('round',4)
# round to 3 digits
model_p_t <- feols(containsSummary ~ pre_post | profile_unique_id, 
   data = df_long %>% filter(treatment == "treated"))

model_p_t %>% 
  summary()

## OLS estimation, Dep. Var.: containsSummary
## Observations: 557
## Fixed-effects: profile_unique_id: 16
## Standard-errors: Clustered (profile_unique_id) 
##              Estimate Std. Error  t value Pr(>|t|) 
## pre_postpost 0.013726   0.060035 0.228629  0.82224 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.427024     Adj. R2: 0.222401
##                  Within R2: 2.318e-4

We run an interaction regression here: containsSummary ~ pre_post * treatment

model_p <- feols(containsSummary ~ pre_post * treatment,
   data = df_long)

model_p %>% summary()

## OLS estimation, Dep. Var.: containsSummary
## Observations: 790
## Standard-errors: IID 
##                                Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)                    0.607407   0.042114 14.422774  < 2.2e-16 ***
## pre_postpost                   0.066062   0.064938  1.017315 3.0932e-01    
## treatmenttreated              -0.205698   0.049556 -4.150823 3.6748e-05 ***
## pre_postpost:treatmenttreated -0.050296   0.077855 -0.646018 5.1846e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.488085   Adj. R2: 0.040995

Cross-Sectional

We aggregate to the creator level given our outcome variables are at the creator level.

generate_wide_data <- function(df_long){
  # create wide data by aggregating
  df_wide <- df_long %>% 
    group_by(profile_unique_id, pre_post, treatment, responded) %>% 
    summarise(mean_summary = mean(containsSummary, na.rm = TRUE),
              num_video = n())
  
  post_data <- df_wide %>% 
    filter(pre_post == "post") %>% 
    ungroup() %>%
    dplyr::select(-pre_post)
  
  pre_data <- df_wide %>%
    filter(pre_post == "pre") %>% 
    ungroup() %>%
    dplyr::select(-pre_post)
  
  df_wide <- merge(pre_data, post_data, 
                   by = c("profile_unique_id", "treatment", "responded"), 
                   suffixes = c("_pre", "_post"), all.x = TRUE, all.y = TRUE)
  # round to 3 digits
  df_wide <- df_wide %>% mutate_if(is.numeric, ~round(., 3))
  return (df_wide)
    
}

df_wide  <- generate_wide_data(df_long)
df_wide <- merge(df_wide, outcome_sc, by = c("profile_unique_id", "treatment"))

Average Contains Summary ~ Treatment

df_wide$mean_summary_diff <- df_wide$mean_summary_post - df_wide$mean_summary_pre
lm(mean_summary_diff ~ treatment + num_video_pre + num_video_post + follower_level_post0, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-0.0675	0.1241	-0.544	0.5935
treatmenttreated	-0.0227	0.1151	-0.1968	0.8463
num_video_pre	-0.0023	0.0067	-0.3458	0.7337
num_video_post	0.0098	0.0073	1.336	0.1991
follower_level_post0	0	0	-1.599	0.1282

Fitting linear model: mean_summary_diff ~ treatment + num_video_pre + num_video_post + follower_level_post0
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	0.224	0.2068	0.0201

{Outcome} ~ Average Contains Summary

Follower Growth

follower_growth_perc_mean <- mean(df_wide$follower_growth_perc, na.rm = TRUE)
follower_growth_perc_sd <- sd(df_wide$follower_growth_perc, na.rm = TRUE)

lm(follower_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	533.3	195.7	2.725	0.013
mean_summary_diff	671.7	877	0.7659	0.4527

Fitting linear model: follower_growth ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	909.5	0.0285	-0.0201

lm(follower_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	0.1452	0.0482	3.013	0.0069
mean_summary_diff	0.3642	0.216	1.687	0.1072

Fitting linear model: follower_growth_perc ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	0.224	0.1245	0.0807

Views Growth

lm(views_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-796.9	9674	-0.0824	0.9352
mean_summary_diff	-52910	43360	-1.22	0.2366

Fitting linear model: views_growth ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	44965	0.0693	0.0228

lm(views_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-0.0186	0.1727	-0.1075	0.9155
mean_summary_diff	-1.039	0.774	-1.343	0.1944

Fitting linear model: views_growth_perc ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	0.8027	0.0827	0.0368

Likes Growth

lm(likes_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	944.6	1291	0.7316	0.4729
mean_summary_diff	-4616	5787	-0.7976	0.4345

Fitting linear model: likes_growth ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	6002	0.0308	-0.0176

lm(likes_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	0.0087	0.1339	0.065	0.9488
mean_summary_diff	-0.1677	0.6	-0.2796	0.7827

Fitting linear model: likes_growth_perc ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	0.6222	0.0039	-0.0459

Comments Growth

lm(comments_growth ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-10.02	8.344	-1.201	0.2438
mean_summary_diff	9.978	37.4	0.2668	0.7923

Fitting linear model: comments_growth ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	38.78	0.0035	-0.0463

lm(comments_growth_perc ~ mean_summary_diff, data = df_wide) %>% 
  summary() %>%
  pander()

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-0.0762	0.1043	-0.7309	0.4733
mean_summary_diff	-0.3959	0.4674	-0.8471	0.407

Fitting linear model: comments_growth_perc ~ mean_summary_diff
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
22	0.4847	0.0346	-0.0136

Power Analysis (Simulation)

Intervention

Find non-top focal creators (5K – 250/500K followers) who recently received a viral video success - Point out to non-viral creators recently success and ask them to move towards - 2x2 Scenarios: a control group for non-viral creators and a control for viral creators

Note: all creators should be within the 5K – 250/500K followers’ range in this experiment - Rationale: we won’t be able to influence top-creators (i.e. >= 250/500K followers) - Need to decide the upper bound based on the creators we get

Outcome to measure

Whether non-viral creators are moving
Whether non-viral creators are succeeding
What impact on the non-top focal creators
[NEW] What impact on other creators who have similar feature in the same space

Data Tables

Non-viral creators

Columns

non-viral creator id
assigned focal creator
treatment group (assigned, non-assigned)
pre-post
features (pre & post)
- containsSummary
- 32 dimensions
- overall 32 dimensions
outcomes
- follower growth
- follower growth percentage
- views growth
- views growth percentage
- likes growth
- likes growth percentage
- comments growth
- comments growth percentage

Focal creator table

Columns

treatment group (be assigned, non-assigned)
features (pre & post)
- containsSummary
- 32 dimensions
- overall 32 dimensions
outcomes
- follower growth
- follower growth percentage
- views growth
- views growth percentage
- likes growth
- likes growth percentage
- comments growth
- comments growth percentage

Note: for now, we simulate data using only containsSummary feature and follower growth percentage outcome.

# average number of videos per creator group by pre post
# divided by 2 because of pre and post
avg_num_videos <- round(nrow(df_long) / length(unique(df_long$profile_unique_id)) / 2)

Simulation Procedure

Generate non-viral creators data
- Assign focal creators
- Assign treatment
- Generate features: binomial distribution for containsSummary with mean equals 0.4746835 from pilot data. Those in the treatment group will have a higher probability of containsSummary (0.1 as of now). This is generated as panel data first then aggregated to non-viral creator level to calculate the difference between containsSummary Pre and containsSummary Post.
- Generate outcomes: normal distribution for follower growth percentage with mean equals 0.1342599 and standard deviation equals to 0.2335862 from pilot data. Those in the treatment group will have a fixed effect size of 0.1.
- Note that for each treated focal creator, we assign half of the non-viral creators to the treatment group.
Generate focal creators data
- Assign treatment
- Generate features: ignored for now.
- Generate outcomes: normal distribution for follower growth percentage with mean equals 0.1342599 and standard deviation equals to 0.2335862 from pilot data. Those in the assigned group will have a fixed effect size of 0.005 times the number of non-viral creators in the treatment group. 0.005 means that if the non-focal creator has 100,000 followers, the focal creator will have 100,000 * 0.005 = 500 fewer new followers from each non-viral creator in the treatment group.
Run Regressions and record p-value for Treatment coefficient
- Run regression for non-viral creators
  - Difference in containsSummary_diff ~ Treatment
  - Follower growth percentage ~ Treatment
  - Follower growth percentage ~ Treatment + containsSummary_diff
- Run regression for focal creators
  - Follower growth percentage ~ Treatment

Entire procedure is run 500 times to calculate the power of the test.

power_analysis <- function(n_nonviral, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size, n_run = 500){
  pval_diffs <- c()
  pval_outcomes <- c()
  pval_mediators <- c()
  pval_outcome_focal <- c()
  
  pval_diffs_long <- c()
  pval_outcomes_long <- c()
  pval_mediators_long <- c()
  
  for (i in 1:n_run){
    ####################################
    ######## treatment assignment ######
    ####################################
    treated_focal_creators <- sample(1:n_focal, n_focal/2, replace = FALSE)
    n_nonviral_per_focal <- n_nonviral / (n_focal / 2)
    # print(paste0("n_nonviral_per_focal: ", n_nonviral_per_focal))
    assigned_focal_creator = sample(rep(treated_focal_creators, each = n_nonviral_per_focal), n_nonviral, replace = FALSE)
    # each assigned focal creator should have half non_viral creators in treatment

    # check if this is an integer
    if (n_nonviral_per_focal %% 1 != 0){
      stop("n_nonviral should be divisible by n_focal")
    }
    treated_nonviral_creators <- c()
    for (i in treated_focal_creators){
      treated_nonviral_creators <- c(treated_nonviral_creators, sample(which(assigned_focal_creator == i), n_nonviral_per_focal/2, replace = FALSE))
    }
    
    ####################################
    ######## non-viral creators ########
    ####################################
    nonviral_creators <- data.frame(
      id = 1:n_nonviral,
      # even distribution
      assigned_focal_creator = assigned_focal_creator,
      follower_growth_perc = rnorm(n_nonviral, follower_growth_perc_mean, follower_growth_perc_sd)
    )
    nonviral_creators$treatment <- ifelse(nonviral_creators$id %in% treated_nonviral_creators, "treatment", "control")
    nonviral_creators$follower_growth_perc <- ifelse(nonviral_creators$treatment == "treatment", 
                                                     nonviral_creators$follower_growth_perc + follower_growth_effect_size, 
                                                     nonviral_creators$follower_growth_perc)
    
    
    ### generate long data
    nonviral_creators_videos_pre <- rpois(n_nonviral, avg_num_videos)
    nonviral_creators_videos_post <- rpois(n_nonviral, avg_num_videos)
    
    nonviral_creators_long_pre <- data.frame(
      # repeat each id by nonviral_creators_videos_pre times
      id = rep(1:n_nonviral, nonviral_creators_videos_pre),
      containsSummary = rbinom(sum(nonviral_creators_videos_pre), 1, containsSummary_mean),
      pre_post = "pre"
    )
    nonviral_creators_long_post <- data.frame(
      # repeat each id by nonviral_creators_videos_post times
      id = rep(1:n_nonviral, nonviral_creators_videos_post),
      pre_post = "post"
    )
    nonviral_creators_long_post$containsSummary = ifelse(nonviral_creators_long_post$id %in% treated_nonviral_creators,
                                                          rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean + feature_effect_size),
                                                          rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean))
    nonviral_creators_long <- rbind(nonviral_creators_long_pre, nonviral_creators_long_post)
    
  
    # merge
    nonviral_creators_long <- merge(nonviral_creators, nonviral_creators_long, by = "id")

    
    ### generate wide data
    nonviral_creators_wide <- nonviral_creators_long %>% 
      group_by(id, treatment, assigned_focal_creator, follower_growth_perc, pre_post) %>% 
      summarise(containsSummary = mean(containsSummary, na.rm = TRUE))
    
    nonviral_creators_wide <- reshape2::dcast(nonviral_creators_wide, 
                                              id + treatment + assigned_focal_creator + follower_growth_perc ~ pre_post, 
                                              value.var = "containsSummary")
    nonviral_creators_wide$containsSummary_diff <- nonviral_creators_wide$post - nonviral_creators_wide$pre
    
    nonviral_creators$treatment <- factor(nonviral_creators$treatment, levels = c("control", "treatment"))
    
    ### run regression
    model_diff <- feols(containsSummary_diff ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
    model_outcome <- feols(follower_growth_perc ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
    model_mediator <- feols(follower_growth_perc ~ treatment + containsSummary_diff | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
  
    pval_diffs <- c(pval_diffs, model_diff$coeftable[1, 4] %>% as.numeric())
    pval_outcomes <- c(pval_outcomes, model_outcome$coeftable[1, 4] %>% as.numeric())
    pval_mediators <- c(pval_mediators, model_mediator$coeftable[1, 4] %>% as.numeric())
    
    # model_diff_long <- feols(containsSummary ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
    # model_outcome_long <- feols(follower_growth_perc ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
    # model_mediator_long <- feols(follower_growth_perc ~ treatment + containsSummary | id + assigned_focal_creator, data = nonviral_creators_long)
    # 
    # pval_diffs_long <- c(pval_diffs_long, model_diff_long$coeftable[1, 4] %>% as.numeric())
    # pval_outcomes_long <- c(pval_outcomes_long, model_outcome_long$coeftable[1, 4] %>% as.numeric())
    # pval_mediators_long <- c(pval_mediators_long, model_mediator_long$coeftable[1, 4] %>% as.numeric())
    
    ################################
    ######## focal creators ########
    ################################
    focal_creators <- data.frame(
      id = 1:n_focal,
      # containsSummary_pre = rbinom(n_focal, 1, containsSummary_mean),
      # containsSummary_post = rbinom(n_focal, 1, containsSummary_mean),
      follower_growth_perc = rnorm(n_focal, follower_growth_perc_mean, follower_growth_perc_sd),
      follower_level = runif(n_focal, 5e4, 2.5e5) %>% round()
    )
    focal_creators$treatment <- ifelse(focal_creators$id %in% treated_focal_creators, "assigned", "notassigned")
    focal_creators$follower_growth_perc <- ifelse(focal_creators$treatment == "assigned", 
                                                  focal_creators$follower_growth_perc + follower_decline_effect_size * n_nonviral_per_focal/2, 
                                                  focal_creators$follower_growth_perc)
    
    focal_creators$treatment <- factor(focal_creators$treatment, levels = c("notassigned", "assigned"))
    model_outcome_focal <- lm(follower_growth_perc ~ treatment, data = focal_creators)
    pval_outcome_focal <- c(pval_outcome_focal, summary(model_outcome_focal)$coefficients[2, 4])
    
  }
  
  power_diffs <- mean(pval_diffs <= 0.05, na.rm = T)
  power_outcomes <- mean(pval_outcomes <= 0.05, na.rm = T)
  power_mediators <- mean(pval_mediators <= 0.05, na.rm = T)
  power_outcome_focal <- mean(pval_outcome_focal <= 0.05, na.rm = T)
  return (list(power_diffs = power_diffs, power_outcomes = power_outcomes, power_mediators = power_mediators, power_outcome_focal = power_outcome_focal))

}

# calculate time
start_time <- Sys.time()
feature_effect_size <- 0.1
# assume fixed effect size across all creators
follower_growth_effect_size <- 0.1

# on average how each non-viral creator affects focal creators
follower_decline_effect_size <- 0.005

n_nonvirals_to_test <- seq(20, 100, 20) # this is per treated focal creator
n_focals_to_test <- seq(20, 80, 20)

powers <- data.frame(matrix(nrow = length(n_nonvirals_to_test) * length(n_focals_to_test), ncol = 6))
colnames(powers) <- c("n_nonviral_per_treated_focal", "n_focal", "power_diffs", "power_outcomes", "power_mediators", "power_outcome_focal")

i <- 1
for (n_nonviral in n_nonvirals_to_test){
  for (n_focal in n_focals_to_test){
    power_list <- power_analysis(n_nonviral * n_focal / 2, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size)
    powers[i, ] <- c(n_nonviral, n_focal, unlist(power_list))
    i <- i + 1
  }
}
end_time <- Sys.time()
print(paste0("Time taken to run simulation: ", end_time - start_time))

## [1] "Time taken to run simulation: 8.92246528466543"

Note that as of right now, we randomize half of the focal creators to the control group and half of the focal creators to the treatment group

For example, if n_focal = 80, 40 are randomized into the control group.

# plot powers
melt(powers, id.vars = c("n_nonviral_per_treated_focal", "n_focal")) %>%
  # rename variable
  mutate(variable = case_when(
    variable == "power_diffs" ~ "Difference in containsSummary",
    variable == "power_outcomes" ~ "Follower Growth Percentage",
    variable == "power_mediators" ~ "Follower Growth Percentage &\ncontainsSummary",
    variable == "power_outcome_focal" ~ "Focal Creator Follower Growth Percentage"
  )) %>%
  ggplot(aes(x = n_nonviral_per_treated_focal, y = value, color = factor(n_focal))) +
  geom_line() +
  geom_point(size = 1) +
  # horizontal line at 0.8
  geom_hline(yintercept = 0.8, linetype = "dashed") +
  facet_wrap(~variable, scales = "free") +
  # set yscale to be 0 to 1 for all
  scale_y_continuous(limits = c(0, 1)) +
  labs(title = "Power Analysis",
       subtitle = "Treatment ~ Variables (with variables name shown on each facet)",
       x = "Number of Non-Viral Creators per Treated Focal Creator",
       y = "Power") +
  theme_minimal()

Power Analysis for Creator Experiment

2024-11-09