# first column read as string
videos_df <- read.csv("data/tikapi/samples/sample2download_102724_with_stats_summary.csv", colClasses = c(id = "character"))
outcome_selfcollect <- read.csv("data/tikapi/stats/hurricane_news/overall_trendpop_stats_selfcollect.csv", colClasses = c(id = "character"))
follower_history <- read.csv("data/tikapi/stats/hurricane_news/overall_follower_history_trendpop.csv")
# remove punctuations
videos_df$containsSummary <- gsub("[[:punct:]]", "", videos_df$containsSummary)
videos_df$containsSummary <- gsub(" ", "", videos_df$containsSummary)
videos_df$containsSummary <- ifelse(videos_df$containsSummary == "Yes", 1, 0)
videos_df$pre_post <- ifelse(videos_df$created_date <= "2024-10-15", "pre", "post")
videos_df$pre_post <- factor(videos_df$pre_post, levels = c("pre", "post"))
control_creators <- c("itzzzalexxx1", "news5128", "highk3k41", "andrewmarkowitz96", "traviswingate", "xero_chaos", "news.everyday1")
videos_df$treatment <- ifelse(videos_df$profile_unique_id == "cbseveningnews", "control",
ifelse(videos_df$profile_unique_id %in% control_creators, "control", "treated"))
videos_df$treatment <- factor(videos_df$treatment, levels = c("control", "treated"))
response_creators <- c("chefsreviews", "kingtretainment", "mjwhomes")
videos_df$responded <- ifelse(videos_df$profile_unique_id %in% response_creators, 1, 0)
influencer_no_video <- videos_df %>%
group_by(profile_unique_id, pre_post) %>%
summarise(num_videos = n()) %>%
spread(pre_post, num_videos) %>%
filter(is.na(pre) | is.na(post)) %>%
pull(profile_unique_id)
df_long <- videos_df %>%
filter(!profile_unique_id %in% influencer_no_video)
treated_creators <- df_long %>%
filter(treatment == "treated") %>%
pull(profile_unique_id)
control_creators <- df_long %>%
filter(treatment == "control") %>%
pull(profile_unique_id)
outcome_sc <- outcome_selfcollect %>%
filter(created_date %in% c("2024-10-21", "2024-11-04")) %>%
dplyr::select(c("id", "profile_unique_id", "followers_latest", "average_views_latest", "average_likes_latest", "average_comments_latest", "created_date"))
outcome_sc$followers_latest <- as.numeric(outcome_sc$followers_latest)
outcome_sc$average_views_latest <- as.numeric(outcome_sc$average_views_latest)
outcome_sc$average_likes_latest <- as.numeric(outcome_sc$average_likes_latest)
outcome_sc$average_comments_latest <- as.numeric(outcome_sc$average_comments_latest)
reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "followers_latest") %>%
mutate(follower_growth = `2024-11-04` - `2024-10-21`,
follower_growth_perc = follower_growth / `2024-10-21`) %>%
rename("follower_level_post0" = `2024-10-21`,
"follower_level_post1" = `2024-11-04`) -> outcome_sc_follower
reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_views_latest") %>%
mutate(views_growth = `2024-11-04` - `2024-10-21`,
views_growth_perc = views_growth / `2024-10-21`) %>%
rename("views_level_post0" = `2024-10-21`,
"views_level_post1" = `2024-11-04`) -> outcome_sc_views
reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_likes_latest") %>%
mutate(likes_growth = `2024-11-04` - `2024-10-21`,
likes_growth_perc = likes_growth / `2024-10-21`) %>%
rename("likes_level_post0" = `2024-10-21`,
"likes_level_post1" = `2024-11-04`) -> outcome_sc_likes
reshape2::dcast(outcome_sc, profile_unique_id ~ created_date, value.var = "average_comments_latest") %>%
mutate(comments_growth = `2024-11-04` - `2024-10-21`,
comments_growth_perc = comments_growth / `2024-10-21`) %>%
rename("comments_level_post0" = `2024-10-21`,
"comments_level_post1" = `2024-11-04`) -> outcome_sc_comments
# merge all
outcome_sc <- merge(outcome_sc_follower, outcome_sc_views, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_likes, by = "profile_unique_id")
outcome_sc <- merge(outcome_sc, outcome_sc_comments, by = "profile_unique_id")
outcome_sc$treatment <- ifelse(outcome_sc$profile_unique_id %in% treated_creators, "treated", "control")
Here we look at the summary statistics for the outcome variables: follower_growth, views_growth, likes_growth, and comments_growth.
The growth is calculated as the difference between the latest value collected on 11-04 and the latest value collected on 10-21. The intervention was done on 10-15.
It should be noted that for views, likes, and comments, this is a proxy as Trendpop as a statistic called “average_latest” for these three variables, so I’m using the difference between this average_latest collected on 10-21 and the average_latest collected on 11-04.
# summary stats on follower_growth and follower_growth_perc
outcome_sc %>%
summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
sd_follow_growth = sd(follower_growth, na.rm = TRUE),
`avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
`sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE)) %>%
pander()
| avg_follow_growth | sd_follow_growth | avg_follow_growth_% | sd_follow_growth_% |
|---|---|---|---|
| 451.6 | 859.8 | 0.1241 | 0.2215 |
# summary stats on views_growth and views_growth_perc
outcome_sc %>%
summarise(avg_view_growth = mean(views_growth, na.rm = TRUE),
sd_view_growth = sd(views_growth, na.rm = TRUE),
`avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
`sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE)) %>%
pander()
| avg_view_growth | sd_view_growth | avg_view_growth_% | sd_view_growth_% |
|---|---|---|---|
| 1226 | 42614 | 0.1296 | 0.9676 |
# summary stats on likes_growth and likes_growth_perc
outcome_sc %>%
summarise(avg_like_growth = mean(likes_growth, na.rm = TRUE),
sd_like_growth = sd(likes_growth, na.rm = TRUE),
`avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
`sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE)) %>%
pander()
| avg_like_growth | sd_like_growth | avg_like_growth_% | sd_like_growth_% |
|---|---|---|---|
| 953.7 | 5576 | 0.02213 | 0.5707 |
# summary stats on comments_growth and comments_growth_perc
outcome_sc %>%
summarise(avg_com_growth = mean(comments_growth, na.rm = TRUE),
sd_com_growth = sd(comments_growth, na.rm = TRUE),
`avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
`sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
pander()
| avg_com_growth | sd_com_growth | avg_com_growth_% | sd_com_growth_% |
|---|---|---|---|
| -9.08 | 35.63 | -0.07724 | 0.4694 |
By Treatment
outcome_sc %>% group_by(treatment) %>%
summarise(avg_follow_growth = mean(follower_growth, na.rm = TRUE),
sd_follow_growth = sd(follower_growth, na.rm = TRUE),
`avg_follow_growth_%` = mean(follower_growth_perc, na.rm = TRUE),
`sd_follow_growth_%` = sd(follower_growth_perc, na.rm = TRUE),
avg_view_growth = mean(views_growth, na.rm = TRUE),
sd_view_growth = sd(views_growth, na.rm = TRUE),
`avg_view_growth_%` = mean(views_growth_perc, na.rm = TRUE),
`sd_view_growth_%` = sd(views_growth_perc, na.rm = TRUE),
avg_like_growth = mean(likes_growth, na.rm = TRUE),
sd_like_growth = sd(likes_growth, na.rm = TRUE),
`avg_like_growth_%` = mean(likes_growth_perc, na.rm = TRUE),
`sd_like_growth_%` = sd(likes_growth_perc, na.rm = TRUE),
avg_com_growth = mean(comments_growth, na.rm = TRUE),
sd_com_growth = sd(comments_growth, na.rm = TRUE),
`avg_com_growth_%` = mean(comments_growth_perc, na.rm = TRUE),
`sd_com_growth_%` = sd(comments_growth_perc, na.rm = TRUE)) %>%
pander()
| treatment | avg_follow_growth | sd_follow_growth | avg_follow_growth_% |
|---|---|---|---|
| control | 91.33 | 132.5 | 0.1537 |
| treated | 654.3 | 1026 | 0.1075 |
| sd_follow_growth_% | avg_view_growth | sd_view_growth | avg_view_growth_% |
|---|---|---|---|
| 0.2828 | 23327 | 61395 | 0.4312 |
| 0.1872 | -11206 | 20910 | -0.04009 |
| sd_view_growth_% | avg_like_growth | sd_like_growth | avg_like_growth_% |
|---|---|---|---|
| 0.9953 | 3108 | 9209 | 0.335 |
| 0.9404 | -257.9 | 423.4 | -0.1539 |
| sd_like_growth_% | avg_com_growth | sd_com_growth | avg_com_growth_% |
|---|---|---|---|
| 0.5606 | -17.89 | 55.27 | -0.07024 |
| 0.5115 | -4.125 | 18.15 | -0.08074 |
| sd_com_growth_% |
|---|
| 0.2713 |
| 0.5509 |
Focal Creator
outcome_sc %>% filter(profile_unique_id == "cbseveningnews") %>%
dplyr::select(follower_growth, follower_growth_perc, views_growth, views_growth_perc, likes_growth, likes_growth_perc, comments_growth, comments_growth_perc) %>%
pander()
| follower_growth | follower_growth_perc | views_growth | views_growth_perc |
|---|---|---|---|
| 0 | 0 | 185754 | 0.2489 |
| likes_growth | likes_growth_perc | comments_growth | comments_growth_perc |
|---|---|---|---|
| 27663 | 0.7405 | -165 | -0.1078 |
We look at the mean and standard deviation for the feature: containsSummary.
df_long %>%
summarise(overall_mean = mean(containsSummary, na.rm = TRUE),
overall_sd = sd(containsSummary, na.rm = TRUE)) %>%
pander()
| overall_mean | overall_sd |
|---|---|
| 0.4747 | 0.4997 |
containsSummary_mean <- mean(df_long$containsSummary, na.rm = TRUE)
containsSummary_sd <- sd(df_long$containsSummary, na.rm = TRUE)
By Treatment and Pre-Post
df_long %>%
group_by(treatment, pre_post) %>%
summarise(mean = mean(containsSummary, na.rm = TRUE),
sd = sd(containsSummary, na.rm = TRUE),
se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
n = n()) %>%
pander()
| treatment | pre_post | mean | sd | se | n |
|---|---|---|---|---|---|
| control | pre | 0.6074 | 0.4901 | 0.04219 | 135 |
| control | post | 0.6735 | 0.4714 | 0.04761 | 98 |
| treated | pre | 0.4017 | 0.4909 | 0.0262 | 351 |
| treated | post | 0.4175 | 0.4943 | 0.03444 | 206 |
Focal Creator
We look at the feature values for the focal creator.
df_long %>% filter(profile_unique_id == "cbseveningnews") %>%
group_by(pre_post) %>%
summarise(mean = mean(containsSummary, na.rm = TRUE),
sd = sd(containsSummary, na.rm = TRUE),
se = sd(containsSummary, na.rm = TRUE) / sqrt(n()),
n = n()) %>%
pander()
| pre_post | mean | sd | se | n |
|---|---|---|---|---|
| pre | 0.907 | 0.2939 | 0.04482 | 43 |
| post | 0.7222 | 0.4543 | 0.07571 | 36 |
We first estimate whether the feature containsSummary is
associated with the outcome variables views. This is done
using the entire dataset instead of separating into the pre and post
periods.
lm(views~containsSummary, df_long) %>% summary()
##
## Call:
## lm(formula = views ~ containsSummary, data = df_long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -316494 -309345 -96542 -83849 13483362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97299 42619 2.283 0.022695 *
## containsSummary 219339 61858 3.546 0.000414 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 868200 on 788 degrees of freedom
## Multiple R-squared: 0.0157, Adjusted R-squared: 0.01446
## F-statistic: 12.57 on 1 and 788 DF, p-value: 0.0004145
We then estimate whether we were able to move the non-viral creators to be more like the focal creator by measuring whether contains summary increased for the treated creators.
panderOptions('round',4)
# round to 3 digits
model_p_t <- feols(containsSummary ~ pre_post | profile_unique_id,
data = df_long %>% filter(treatment == "treated"))
model_p_t %>%
summary()
## OLS estimation, Dep. Var.: containsSummary
## Observations: 557
## Fixed-effects: profile_unique_id: 16
## Standard-errors: Clustered (profile_unique_id)
## Estimate Std. Error t value Pr(>|t|)
## pre_postpost 0.013726 0.060035 0.228629 0.82224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.427024 Adj. R2: 0.222401
## Within R2: 2.318e-4
We run an interaction regression here: containsSummary ~ pre_post * treatment
model_p <- feols(containsSummary ~ pre_post * treatment,
data = df_long)
model_p %>% summary()
## OLS estimation, Dep. Var.: containsSummary
## Observations: 790
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.607407 0.042114 14.422774 < 2.2e-16 ***
## pre_postpost 0.066062 0.064938 1.017315 3.0932e-01
## treatmenttreated -0.205698 0.049556 -4.150823 3.6748e-05 ***
## pre_postpost:treatmenttreated -0.050296 0.077855 -0.646018 5.1846e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.488085 Adj. R2: 0.040995
We aggregate to the creator level given our outcome variables are at the creator level.
generate_wide_data <- function(df_long){
# create wide data by aggregating
df_wide <- df_long %>%
group_by(profile_unique_id, pre_post, treatment, responded) %>%
summarise(mean_summary = mean(containsSummary, na.rm = TRUE),
num_video = n())
post_data <- df_wide %>%
filter(pre_post == "post") %>%
ungroup() %>%
dplyr::select(-pre_post)
pre_data <- df_wide %>%
filter(pre_post == "pre") %>%
ungroup() %>%
dplyr::select(-pre_post)
df_wide <- merge(pre_data, post_data,
by = c("profile_unique_id", "treatment", "responded"),
suffixes = c("_pre", "_post"), all.x = TRUE, all.y = TRUE)
# round to 3 digits
df_wide <- df_wide %>% mutate_if(is.numeric, ~round(., 3))
return (df_wide)
}
df_wide <- generate_wide_data(df_long)
df_wide <- merge(df_wide, outcome_sc, by = c("profile_unique_id", "treatment"))
Average Contains Summary ~ Treatment
df_wide$mean_summary_diff <- df_wide$mean_summary_post - df_wide$mean_summary_pre
lm(mean_summary_diff ~ treatment + num_video_pre + num_video_post + follower_level_post0, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -0.0675 | 0.1241 | -0.544 | 0.5935 |
| treatmenttreated | -0.0227 | 0.1151 | -0.1968 | 0.8463 |
| num_video_pre | -0.0023 | 0.0067 | -0.3458 | 0.7337 |
| num_video_post | 0.0098 | 0.0073 | 1.336 | 0.1991 |
| follower_level_post0 | 0 | 0 | -1.599 | 0.1282 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 0.224 | 0.2068 | 0.0201 |
{Outcome} ~ Average Contains Summary
follower_growth_perc_mean <- mean(df_wide$follower_growth_perc, na.rm = TRUE)
follower_growth_perc_sd <- sd(df_wide$follower_growth_perc, na.rm = TRUE)
lm(follower_growth ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 533.3 | 195.7 | 2.725 | 0.013 |
| mean_summary_diff | 671.7 | 877 | 0.7659 | 0.4527 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 909.5 | 0.0285 | -0.0201 |
lm(follower_growth_perc ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.1452 | 0.0482 | 3.013 | 0.0069 |
| mean_summary_diff | 0.3642 | 0.216 | 1.687 | 0.1072 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 0.224 | 0.1245 | 0.0807 |
lm(views_growth ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -796.9 | 9674 | -0.0824 | 0.9352 |
| mean_summary_diff | -52910 | 43360 | -1.22 | 0.2366 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 44965 | 0.0693 | 0.0228 |
lm(views_growth_perc ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -0.0186 | 0.1727 | -0.1075 | 0.9155 |
| mean_summary_diff | -1.039 | 0.774 | -1.343 | 0.1944 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 0.8027 | 0.0827 | 0.0368 |
lm(likes_growth ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 944.6 | 1291 | 0.7316 | 0.4729 |
| mean_summary_diff | -4616 | 5787 | -0.7976 | 0.4345 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 6002 | 0.0308 | -0.0176 |
lm(likes_growth_perc ~ mean_summary_diff, data = df_wide) %>%
summary() %>%
pander()
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.0087 | 0.1339 | 0.065 | 0.9488 |
| mean_summary_diff | -0.1677 | 0.6 | -0.2796 | 0.7827 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 22 | 0.6222 | 0.0039 | -0.0459 |
Intervention
Find non-top focal creators (5K – 250/500K followers) who recently received a viral video success - Point out to non-viral creators recently success and ask them to move towards - 2x2 Scenarios: a control group for non-viral creators and a control for viral creators
Note: all creators should be within the 5K – 250/500K followers’ range in this experiment - Rationale: we won’t be able to influence top-creators (i.e. >= 250/500K followers) - Need to decide the upper bound based on the creators we get
Outcome to measure
Data Tables
Non-viral creators
Columns
Focal creator table
Columns
Note: for now, we simulate data using only containsSummary feature and follower growth percentage outcome.
# average number of videos per creator group by pre post
# divided by 2 because of pre and post
avg_num_videos <- round(nrow(df_long) / length(unique(df_long$profile_unique_id)) / 2)
Simulation Procedure
Entire procedure is run 500 times to calculate the power of the test.
power_analysis <- function(n_nonviral, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size, n_run = 500){
pval_diffs <- c()
pval_outcomes <- c()
pval_mediators <- c()
pval_outcome_focal <- c()
pval_diffs_long <- c()
pval_outcomes_long <- c()
pval_mediators_long <- c()
for (i in 1:n_run){
####################################
######## treatment assignment ######
####################################
treated_focal_creators <- sample(1:n_focal, n_focal/2, replace = FALSE)
n_nonviral_per_focal <- n_nonviral / (n_focal / 2)
# print(paste0("n_nonviral_per_focal: ", n_nonviral_per_focal))
assigned_focal_creator = sample(rep(treated_focal_creators, each = n_nonviral_per_focal), n_nonviral, replace = FALSE)
# each assigned focal creator should have half non_viral creators in treatment
# check if this is an integer
if (n_nonviral_per_focal %% 1 != 0){
stop("n_nonviral should be divisible by n_focal")
}
treated_nonviral_creators <- c()
for (i in treated_focal_creators){
treated_nonviral_creators <- c(treated_nonviral_creators, sample(which(assigned_focal_creator == i), n_nonviral_per_focal/2, replace = FALSE))
}
####################################
######## non-viral creators ########
####################################
nonviral_creators <- data.frame(
id = 1:n_nonviral,
# even distribution
assigned_focal_creator = assigned_focal_creator,
follower_growth_perc = rnorm(n_nonviral, follower_growth_perc_mean, follower_growth_perc_sd)
)
nonviral_creators$treatment <- ifelse(nonviral_creators$id %in% treated_nonviral_creators, "treatment", "control")
nonviral_creators$follower_growth_perc <- ifelse(nonviral_creators$treatment == "treatment",
nonviral_creators$follower_growth_perc + follower_growth_effect_size,
nonviral_creators$follower_growth_perc)
### generate long data
nonviral_creators_videos_pre <- rpois(n_nonviral, avg_num_videos)
nonviral_creators_videos_post <- rpois(n_nonviral, avg_num_videos)
nonviral_creators_long_pre <- data.frame(
# repeat each id by nonviral_creators_videos_pre times
id = rep(1:n_nonviral, nonviral_creators_videos_pre),
containsSummary = rbinom(sum(nonviral_creators_videos_pre), 1, containsSummary_mean),
pre_post = "pre"
)
nonviral_creators_long_post <- data.frame(
# repeat each id by nonviral_creators_videos_post times
id = rep(1:n_nonviral, nonviral_creators_videos_post),
pre_post = "post"
)
nonviral_creators_long_post$containsSummary = ifelse(nonviral_creators_long_post$id %in% treated_nonviral_creators,
rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean + feature_effect_size),
rbinom(sum(nonviral_creators_videos_post), 1, containsSummary_mean))
nonviral_creators_long <- rbind(nonviral_creators_long_pre, nonviral_creators_long_post)
# merge
nonviral_creators_long <- merge(nonviral_creators, nonviral_creators_long, by = "id")
### generate wide data
nonviral_creators_wide <- nonviral_creators_long %>%
group_by(id, treatment, assigned_focal_creator, follower_growth_perc, pre_post) %>%
summarise(containsSummary = mean(containsSummary, na.rm = TRUE))
nonviral_creators_wide <- reshape2::dcast(nonviral_creators_wide,
id + treatment + assigned_focal_creator + follower_growth_perc ~ pre_post,
value.var = "containsSummary")
nonviral_creators_wide$containsSummary_diff <- nonviral_creators_wide$post - nonviral_creators_wide$pre
nonviral_creators$treatment <- factor(nonviral_creators$treatment, levels = c("control", "treatment"))
### run regression
model_diff <- feols(containsSummary_diff ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
model_outcome <- feols(follower_growth_perc ~ treatment | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
model_mediator <- feols(follower_growth_perc ~ treatment + containsSummary_diff | assigned_focal_creator, data = nonviral_creators_wide, notes = FALSE)
pval_diffs <- c(pval_diffs, model_diff$coeftable[1, 4] %>% as.numeric())
pval_outcomes <- c(pval_outcomes, model_outcome$coeftable[1, 4] %>% as.numeric())
pval_mediators <- c(pval_mediators, model_mediator$coeftable[1, 4] %>% as.numeric())
# model_diff_long <- feols(containsSummary ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
# model_outcome_long <- feols(follower_growth_perc ~ treatment | id + assigned_focal_creator, data = nonviral_creators_long)
# model_mediator_long <- feols(follower_growth_perc ~ treatment + containsSummary | id + assigned_focal_creator, data = nonviral_creators_long)
#
# pval_diffs_long <- c(pval_diffs_long, model_diff_long$coeftable[1, 4] %>% as.numeric())
# pval_outcomes_long <- c(pval_outcomes_long, model_outcome_long$coeftable[1, 4] %>% as.numeric())
# pval_mediators_long <- c(pval_mediators_long, model_mediator_long$coeftable[1, 4] %>% as.numeric())
################################
######## focal creators ########
################################
focal_creators <- data.frame(
id = 1:n_focal,
# containsSummary_pre = rbinom(n_focal, 1, containsSummary_mean),
# containsSummary_post = rbinom(n_focal, 1, containsSummary_mean),
follower_growth_perc = rnorm(n_focal, follower_growth_perc_mean, follower_growth_perc_sd),
follower_level = runif(n_focal, 5e4, 2.5e5) %>% round()
)
focal_creators$treatment <- ifelse(focal_creators$id %in% treated_focal_creators, "assigned", "notassigned")
focal_creators$follower_growth_perc <- ifelse(focal_creators$treatment == "assigned",
focal_creators$follower_growth_perc + follower_decline_effect_size * n_nonviral_per_focal/2,
focal_creators$follower_growth_perc)
focal_creators$treatment <- factor(focal_creators$treatment, levels = c("notassigned", "assigned"))
model_outcome_focal <- lm(follower_growth_perc ~ treatment, data = focal_creators)
pval_outcome_focal <- c(pval_outcome_focal, summary(model_outcome_focal)$coefficients[2, 4])
}
power_diffs <- mean(pval_diffs <= 0.05, na.rm = T)
power_outcomes <- mean(pval_outcomes <= 0.05, na.rm = T)
power_mediators <- mean(pval_mediators <= 0.05, na.rm = T)
power_outcome_focal <- mean(pval_outcome_focal <= 0.05, na.rm = T)
return (list(power_diffs = power_diffs, power_outcomes = power_outcomes, power_mediators = power_mediators, power_outcome_focal = power_outcome_focal))
}
# calculate time
start_time <- Sys.time()
feature_effect_size <- 0.1
# assume fixed effect size across all creators
follower_growth_effect_size <- 0.1
# on average how each non-viral creator affects focal creators
follower_decline_effect_size <- 0.005
n_nonvirals_to_test <- seq(20, 100, 20) # this is per treated focal creator
n_focals_to_test <- seq(20, 80, 20)
powers <- data.frame(matrix(nrow = length(n_nonvirals_to_test) * length(n_focals_to_test), ncol = 6))
colnames(powers) <- c("n_nonviral_per_treated_focal", "n_focal", "power_diffs", "power_outcomes", "power_mediators", "power_outcome_focal")
i <- 1
for (n_nonviral in n_nonvirals_to_test){
for (n_focal in n_focals_to_test){
power_list <- power_analysis(n_nonviral * n_focal / 2, n_focal, feature_effect_size, follower_growth_effect_size, follower_decline_effect_size)
powers[i, ] <- c(n_nonviral, n_focal, unlist(power_list))
i <- i + 1
}
}
end_time <- Sys.time()
print(paste0("Time taken to run simulation: ", end_time - start_time))
## [1] "Time taken to run simulation: 8.92246528466543"
Note that as of right now, we randomize half of the focal creators to the control group and half of the focal creators to the treatment group
For example, if n_focal = 80, 40 are randomized into the control group.
# plot powers
melt(powers, id.vars = c("n_nonviral_per_treated_focal", "n_focal")) %>%
# rename variable
mutate(variable = case_when(
variable == "power_diffs" ~ "Difference in containsSummary",
variable == "power_outcomes" ~ "Follower Growth Percentage",
variable == "power_mediators" ~ "Follower Growth Percentage &\ncontainsSummary",
variable == "power_outcome_focal" ~ "Focal Creator Follower Growth Percentage"
)) %>%
ggplot(aes(x = n_nonviral_per_treated_focal, y = value, color = factor(n_focal))) +
geom_line() +
geom_point(size = 1) +
# horizontal line at 0.8
geom_hline(yintercept = 0.8, linetype = "dashed") +
facet_wrap(~variable, scales = "free") +
# set yscale to be 0 to 1 for all
scale_y_continuous(limits = c(0, 1)) +
labs(title = "Power Analysis",
subtitle = "Treatment ~ Variables (with variables name shown on each facet)",
x = "Number of Non-Viral Creators per Treated Focal Creator",
y = "Power") +
theme_minimal()
Comments Growth