setwd("C:/Users/ASUS/Desktop/快手")

数据:2022.09.01–2022.11.01 random pk viewer change data

1.prepare

data = read.csv("Random_pk_viewer_change_data_clean.csv")

自变量

### fan-size
data = data %>%mutate(fans_piar_cat = case_when(
  (before_fans_count>=10000) & (other_before_fans_count>=10000)~ 3,
  (before_fans_count>=10000) & (other_before_fans_count<10000)~ 2,
  (before_fans_count<10000) & (other_before_fans_count>= 10000)~ 1,
  (before_fans_count<10000) & (other_before_fans_count<10000)~ 0
))

data <- data %>% 
  mutate(fans_piar_cat = factor(fans_piar_cat, level = c(0,1,2,3), labels = c("Small vs Small ", "Small vs Big", "Big vs Small", "Big vs Big")))

### 合作
# (1) 按照直播类型来划分:同类:1
data <- data %>% 
  mutate(is_cooperative = case_when(
    (live_operation_tag == other_live_operation_tag) ~ 1,  
    TRUE ~ 0
  ))

table(data$fans_piar_cat)
## 
## Small vs Small     Small vs Big    Big vs Small      Big vs Big 
##            7597             616             605             847
table(data$is_cooperative)
## 
##    0    1 
## 6507 3163

因变量

# # table(data$total_cost_amt) # 119056 为0
# # (1) 人均粉丝打赏
# data$avg_fan_total_cost_amt= (data$total_cost_amt)/(data$before_fans_count + 1)
# # (2) 涨粉
# # data$follow_author_fans_count
# # (3) 掉粉
# # data$unfollow_author_fans_count
# # (4) net 涨粉
# data$net_follow_fans= data$follow_author_fans_count - data$unfollow_author_fans_count
# summary(data$net_follow_fans)
# # (5) 吸粉
# # data$already_follow_other_fans_count
# # (6) 粉丝被对方吸走
# # data$other_already_follow_other_fans_count
# # (7) net吸粉
# data$net_attract_fans= data$already_follow_other_fans_count - data$other_already_follow_other_fans_count
# summary(data$net_attract_fans)

# (8) 自己的viewer跑到对方直播间(掉viewer)
# data$a_to_b_user_count

# (9) 对方的viewer跑到自己的直播间(吸viewer)
# data$b_to_a_user_count
 
# (10) net吸来的viewer
data$net_viewer_change= data$b_to_a_user_count  - data$a_to_b_user_count
summary(data$net_viewer_change)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -516.0000    0.0000    0.0000   -0.1611    0.0000  250.0000

(1) plot_predictions_with_ci

library(ggplot2)
library(dplyr)

# 创建函数
plot_predictions_with_ci <- function(model_1, data) {
  
  # 预测值和标准误差
  predictions <- predict(model_1, newdata = data, se.fit = TRUE)
  
  # 将预测值和标准误差添加到数据框中
  data$predicted_y <- predictions$fit 
  data$se_fit <- predictions$se.fit
  
  # 按照分组变量计算均值和95%的置信区间
  summary_data <- data %>%
    group_by(fans_piar_cat, is_cooperative) %>%
    summarise(
      mean_predicted_y = mean(predicted_y),
      ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
      ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
    )
  
  # 绘制图表
  plot <- ggplot(summary_data, aes(x = factor(fans_piar_cat), y = mean_predicted_y, color = factor(is_cooperative))) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
    labs(
      title = "Mean Predicted y Values with Confidence Intervals",
      x = "Pair Categories",
      y = "Mean Predicted y",
      color = "Is Same Category"
    ) +
    theme_minimal()
  
  # 返回图表
  return(plot)
}

(2) plot_predictions_fans

library(ggplot2)
library(dplyr)

# 创建函数
plot_predictions_fans <- function(model_1, data) {
  
  # 预测值和标准误差
  predictions <- predict(model_1, newdata = data, se.fit = TRUE)
  
  # 将预测值和标准误差添加到数据框中
  data$predicted_y <- predictions$fit 
  data$se_fit <- predictions$se.fit
  
  # 按照分组变量计算均值和95%的置信区间
  summary_data <- data %>%
    group_by(fans_piar_cat) %>%
    summarise(
      mean_predicted_y = mean(predicted_y),
      ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
      ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
    )
  
  # 绘制图表
  plot <- ggplot(summary_data, aes(x = factor(fans_piar_cat), y = mean_predicted_y, )) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
    labs(
      title = "Mean Predicted y Values with Confidence Intervals",
      x = "fans Pair",
      y = "Mean Predicted y",
      color = "Is Same Category"
    ) +
    theme_minimal()
  
  # 返回图表
  return(plot)
}

(3) plot_predictions_cate

library(ggplot2)
library(dplyr)

# 创建函数
plot_predictions_cate <- function(model_1, data) {
  
  # 预测值和标准误差
  predictions <- predict(model_1, newdata = data, se.fit = TRUE)
  
  # 将预测值和标准误差添加到数据框中
  data$predicted_y <- predictions$fit 
  data$se_fit <- predictions$se.fit
  
  # 按照分组变量计算均值和95%的置信区间
  summary_data <- data %>%
    group_by(is_cooperative) %>%
    summarise(
      mean_predicted_y = mean(predicted_y),
      ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
      ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
    )
  
  # 绘制图表
  plot <- ggplot(summary_data, aes(x = factor(is_cooperative), y = mean_predicted_y )) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
    labs(
      title = "Mean Predicted y Values with Confidence Intervals",
      x = "Categories",
      y = "Mean Predicted y",
      color = "Is Same Category"
    ) +
    theme_minimal()
  
  # 返回图表
  return(plot)
}

(4) plot_continue_cutoff

library(dplyr)
library(ggplot2)
library(scales)  # for comma formatting
## 
## 载入程序包:'scales'
## The following object is masked from 'package:fixest':
## 
##     pvalue
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plm::between()      masks dplyr::between()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ plm::lag()          masks dplyr::lag(), stats::lag()
## ✖ plm::lead()         masks dplyr::lead()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 定义函数,输入参数为模型公式
plot_continue_cutoff <- function(data, model_formula) {
  
  # Step 1: Create a sequence of 100 cut-offs from 0.1k to 1000k, log-stepped
  cutoffs <- exp(seq(log(100), log(1e6), length.out = 100))
  
  # Create a matrix to store coefficients of factor(fans_piar_cat)
  coefficients_matrix <- matrix(NA, nrow = 100, ncol = 3)
  
  # Create a matrix to store the counts of each category
  pct_matrix <- matrix(NA, nrow = 100, ncol = 4)  # For 4 categories
  
  # Step 2: Loop over each cut-off, create fans_piar_cat, run the regression, and store the coefficients
  for (i in seq_along(cutoffs)) {
    cut_off <- cutoffs[i]
    
    # Create the fans_piar_cat variable based on cut-off
    data <- data %>%
      mutate(fans_piar_cat_temp = case_when(
        (before_fans_count < cut_off) & (other_before_fans_count < cut_off) ~ "Small vs Small",
        (before_fans_count < cut_off) & (other_before_fans_count >= cut_off) ~ "Small vs Big",
        (before_fans_count >= cut_off) & (other_before_fans_count < cut_off) ~ "Big vs Small",
        (before_fans_count >= cut_off) & (other_before_fans_count >= cut_off) ~ "Big vs Big"
      )) %>%
      mutate(fans_piar_cat_temp = relevel(factor(fans_piar_cat_temp), ref = "Small vs Small"))
    
    # 1.计算percentage 
    pct_summary <- data %>%
      count(fans_piar_cat_temp) %>%
      mutate(pct = n / sum(n) * 100)
    
    pct_matrix[i, ] <- pct_summary %>%
      complete(fans_piar_cat_temp = c("Big vs Big", "Small vs Big", "Big vs Small", "Small vs Small"), fill = list(pct = 0)) %>%
      pull(pct)
    
    # 2.计算每个model的系数
    model <- feols(model_formula, data = data,vcov = ~author_id)
    
    coef_summary <- summary(model)$coefficients
    
    # print(coef_summary)
    # Store the coefficients for the three categories in the matrix
    coefficients_matrix[i, ] <- coef_summary[c("factor(fans_piar_cat_temp)Small vs Big", 
                                               "factor(fans_piar_cat_temp)Big vs Small", 
                                               "factor(fans_piar_cat_temp)Big vs Big")]  # "(Intercept)")]
  }
    #print(coefficients_matrix)
  
  ## 1.画出四组percentage图
  pct_df <- data.frame(
    Cutoff = cutoffs,
    Big_vs_Big = pct_matrix[, 1],
    Big_vs_Small = pct_matrix[, 2],
    Small_vs_Big = pct_matrix[, 3],
    Small_vs_Small = pct_matrix[, 4]
  )
  
  pct_df_long <- pct_df %>%
    pivot_longer(cols = c("Big_vs_Big", "Big_vs_Small","Small_vs_Big","Small_vs_Small"),
                 names_to = "Category", values_to = "Percentage")
  
  # print(pct_df_long)
  # 画图
  pct_plot <- ggplot(pct_df_long, aes(x = Cutoff, y = Percentage, color = Category)) +
    geom_line() +
    scale_x_log10() +  # Logarithmic scale for the cut-off
    labs(title = "Distribution of fans_piar_cat across Cutoffs",
         x = "Cutoff (log scale)", y = "Percentage",
         color = "Category") +
    theme_minimal()
  
  # print(pct_plot)
  
  ## 2.画出model coef图
  coef_df <- data.frame(
    Cutoff = cutoffs,
    Small_vs_Big = coefficients_matrix[, 1],
    Big_vs_Small = coefficients_matrix[, 2],
    Big_vs_Big = coefficients_matrix[, 3]
    #Small_vs_Small = coefficients_matrix[, 4]
  )
  
  coef_df_long <- coef_df %>%
    pivot_longer(cols = c("Small_vs_Big", "Big_vs_Small", "Big_vs_Big"),
                 names_to = "Category", values_to = "Coefficient")
  
  ## 画图
  coef_plot <- ggplot(coef_df_long, aes(x = Cutoff, y = Coefficient, color = Category)) +
    geom_line() +
    scale_x_log10() +  # Logarithmic scale for the cut-off
    labs(title = "Evolution of Coefficients for fans_piar_cat across Cutoffs",
         x = "Cutoff (log scale)", y = "Coefficient",
         color = "Category") +
    theme_minimal()
  
  print(coef_plot)
}

2.regression

(1) 查看打赏是否来自于现有直播间的viewer

data$percent_of_gifter = (data$original_gifter_cnt)/(data$pk_gifter_cnt)
summary(data$percent_of_gifter)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.333   1.000   0.710   1.000   1.000    8337
library(ggplot2)

# 假设你有一个变量 'x',可以用直方图展示其分布
ggplot(data, aes(x = percent_of_gifter)) +
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of x", x = "X values", y = "Frequency")
## Warning: Removed 8337 rows containing non-finite outside the scale range
## (`stat_bin()`).

## (2) DV:掉viewer:log(data$a_to_b_user_count+1) ### (1) regression

# 自己的viewer跑走
model <- feols(log(a_to_b_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.114304   0.021472  5.323314
## factor(fans_piar_cat)Big vs Small                 0.184777   0.028587  6.463749
## factor(fans_piar_cat)Big vs Big                   0.259616   0.025856 10.040872
## is_cooperative                                    0.012229   0.006726  1.818109
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.060407   0.044967  1.343366
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.032796   0.041314 -0.793814
## factor(fans_piar_cat)Big vs Big:is_cooperative   -0.067022   0.037599 -1.782556
##                                                    Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                1.0431e-07 ***
## factor(fans_piar_cat)Big vs Small                1.0735e-10 ***
## factor(fans_piar_cat)Big vs Big                   < 2.2e-16 ***
## is_cooperative                                   6.9080e-02 .  
## factor(fans_piar_cat)Small vs Big:is_cooperative 1.7919e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 4.2732e-01    
## factor(fans_piar_cat)Big vs Big:is_cooperative   7.4692e-02 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.332662     Adj. R2: 0.054593
##                  Within R2: 0.054838

(2) plot both

# 画图
model <- feols(log(a_to_b_user_count+1)  ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## (Intercept)                                       0.070916   0.003637 19.499666
## factor(fans_piar_cat)Small vs Big                 0.117487   0.021393  5.491934
## factor(fans_piar_cat)Big vs Small                 0.184024   0.028611  6.431839
## factor(fans_piar_cat)Big vs Big                   0.259789   0.025756 10.086484
## is_cooperative                                    0.012075   0.006628  1.821677
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.057657   0.044608  1.292510
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.030939   0.041121 -0.752378
## factor(fans_piar_cat)Big vs Big:is_cooperative   -0.068796   0.037510 -1.834077
##                                                    Pr(>|t|)    
## (Intercept)                                       < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big                4.0819e-08 ***
## factor(fans_piar_cat)Big vs Small                1.3237e-10 ***
## factor(fans_piar_cat)Big vs Big                   < 2.2e-16 ***
## is_cooperative                                   6.8537e-02 .  
## factor(fans_piar_cat)Small vs Big:is_cooperative 1.9621e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 4.5184e-01    
## factor(fans_piar_cat)Big vs Big:is_cooperative   6.6675e-02 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.333764   Adj. R2: 0.054332
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(3) plot fans

# 画图
model <- feols(log(a_to_b_user_count+1)  ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                   Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)                       0.074737   0.003044 24.54987 < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 0.138909   0.019828  7.00580 2.629e-12 ***
## factor(fans_piar_cat)Big vs Small 0.173374   0.021210  8.17400 3.380e-16 ***
## factor(fans_piar_cat)Big vs Big   0.234740   0.019418 12.08879 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.334015   Adj. R2: 0.053302
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(4) plot category

# 画图
model <- feols(log(a_to_b_user_count+1)  ~ is_cooperative, data = data,vcov = ~author_id)

summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,670
## Standard-errors: Clustered (author_id) 
##                Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)    0.110034   0.004278 25.72329 < 2.2e-16 ***
## is_cooperative 0.015046   0.007514  2.00249  0.045261 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.343191   Adj. R2: 3.195e-4
plot_predictions_cate(model, data)

(6) plot_fan_pair_heatmap

library(ggplot2)
library(dplyr)
library(RColorBrewer)

# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
  before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
  before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
  before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
  before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

data <- data %>% mutate(other_fans_new_range = case_when(
  other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
  other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
  other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
  other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

# Step 2: Set the correct order for the categories
data <- data %>%
  mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
         other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))

# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
  group_by(fans_new_range, other_fans_new_range) %>%
  summarize(
    count = n(),  # 计算每个组合的数量
    Y = ifelse(count > 0, mean(log(a_to_b_user_count+1) , na.rm = TRUE), NA),  # 如果有数据,计算均值;否则设为NA
    ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA),  # 如果有数据,计算CI;否则设为NA
    ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA)   # 如果有数据,计算CI;否则设为NA
  ) %>%
  ungroup() %>%
  mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
  geom_tile() +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
  scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"), 
                     values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)), 
                     #limits = c(0, 0.2), 
                     na.value = "white") + 
  labs(title = "Average Fan Total Cost Amount by Fan Ranges",
       x = "Fans Range",
       y = "Other Fans Range",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

(3) DV:吸viewer:log(b_to_a_user_count+1)

# 别人的viewer跑来
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.065159   0.021448  3.038017
## factor(fans_piar_cat)Big vs Small                 0.069651   0.019270  3.614489
## factor(fans_piar_cat)Big vs Big                   0.154335   0.020542  7.513207
## is_cooperative                                    0.002334   0.004242  0.550192
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.016593   0.029754  0.557663
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.033771   0.025369 -1.331196
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.002425   0.035324  0.068653
##                                                    Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                2.3881e-03 ** 
## factor(fans_piar_cat)Big vs Small                3.0255e-04 ***
## factor(fans_piar_cat)Big vs Big                  6.3109e-14 ***
## is_cooperative                                   5.8220e-01    
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.7709e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 1.8316e-01    
## factor(fans_piar_cat)Big vs Big:is_cooperative   9.4527e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.241592     Adj. R2: 0.03464 
##                  Within R2: 0.035915

(1) regression

# 自己的viewer跑走
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.065159   0.021448  3.038017
## factor(fans_piar_cat)Big vs Small                 0.069651   0.019270  3.614489
## factor(fans_piar_cat)Big vs Big                   0.154335   0.020542  7.513207
## is_cooperative                                    0.002334   0.004242  0.550192
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.016593   0.029754  0.557663
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.033771   0.025369 -1.331196
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.002425   0.035324  0.068653
##                                                    Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                2.3881e-03 ** 
## factor(fans_piar_cat)Big vs Small                3.0255e-04 ***
## factor(fans_piar_cat)Big vs Big                  6.3109e-14 ***
## is_cooperative                                   5.8220e-01    
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.7709e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 1.8316e-01    
## factor(fans_piar_cat)Big vs Big:is_cooperative   9.4527e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.241592     Adj. R2: 0.03464 
##                  Within R2: 0.035915

(2) plot both

# 画图
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## (Intercept)                                       0.022434   0.002306  9.728907
## factor(fans_piar_cat)Small vs Big                 0.066886   0.021397  3.125988
## factor(fans_piar_cat)Big vs Small                 0.068487   0.019193  3.568331
## factor(fans_piar_cat)Big vs Big                   0.154916   0.020531  7.545473
## is_cooperative                                    0.001818   0.004111  0.442351
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.016021   0.029638  0.540534
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.031047   0.025214 -1.231328
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.001535   0.035305  0.043466
##                                                    Pr(>|t|)    
## (Intercept)                                       < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big                1.7776e-03 ** 
## factor(fans_piar_cat)Big vs Small                3.6110e-04 ***
## factor(fans_piar_cat)Big vs Big                  4.9368e-14 ***
## is_cooperative                                   6.5825e-01    
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.8884e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 2.1823e-01    
## factor(fans_piar_cat)Big vs Big:is_cooperative   9.6533e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.242249   Adj. R2: 0.035511
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(3) plot fans

# 画图
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                   Estimate Std. Error  t value   Pr(>|t|)    
## (Intercept)                       0.023010   0.001906 12.07192  < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 0.072769   0.015910  4.57374 4.8538e-06 ***
## factor(fans_piar_cat)Big vs Small 0.057331   0.013663  4.19600 2.7423e-05 ***
## factor(fans_piar_cat)Big vs Big   0.155596   0.016840  9.23963  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.242286   Adj. R2: 0.035617
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(4) plot category

# 画图
model <- feols(log(b_to_a_user_count+1) ~ is_cooperative, data = data,vcov = ~author_id)

summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,670
## Standard-errors: Clustered (author_id) 
##                Estimate Std. Error   t value  Pr(>|t|)    
## (Intercept)    0.043138   0.003060 14.096094 < 2.2e-16 ***
## is_cooperative 0.005232   0.005306  0.986081   0.32412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.246684   Adj. R2: -4.411e-6
plot_predictions_cate(model, data)

(6) plot_fan_pair_heatmap

library(ggplot2)
library(dplyr)
library(RColorBrewer)

# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
  before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
  before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
  before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
  before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

data <- data %>% mutate(other_fans_new_range = case_when(
  other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
  other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
  other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
  other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

# Step 2: Set the correct order for the categories
data <- data %>%
  mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
         other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))

# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
  group_by(fans_new_range, other_fans_new_range) %>%
  summarize(
    count = n(),  # 计算每个组合的数量
    Y = ifelse(count > 0, mean(log(b_to_a_user_count+1), na.rm = TRUE), NA),  # 如果有数据,计算均值;否则设为NA
    ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA),  # 如果有数据,计算CI;否则设为NA
    ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA)   # 如果有数据,计算CI;否则设为NA
  ) %>%
  ungroup() %>%
  mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
  geom_tile() +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
  scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"), 
                     values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)), 
                     #limits = c(0, 0.2), 
                     na.value = "white") + 
  labs(title = "Average Fan Total Cost Amount by Fan Ranges",
       x = "Fans Range",
       y = "Other Fans Range",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

(4) DV:net吸引viewer:log(net_viewer_change+517)

summary(data$net_viewer_change)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -516.0000    0.0000    0.0000   -0.1611    0.0000  250.0000
# 纯获得的viewer
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.001494   0.001231  1.214193
## factor(fans_piar_cat)Big vs Small                -0.016542   0.016000 -1.033848
## factor(fans_piar_cat)Big vs Big                  -0.000807   0.000369 -2.187275
## is_cooperative                                    0.000011   0.000140  0.080151
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001805   0.001313 -1.375159
## factor(fans_piar_cat)Big vs Small:is_cooperative  0.016107   0.016019  1.005523
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.001391   0.000753  1.847283
##                                                  Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                0.224705    
## factor(fans_piar_cat)Big vs Small                0.301235    
## factor(fans_piar_cat)Big vs Big                  0.028748 *  
## is_cooperative                                   0.936119    
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.169116    
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.314672    
## factor(fans_piar_cat)Big vs Big:is_cooperative   0.064738 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063398     Adj. R2: 0.002967
##                  Within R2: 0.00262

(1) regression

# 自己的viewer跑走
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.001494   0.001231  1.214193
## factor(fans_piar_cat)Big vs Small                -0.016542   0.016000 -1.033848
## factor(fans_piar_cat)Big vs Big                  -0.000807   0.000369 -2.187275
## is_cooperative                                    0.000011   0.000140  0.080151
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001805   0.001313 -1.375159
## factor(fans_piar_cat)Big vs Small:is_cooperative  0.016107   0.016019  1.005523
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.001391   0.000753  1.847283
##                                                  Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                0.224705    
## factor(fans_piar_cat)Big vs Small                0.301235    
## factor(fans_piar_cat)Big vs Big                  0.028748 *  
## is_cooperative                                   0.936119    
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.169116    
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.314672    
## factor(fans_piar_cat)Big vs Big:is_cooperative   0.064738 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063398     Adj. R2: 0.002967
##                  Within R2: 0.00262

(2) plot both

# 画图
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error
## (Intercept)                                       6.247881   0.000015
## factor(fans_piar_cat)Small vs Big                 0.000874   0.001009
## factor(fans_piar_cat)Big vs Small                -0.016648   0.016174
## factor(fans_piar_cat)Big vs Big                  -0.000749   0.000281
## is_cooperative                                   -0.000024   0.000027
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001840   0.001122
## factor(fans_piar_cat)Big vs Small:is_cooperative  0.016241   0.016174
## factor(fans_piar_cat)Big vs Big:is_cooperative    0.000910   0.000357
##                                                        t value  Pr(>|t|)    
## (Intercept)                                      410265.260485 < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big                     0.866220 0.3863921    
## factor(fans_piar_cat)Big vs Small                    -1.029287 0.3033720    
## factor(fans_piar_cat)Big vs Big                      -2.660993 0.0078046 ** 
## is_cooperative                                       -0.902975 0.3665631    
## factor(fans_piar_cat)Small vs Big:is_cooperative     -1.639873 0.1010660    
## factor(fans_piar_cat)Big vs Small:is_cooperative      1.004149 0.3153331    
## factor(fans_piar_cat)Big vs Big:is_cooperative        2.546293 0.0109036 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063633   Adj. R2: 0.001903
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(3) plot fans

# 画图
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Standard-errors: Clustered (author_id) 
##                                    Estimate Std. Error       t value  Pr(>|t|)
## (Intercept)                        6.247873   0.000013 498534.959414 < 2.2e-16
## factor(fans_piar_cat)Small vs Big  0.000206   0.000669      0.308666  0.757583
## factor(fans_piar_cat)Big vs Small -0.010770   0.010323     -1.043312  0.296831
## factor(fans_piar_cat)Big vs Big   -0.000410   0.000195     -2.103332  0.035464
##                                      
## (Intercept)                       ***
## factor(fans_piar_cat)Small vs Big    
## factor(fans_piar_cat)Big vs Small    
## factor(fans_piar_cat)Big vs Big   *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063663   Adj. R2: 0.001363
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

(4) plot category

# 画图
model <- feols(log(net_viewer_change+517) ~ is_cooperative, data = data,vcov = ~author_id)

summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,670
## Standard-errors: Clustered (author_id) 
##                Estimate Std. Error     t value  Pr(>|t|)    
## (Intercept)    6.246885   0.000963 6490.212117 < 2.2e-16 ***
## is_cooperative 0.000891   0.000964    0.924964   0.35501    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063699   Adj. R2: -6.034e-5
plot_predictions_cate(model, data)

(6) plot_fan_pair_heatmap

library(ggplot2)
library(dplyr)
library(RColorBrewer)

# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
  before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
  before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
  before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
  before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

data <- data %>% mutate(other_fans_new_range = case_when(
  other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
  other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
  other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
  other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
  TRUE ~ ">1M"
))

# Step 2: Set the correct order for the categories
data <- data %>%
  mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
         other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))

# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
  group_by(fans_new_range, other_fans_new_range) %>%
  summarize(
    count = n(),  # 计算每个组合的数量
    Y = ifelse(count > 0, mean(log(net_viewer_change+517), na.rm = TRUE), NA),  # 如果有数据,计算均值;否则设为NA
    ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA),  # 如果有数据,计算CI;否则设为NA
    ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA)   # 如果有数据,计算CI;否则设为NA
  ) %>%
  ungroup() %>%
  mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
  geom_tile() +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
  geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
  scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"), 
                     values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)), 
                     #limits = c(0, 0.2), 
                     na.value = "white") + 
  labs(title = "Average Fan Total Cost Amount by Fan Ranges",
       x = "Fans Range",
       y = "Other Fans Range",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))