setwd("C:/Users/ASUS/Desktop/快手")
数据:2022.09.01–2022.11.01 random pk viewer change data
data = read.csv("Random_pk_viewer_change_data_clean.csv")
### fan-size
data = data %>%mutate(fans_piar_cat = case_when(
(before_fans_count>=10000) & (other_before_fans_count>=10000)~ 3,
(before_fans_count>=10000) & (other_before_fans_count<10000)~ 2,
(before_fans_count<10000) & (other_before_fans_count>= 10000)~ 1,
(before_fans_count<10000) & (other_before_fans_count<10000)~ 0
))
data <- data %>%
mutate(fans_piar_cat = factor(fans_piar_cat, level = c(0,1,2,3), labels = c("Small vs Small ", "Small vs Big", "Big vs Small", "Big vs Big")))
### 合作
# (1) 按照直播类型来划分:同类:1
data <- data %>%
mutate(is_cooperative = case_when(
(live_operation_tag == other_live_operation_tag) ~ 1,
TRUE ~ 0
))
table(data$fans_piar_cat)
##
## Small vs Small Small vs Big Big vs Small Big vs Big
## 7597 616 605 847
table(data$is_cooperative)
##
## 0 1
## 6507 3163
# # table(data$total_cost_amt) # 119056 为0
# # (1) 人均粉丝打赏
# data$avg_fan_total_cost_amt= (data$total_cost_amt)/(data$before_fans_count + 1)
# # (2) 涨粉
# # data$follow_author_fans_count
# # (3) 掉粉
# # data$unfollow_author_fans_count
# # (4) net 涨粉
# data$net_follow_fans= data$follow_author_fans_count - data$unfollow_author_fans_count
# summary(data$net_follow_fans)
# # (5) 吸粉
# # data$already_follow_other_fans_count
# # (6) 粉丝被对方吸走
# # data$other_already_follow_other_fans_count
# # (7) net吸粉
# data$net_attract_fans= data$already_follow_other_fans_count - data$other_already_follow_other_fans_count
# summary(data$net_attract_fans)
# (8) 自己的viewer跑到对方直播间(掉viewer)
# data$a_to_b_user_count
# (9) 对方的viewer跑到自己的直播间(吸viewer)
# data$b_to_a_user_count
# (10) net吸来的viewer
data$net_viewer_change= data$b_to_a_user_count - data$a_to_b_user_count
summary(data$net_viewer_change)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -516.0000 0.0000 0.0000 -0.1611 0.0000 250.0000
library(ggplot2)
library(dplyr)
# 创建函数
plot_predictions_with_ci <- function(model_1, data) {
# 预测值和标准误差
predictions <- predict(model_1, newdata = data, se.fit = TRUE)
# 将预测值和标准误差添加到数据框中
data$predicted_y <- predictions$fit
data$se_fit <- predictions$se.fit
# 按照分组变量计算均值和95%的置信区间
summary_data <- data %>%
group_by(fans_piar_cat, is_cooperative) %>%
summarise(
mean_predicted_y = mean(predicted_y),
ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
)
# 绘制图表
plot <- ggplot(summary_data, aes(x = factor(fans_piar_cat), y = mean_predicted_y, color = factor(is_cooperative))) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
labs(
title = "Mean Predicted y Values with Confidence Intervals",
x = "Pair Categories",
y = "Mean Predicted y",
color = "Is Same Category"
) +
theme_minimal()
# 返回图表
return(plot)
}
library(ggplot2)
library(dplyr)
# 创建函数
plot_predictions_fans <- function(model_1, data) {
# 预测值和标准误差
predictions <- predict(model_1, newdata = data, se.fit = TRUE)
# 将预测值和标准误差添加到数据框中
data$predicted_y <- predictions$fit
data$se_fit <- predictions$se.fit
# 按照分组变量计算均值和95%的置信区间
summary_data <- data %>%
group_by(fans_piar_cat) %>%
summarise(
mean_predicted_y = mean(predicted_y),
ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
)
# 绘制图表
plot <- ggplot(summary_data, aes(x = factor(fans_piar_cat), y = mean_predicted_y, )) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
labs(
title = "Mean Predicted y Values with Confidence Intervals",
x = "fans Pair",
y = "Mean Predicted y",
color = "Is Same Category"
) +
theme_minimal()
# 返回图表
return(plot)
}
library(ggplot2)
library(dplyr)
# 创建函数
plot_predictions_cate <- function(model_1, data) {
# 预测值和标准误差
predictions <- predict(model_1, newdata = data, se.fit = TRUE)
# 将预测值和标准误差添加到数据框中
data$predicted_y <- predictions$fit
data$se_fit <- predictions$se.fit
# 按照分组变量计算均值和95%的置信区间
summary_data <- data %>%
group_by(is_cooperative) %>%
summarise(
mean_predicted_y = mean(predicted_y),
ci_lower = mean(predicted_y) - 1.96 * mean(se_fit),
ci_upper = mean(predicted_y) + 1.96 * mean(se_fit)
)
# 绘制图表
plot <- ggplot(summary_data, aes(x = factor(is_cooperative), y = mean_predicted_y )) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
labs(
title = "Mean Predicted y Values with Confidence Intervals",
x = "Categories",
y = "Mean Predicted y",
color = "Is Same Category"
) +
theme_minimal()
# 返回图表
return(plot)
}
library(dplyr)
library(ggplot2)
library(scales) # for comma formatting
##
## 载入程序包:'scales'
## The following object is masked from 'package:fixest':
##
## pvalue
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plm::between() masks dplyr::between()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ plm::lag() masks dplyr::lag(), stats::lag()
## ✖ plm::lead() masks dplyr::lead()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 定义函数,输入参数为模型公式
plot_continue_cutoff <- function(data, model_formula) {
# Step 1: Create a sequence of 100 cut-offs from 0.1k to 1000k, log-stepped
cutoffs <- exp(seq(log(100), log(1e6), length.out = 100))
# Create a matrix to store coefficients of factor(fans_piar_cat)
coefficients_matrix <- matrix(NA, nrow = 100, ncol = 3)
# Create a matrix to store the counts of each category
pct_matrix <- matrix(NA, nrow = 100, ncol = 4) # For 4 categories
# Step 2: Loop over each cut-off, create fans_piar_cat, run the regression, and store the coefficients
for (i in seq_along(cutoffs)) {
cut_off <- cutoffs[i]
# Create the fans_piar_cat variable based on cut-off
data <- data %>%
mutate(fans_piar_cat_temp = case_when(
(before_fans_count < cut_off) & (other_before_fans_count < cut_off) ~ "Small vs Small",
(before_fans_count < cut_off) & (other_before_fans_count >= cut_off) ~ "Small vs Big",
(before_fans_count >= cut_off) & (other_before_fans_count < cut_off) ~ "Big vs Small",
(before_fans_count >= cut_off) & (other_before_fans_count >= cut_off) ~ "Big vs Big"
)) %>%
mutate(fans_piar_cat_temp = relevel(factor(fans_piar_cat_temp), ref = "Small vs Small"))
# 1.计算percentage
pct_summary <- data %>%
count(fans_piar_cat_temp) %>%
mutate(pct = n / sum(n) * 100)
pct_matrix[i, ] <- pct_summary %>%
complete(fans_piar_cat_temp = c("Big vs Big", "Small vs Big", "Big vs Small", "Small vs Small"), fill = list(pct = 0)) %>%
pull(pct)
# 2.计算每个model的系数
model <- feols(model_formula, data = data,vcov = ~author_id)
coef_summary <- summary(model)$coefficients
# print(coef_summary)
# Store the coefficients for the three categories in the matrix
coefficients_matrix[i, ] <- coef_summary[c("factor(fans_piar_cat_temp)Small vs Big",
"factor(fans_piar_cat_temp)Big vs Small",
"factor(fans_piar_cat_temp)Big vs Big")] # "(Intercept)")]
}
#print(coefficients_matrix)
## 1.画出四组percentage图
pct_df <- data.frame(
Cutoff = cutoffs,
Big_vs_Big = pct_matrix[, 1],
Big_vs_Small = pct_matrix[, 2],
Small_vs_Big = pct_matrix[, 3],
Small_vs_Small = pct_matrix[, 4]
)
pct_df_long <- pct_df %>%
pivot_longer(cols = c("Big_vs_Big", "Big_vs_Small","Small_vs_Big","Small_vs_Small"),
names_to = "Category", values_to = "Percentage")
# print(pct_df_long)
# 画图
pct_plot <- ggplot(pct_df_long, aes(x = Cutoff, y = Percentage, color = Category)) +
geom_line() +
scale_x_log10() + # Logarithmic scale for the cut-off
labs(title = "Distribution of fans_piar_cat across Cutoffs",
x = "Cutoff (log scale)", y = "Percentage",
color = "Category") +
theme_minimal()
# print(pct_plot)
## 2.画出model coef图
coef_df <- data.frame(
Cutoff = cutoffs,
Small_vs_Big = coefficients_matrix[, 1],
Big_vs_Small = coefficients_matrix[, 2],
Big_vs_Big = coefficients_matrix[, 3]
#Small_vs_Small = coefficients_matrix[, 4]
)
coef_df_long <- coef_df %>%
pivot_longer(cols = c("Small_vs_Big", "Big_vs_Small", "Big_vs_Big"),
names_to = "Category", values_to = "Coefficient")
## 画图
coef_plot <- ggplot(coef_df_long, aes(x = Cutoff, y = Coefficient, color = Category)) +
geom_line() +
scale_x_log10() + # Logarithmic scale for the cut-off
labs(title = "Evolution of Coefficients for fans_piar_cat across Cutoffs",
x = "Cutoff (log scale)", y = "Coefficient",
color = "Category") +
theme_minimal()
print(coef_plot)
}
data$percent_of_gifter = (data$original_gifter_cnt)/(data$pk_gifter_cnt)
summary(data$percent_of_gifter)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.333 1.000 0.710 1.000 1.000 8337
library(ggplot2)
# 假设你有一个变量 'x',可以用直方图展示其分布
ggplot(data, aes(x = percent_of_gifter)) +
geom_histogram(binwidth = 0.1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of x", x = "X values", y = "Frequency")
## Warning: Removed 8337 rows containing non-finite outside the scale range
## (`stat_bin()`).
## (2) DV:掉viewer:log(data$a_to_b_user_count+1) ### (1) regression
# 自己的viewer跑走
model <- feols(log(a_to_b_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.114304 0.021472 5.323314
## factor(fans_piar_cat)Big vs Small 0.184777 0.028587 6.463749
## factor(fans_piar_cat)Big vs Big 0.259616 0.025856 10.040872
## is_cooperative 0.012229 0.006726 1.818109
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.060407 0.044967 1.343366
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.032796 0.041314 -0.793814
## factor(fans_piar_cat)Big vs Big:is_cooperative -0.067022 0.037599 -1.782556
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big 1.0431e-07 ***
## factor(fans_piar_cat)Big vs Small 1.0735e-10 ***
## factor(fans_piar_cat)Big vs Big < 2.2e-16 ***
## is_cooperative 6.9080e-02 .
## factor(fans_piar_cat)Small vs Big:is_cooperative 1.7919e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 4.2732e-01
## factor(fans_piar_cat)Big vs Big:is_cooperative 7.4692e-02 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.332662 Adj. R2: 0.054593
## Within R2: 0.054838
# 画图
model <- feols(log(a_to_b_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## (Intercept) 0.070916 0.003637 19.499666
## factor(fans_piar_cat)Small vs Big 0.117487 0.021393 5.491934
## factor(fans_piar_cat)Big vs Small 0.184024 0.028611 6.431839
## factor(fans_piar_cat)Big vs Big 0.259789 0.025756 10.086484
## is_cooperative 0.012075 0.006628 1.821677
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.057657 0.044608 1.292510
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.030939 0.041121 -0.752378
## factor(fans_piar_cat)Big vs Big:is_cooperative -0.068796 0.037510 -1.834077
## Pr(>|t|)
## (Intercept) < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 4.0819e-08 ***
## factor(fans_piar_cat)Big vs Small 1.3237e-10 ***
## factor(fans_piar_cat)Big vs Big < 2.2e-16 ***
## is_cooperative 6.8537e-02 .
## factor(fans_piar_cat)Small vs Big:is_cooperative 1.9621e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 4.5184e-01
## factor(fans_piar_cat)Big vs Big:is_cooperative 6.6675e-02 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.333764 Adj. R2: 0.054332
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(a_to_b_user_count+1) ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.074737 0.003044 24.54987 < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 0.138909 0.019828 7.00580 2.629e-12 ***
## factor(fans_piar_cat)Big vs Small 0.173374 0.021210 8.17400 3.380e-16 ***
## factor(fans_piar_cat)Big vs Big 0.234740 0.019418 12.08879 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.334015 Adj. R2: 0.053302
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(a_to_b_user_count+1) ~ is_cooperative, data = data,vcov = ~author_id)
summary(model)
## OLS estimation, Dep. Var.: log(a_to_b_user_count + 1)
## Observations: 9,670
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.110034 0.004278 25.72329 < 2.2e-16 ***
## is_cooperative 0.015046 0.007514 2.00249 0.045261 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.343191 Adj. R2: 3.195e-4
plot_predictions_cate(model, data)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
data <- data %>% mutate(other_fans_new_range = case_when(
other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
# Step 2: Set the correct order for the categories
data <- data %>%
mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))
# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
group_by(fans_new_range, other_fans_new_range) %>%
summarize(
count = n(), # 计算每个组合的数量
Y = ifelse(count > 0, mean(log(a_to_b_user_count+1) , na.rm = TRUE), NA), # 如果有数据,计算均值;否则设为NA
ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA), # 如果有数据,计算CI;否则设为NA
ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA) # 如果有数据,计算CI;否则设为NA
) %>%
ungroup() %>%
mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
geom_tile() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"),
values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)),
#limits = c(0, 0.2),
na.value = "white") +
labs(title = "Average Fan Total Cost Amount by Fan Ranges",
x = "Fans Range",
y = "Other Fans Range",
fill = "Avg Fan Total Cost Amt") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 别人的viewer跑来
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.065159 0.021448 3.038017
## factor(fans_piar_cat)Big vs Small 0.069651 0.019270 3.614489
## factor(fans_piar_cat)Big vs Big 0.154335 0.020542 7.513207
## is_cooperative 0.002334 0.004242 0.550192
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.016593 0.029754 0.557663
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.033771 0.025369 -1.331196
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.002425 0.035324 0.068653
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big 2.3881e-03 **
## factor(fans_piar_cat)Big vs Small 3.0255e-04 ***
## factor(fans_piar_cat)Big vs Big 6.3109e-14 ***
## is_cooperative 5.8220e-01
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.7709e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 1.8316e-01
## factor(fans_piar_cat)Big vs Big:is_cooperative 9.4527e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.241592 Adj. R2: 0.03464
## Within R2: 0.035915
# 自己的viewer跑走
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.065159 0.021448 3.038017
## factor(fans_piar_cat)Big vs Small 0.069651 0.019270 3.614489
## factor(fans_piar_cat)Big vs Big 0.154335 0.020542 7.513207
## is_cooperative 0.002334 0.004242 0.550192
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.016593 0.029754 0.557663
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.033771 0.025369 -1.331196
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.002425 0.035324 0.068653
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big 2.3881e-03 **
## factor(fans_piar_cat)Big vs Small 3.0255e-04 ***
## factor(fans_piar_cat)Big vs Big 6.3109e-14 ***
## is_cooperative 5.8220e-01
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.7709e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 1.8316e-01
## factor(fans_piar_cat)Big vs Big:is_cooperative 9.4527e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.241592 Adj. R2: 0.03464
## Within R2: 0.035915
# 画图
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## (Intercept) 0.022434 0.002306 9.728907
## factor(fans_piar_cat)Small vs Big 0.066886 0.021397 3.125988
## factor(fans_piar_cat)Big vs Small 0.068487 0.019193 3.568331
## factor(fans_piar_cat)Big vs Big 0.154916 0.020531 7.545473
## is_cooperative 0.001818 0.004111 0.442351
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.016021 0.029638 0.540534
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.031047 0.025214 -1.231328
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.001535 0.035305 0.043466
## Pr(>|t|)
## (Intercept) < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 1.7776e-03 **
## factor(fans_piar_cat)Big vs Small 3.6110e-04 ***
## factor(fans_piar_cat)Big vs Big 4.9368e-14 ***
## is_cooperative 6.5825e-01
## factor(fans_piar_cat)Small vs Big:is_cooperative 5.8884e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 2.1823e-01
## factor(fans_piar_cat)Big vs Big:is_cooperative 9.6533e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.242249 Adj. R2: 0.035511
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(b_to_a_user_count+1) ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.023010 0.001906 12.07192 < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 0.072769 0.015910 4.57374 4.8538e-06 ***
## factor(fans_piar_cat)Big vs Small 0.057331 0.013663 4.19600 2.7423e-05 ***
## factor(fans_piar_cat)Big vs Big 0.155596 0.016840 9.23963 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.242286 Adj. R2: 0.035617
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(b_to_a_user_count+1) ~ is_cooperative, data = data,vcov = ~author_id)
summary(model)
## OLS estimation, Dep. Var.: log(b_to_a_user_count + 1)
## Observations: 9,670
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.043138 0.003060 14.096094 < 2.2e-16 ***
## is_cooperative 0.005232 0.005306 0.986081 0.32412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.246684 Adj. R2: -4.411e-6
plot_predictions_cate(model, data)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
data <- data %>% mutate(other_fans_new_range = case_when(
other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
# Step 2: Set the correct order for the categories
data <- data %>%
mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))
# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
group_by(fans_new_range, other_fans_new_range) %>%
summarize(
count = n(), # 计算每个组合的数量
Y = ifelse(count > 0, mean(log(b_to_a_user_count+1), na.rm = TRUE), NA), # 如果有数据,计算均值;否则设为NA
ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA), # 如果有数据,计算CI;否则设为NA
ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA) # 如果有数据,计算CI;否则设为NA
) %>%
ungroup() %>%
mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
geom_tile() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"),
values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)),
#limits = c(0, 0.2),
na.value = "white") +
labs(title = "Average Fan Total Cost Amount by Fan Ranges",
x = "Fans Range",
y = "Other Fans Range",
fill = "Avg Fan Total Cost Amt") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
summary(data$net_viewer_change)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -516.0000 0.0000 0.0000 -0.1611 0.0000 250.0000
# 纯获得的viewer
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.001494 0.001231 1.214193
## factor(fans_piar_cat)Big vs Small -0.016542 0.016000 -1.033848
## factor(fans_piar_cat)Big vs Big -0.000807 0.000369 -2.187275
## is_cooperative 0.000011 0.000140 0.080151
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001805 0.001313 -1.375159
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.016107 0.016019 1.005523
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.001391 0.000753 1.847283
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big 0.224705
## factor(fans_piar_cat)Big vs Small 0.301235
## factor(fans_piar_cat)Big vs Big 0.028748 *
## is_cooperative 0.936119
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.169116
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.314672
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.064738 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063398 Adj. R2: 0.002967
## Within R2: 0.00262
# 自己的viewer跑走
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative |p_date, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5, Fixed-effects: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Fixed-effects: p_date: 62
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.001494 0.001231 1.214193
## factor(fans_piar_cat)Big vs Small -0.016542 0.016000 -1.033848
## factor(fans_piar_cat)Big vs Big -0.000807 0.000369 -2.187275
## is_cooperative 0.000011 0.000140 0.080151
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001805 0.001313 -1.375159
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.016107 0.016019 1.005523
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.001391 0.000753 1.847283
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big 0.224705
## factor(fans_piar_cat)Big vs Small 0.301235
## factor(fans_piar_cat)Big vs Big 0.028748 *
## is_cooperative 0.936119
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.169116
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.314672
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.064738 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063398 Adj. R2: 0.002967
## Within R2: 0.00262
# 画图
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat) + is_cooperative + factor(fans_piar_cat)*is_cooperative, data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error
## (Intercept) 6.247881 0.000015
## factor(fans_piar_cat)Small vs Big 0.000874 0.001009
## factor(fans_piar_cat)Big vs Small -0.016648 0.016174
## factor(fans_piar_cat)Big vs Big -0.000749 0.000281
## is_cooperative -0.000024 0.000027
## factor(fans_piar_cat)Small vs Big:is_cooperative -0.001840 0.001122
## factor(fans_piar_cat)Big vs Small:is_cooperative 0.016241 0.016174
## factor(fans_piar_cat)Big vs Big:is_cooperative 0.000910 0.000357
## t value Pr(>|t|)
## (Intercept) 410265.260485 < 2.2e-16 ***
## factor(fans_piar_cat)Small vs Big 0.866220 0.3863921
## factor(fans_piar_cat)Big vs Small -1.029287 0.3033720
## factor(fans_piar_cat)Big vs Big -2.660993 0.0078046 **
## is_cooperative -0.902975 0.3665631
## factor(fans_piar_cat)Small vs Big:is_cooperative -1.639873 0.1010660
## factor(fans_piar_cat)Big vs Small:is_cooperative 1.004149 0.3153331
## factor(fans_piar_cat)Big vs Big:is_cooperative 2.546293 0.0109036 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063633 Adj. R2: 0.001903
plot_predictions_with_ci(model, data)
## `summarise()` has grouped output by 'fans_piar_cat'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(net_viewer_change+517) ~ factor(fans_piar_cat), data = data,vcov = ~author_id)
## NOTE: 5 observations removed because of NA values (RHS: 5).
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,665
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.247873 0.000013 498534.959414 < 2.2e-16
## factor(fans_piar_cat)Small vs Big 0.000206 0.000669 0.308666 0.757583
## factor(fans_piar_cat)Big vs Small -0.010770 0.010323 -1.043312 0.296831
## factor(fans_piar_cat)Big vs Big -0.000410 0.000195 -2.103332 0.035464
##
## (Intercept) ***
## factor(fans_piar_cat)Small vs Big
## factor(fans_piar_cat)Big vs Small
## factor(fans_piar_cat)Big vs Big *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063663 Adj. R2: 0.001363
plot_predictions_fans(model, data)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# 画图
model <- feols(log(net_viewer_change+517) ~ is_cooperative, data = data,vcov = ~author_id)
summary(model)
## OLS estimation, Dep. Var.: log(net_viewer_change + 517)
## Observations: 9,670
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.246885 0.000963 6490.212117 < 2.2e-16 ***
## is_cooperative 0.000891 0.000964 0.924964 0.35501
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.063699 Adj. R2: -6.034e-5
plot_predictions_cate(model, data)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
# Step 1: Set the ranges for the fan counts (fewer, broader bins)
data <- data %>% mutate(fans_new_range = case_when(
before_fans_count > 0 & before_fans_count <= 1000 ~ "0-1k",
before_fans_count > 1000 & before_fans_count <= 10000 ~ "1k-10k",
before_fans_count > 10000 & before_fans_count <= 100000 ~ "10k-100k",
before_fans_count > 100000 & before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
data <- data %>% mutate(other_fans_new_range = case_when(
other_before_fans_count > 0 & other_before_fans_count <= 1000 ~ "0-1k",
other_before_fans_count > 1000 & other_before_fans_count <= 10000 ~ "1k-10k",
other_before_fans_count > 10000 & other_before_fans_count <= 100000 ~ "10k-100k",
other_before_fans_count > 100000 & other_before_fans_count <= 1000000 ~ "100k-1M",
TRUE ~ ">1M"
))
# Step 2: Set the correct order for the categories
data <- data %>%
mutate(fans_new_range = factor(fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")),
other_fans_new_range = factor(other_fans_new_range, levels = c("0-1k", "1k-10k", "10k-100k", "100k-1M", ">1M")))
# Step 3: Aggregate the data to calculate the mean and confidence intervals, handling 0% cases
agg_data <- data %>%
group_by(fans_new_range, other_fans_new_range) %>%
summarize(
count = n(), # 计算每个组合的数量
Y = ifelse(count > 0, mean(log(net_viewer_change+517), na.rm = TRUE), NA), # 如果有数据,计算均值;否则设为NA
ci_lower = ifelse(count > 0, Y - qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA), # 如果有数据,计算CI;否则设为NA
ci_upper = ifelse(count > 0, Y + qt(0.975, n()) * sd(Y, na.rm = TRUE) / sqrt(n()), NA) # 如果有数据,计算CI;否则设为NA
) %>%
ungroup() %>%
mutate(percentage = count / sum(count) * 100)
## `summarise()` has grouped output by 'fans_new_range'. You can override using
## the `.groups` argument.
# Step 4: Plot the data in a heatmap-like format with custom colors and percentage labels
ggplot(agg_data, aes(x = fans_new_range, y = other_fans_new_range, fill = Y)) +
geom_tile() +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2) +
geom_text(aes(label = paste0(round(percentage, 2), "%")), color = "black", size = 4) +
scale_fill_gradientn(colors = c("yellow", "orange", "darkorange", "red", "darkred"),
values = scales::rescale(c(0, 0.05, 0.1, 0.15, 0.2)),
#limits = c(0, 0.2),
na.value = "white") +
labs(title = "Average Fan Total Cost Amount by Fan Ranges",
x = "Fans Range",
y = "Other Fans Range",
fill = "Avg Fan Total Cost Amt") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))