# Directory that holds the input CSV. Defaults to the original author's local
# folder; set the PK_DATA_DIR environment variable to run on another machine
# without editing the script.
data_dir <- Sys.getenv("PK_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "C:/Users/ASUS/Desktop/快手"
}
setwd(data_dir)

1.read data

# Load the live-stream data set. read_csv() is namespace-qualified because no
# library() call attaching readr is visible before this point.
data <- readr::read_csv("Author_live_data.csv")

# Rename columns: sub() strips the leading "author_live_downsample." prefix
# from every column name; names without that prefix are left unchanged.
new_column_names <- sub("^author_live_downsample\\.", "", names(data))
names(data) <- new_column_names

查看数据结构

# Show the column names after the prefix has been stripped.
colnames(data)

##  [1] "author_id"               "live_id"                
##  [3] "p_date"                  "is_pk_live"             
##  [5] "valid_play_duration"     "valid_play_user_num"    
##  [7] "avg_valid_play_duration" "total_cost_amt"         
##  [9] "total_cost_user_num"     "avg_total_cost_amt"     
## [11] "comment_cnt"             "comment_user_num"       
## [13] "avg_comment_cnt"         "like_cnt"               
## [15] "like_user_num"           "avg_like_cnt"           
## [17] "share_success_cnt"       "share_success_user_num" 
## [19] "avg_share_success_cnt"   "follow_author_cnt"      
## [21] "gender"                  "age_range"              
## [23] "fre_country_region"      "fre_city_level"         
## [25] "fans_user_num"           "follow_user_num"        
## [27] "reg_day_cnt"             "is_big_v"               
## [29] "author_type"             "live_type"              
## [31] "is_shop_car_live"        "is_paid_show_live"      
## [33] "live_content_category"

# Preview the first rows of the data.
head(data)

## # A tibble: 6 × 33
##    author_id   live_id p_date is_pk_live valid_play_duration valid_play_user_num
##        <dbl>     <dbl>  <dbl>      <dbl>               <dbl>               <dbl>
## 1 1926380389    9.44e9 2.02e7          0               11945                   5
## 2 1804013348    9.47e9 2.02e7          0            15520318                 246
## 3 1797821702    9.47e9 2.02e7          1             1015002                  17
## 4 2081027986    9.44e9 2.02e7          0               17058                   3
## 5  931438415    9.44e9 2.02e7          0              574724                  31
## 6  230462094    9.45e9 2.02e7          1              604570                  23
## # ℹ 27 more variables: avg_valid_play_duration <dbl>, total_cost_amt <dbl>,
## #   total_cost_user_num <dbl>, avg_total_cost_amt <dbl>, comment_cnt <dbl>,
## #   comment_user_num <dbl>, avg_comment_cnt <dbl>, like_cnt <dbl>,
## #   like_user_num <dbl>, avg_like_cnt <dbl>, share_success_cnt <dbl>,
## #   share_success_user_num <dbl>, avg_share_success_cnt <dbl>,
## #   follow_author_cnt <dbl>, gender <chr>, age_range <chr>,
## #   fre_country_region <chr>, fre_city_level <chr>, fans_user_num <dbl>, …

# Per-column summary statistics.
summary(data)

##    author_id            live_id              p_date           is_pk_live    
##  Min.   :3.000e+00   Min.   :9.427e+09   Min.   :20220421   Min.   :0.0000  
##  1st Qu.:5.112e+08   1st Qu.:9.443e+09   1st Qu.:20220423   1st Qu.:0.0000  
##  Median :9.324e+08   Median :9.452e+09   Median :20220425   Median :0.0000  
##  Mean   :1.095e+09   Mean   :9.452e+09   Mean   :20220425   Mean   :0.1514  
##  3rd Qu.:1.499e+09   3rd Qu.:9.461e+09   3rd Qu.:20220427   3rd Qu.:0.0000  
##  Max.   :2.859e+09   Max.   :9.470e+09   Max.   :20220429   Max.   :1.0000  
##  valid_play_duration valid_play_user_num avg_valid_play_duration
##  Min.   :0.000e+00   Min.   :      0     Min.   :       0       
##  1st Qu.:3.133e+04   1st Qu.:      4     1st Qu.:    6119       
##  Median :7.069e+05   Median :     22     Median :   25499       
##  Mean   :1.104e+08   Mean   :    784     Mean   :   62373       
##  3rd Qu.:6.667e+06   3rd Qu.:    100     3rd Qu.:   67457       
##  Max.   :1.861e+12   Max.   :5845708     Max.   :21966802       
##  total_cost_amt      total_cost_user_num avg_total_cost_amt  comment_cnt      
##  Min.   :   -575.0   Min.   :    0.000   Min.   : -574.99   Min.   :     0.0  
##  1st Qu.:      0.0   1st Qu.:    0.000   1st Qu.:    0.00   1st Qu.:     0.0  
##  Median :      0.0   Median :    0.000   Median :    0.00   Median :     1.0  
##  Mean   :    238.8   Mean   :    2.585   Mean   :   23.59   Mean   :    50.4  
##  3rd Qu.:      1.0   3rd Qu.:    1.000   3rd Qu.:    1.00   3rd Qu.:    14.0  
##  Max.   :2665249.0   Max.   :18918.000   Max.   :99999.00   Max.   :800939.0  
##  comment_user_num   avg_comment_cnt      like_cnt        like_user_num      
##  Min.   :    0.00   Min.   :  0.000   Min.   :       0   Min.   :     0.00  
##  1st Qu.:    0.00   1st Qu.:  0.000   1st Qu.:       0   1st Qu.:     0.00  
##  Median :    1.00   Median :  1.000   Median :       2   Median :     1.00  
##  Mean   :   11.62   Mean   :  2.209   Mean   :    1364   Mean   :    15.56  
##  3rd Qu.:    4.00   3rd Qu.:  3.000   3rd Qu.:      92   3rd Qu.:     5.00  
##  Max.   :74953.00   Max.   :382.996   Max.   :16165318   Max.   :113578.00  
##   avg_like_cnt      share_success_cnt  share_success_user_num
##  Min.   :     0.0   Min.   :    0.00   Min.   :    0.000     
##  1st Qu.:     0.0   1st Qu.:    0.00   1st Qu.:    0.000     
##  Median :     1.5   Median :    0.00   Median :    0.000     
##  Mean   :    32.2   Mean   :    3.78   Mean   :    1.835     
##  3rd Qu.:    16.6   3rd Qu.:    0.00   3rd Qu.:    0.000     
##  Max.   :594909.1   Max.   :36956.00   Max.   :14619.000     
##  avg_share_success_cnt follow_author_cnt     gender           age_range        
##  Min.   :  0.0000      Min.   :    0.00   Length:958541      Length:958541     
##  1st Qu.:  0.0000      1st Qu.:    0.00   Class :character   Class :character  
##  Median :  0.0000      Median :    0.00   Mode  :character   Mode  :character  
##  Mean   :  0.2367      Mean   :    9.41                                        
##  3rd Qu.:  0.0000      3rd Qu.:    1.00                                        
##  Max.   :207.9979      Max.   :42538.00                                        
##  fre_country_region fre_city_level     fans_user_num      follow_user_num 
##  Length:958541      Length:958541      Min.   :       0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:     267   1st Qu.: 107.0  
##  Mode  :character   Mode  :character   Median :     777   Median : 423.0  
##                                        Mean   :   20572   Mean   : 930.6  
##                                        3rd Qu.:    2809   3rd Qu.:1225.0  
##                                        Max.   :40103191   Max.   :5033.0  
##   reg_day_cnt      is_big_v         author_type          live_type    
##  Min.   :   0   Min.   :0.0000000   Length:958541      Min.   :1.000  
##  1st Qu.: 951   1st Qu.:0.0000000   Class :character   1st Qu.:1.000  
##  Median :1459   Median :0.0000000   Mode  :character   Median :1.000  
##  Mean   :1374   Mean   :0.0002326                      Mean   :1.461  
##  3rd Qu.:1809   3rd Qu.:0.0000000                      3rd Qu.:2.000  
##  Max.   :3870   Max.   :1.0000000                      Max.   :3.000  
##  is_shop_car_live  is_paid_show_live live_content_category
##  Min.   :0.00000   Min.   :0         Length:958541        
##  1st Qu.:0.00000   1st Qu.:0         Class :character     
##  Median :0.00000   Median :0         Mode  :character     
##  Mean   :0.05704   Mean   :0                              
##  3rd Qu.:0.00000   3rd Qu.:0                              
##  Max.   :1.00000   Max.   :0

# Total number of rows (one entry of is_pk_live per row).
print(nrow(data))

## [1] 958541

# Number of distinct author_id values.
distinct_authors <- unique(data[["author_id"]])
print(length(distinct_authors))

## [1] 719070

# Number of rows flagged as PK lives (is_pk_live equal to 1).
sum(data[["is_pk_live"]] == 1)

## [1] 145132

Data结构：Author_id数：719070；直播记录数(数据行数)：958541；有PK的直播记录数：145132

2.处理变量

2.1 对于每个author按照p_date的顺序对于live_id进行排序

# Number each author's live sessions chronologically: rank by p_date first and
# use live_id only to break ties within the same date. (Ranking by live_id
# alone ignored p_date altogether.) order(order(...)) turns the sort
# permutation into per-row ranks, so the row order of `data` is left untouched.
data <- data %>%
  group_by(author_id) %>%
  mutate(live_id_order = order(order(p_date, live_id))) %>%
  ungroup()

# Spot check: pull every session of one author to inspect the assigned order.
author_rows <- data %>%
  filter(author_id == 11318492)

author_rows

## # A tibble: 137 × 34
##    author_id   live_id p_date is_pk_live valid_play_duration valid_play_user_num
##        <dbl>     <dbl>  <dbl>      <dbl>               <dbl>               <dbl>
##  1  11318492    9.44e9 2.02e7          0                   0                   0
##  2  11318492    9.45e9 2.02e7          0                   0                   0
##  3  11318492    9.44e9 2.02e7          0                   0                   0
##  4  11318492    9.44e9 2.02e7          0                   0                   0
##  5  11318492    9.44e9 2.02e7          0               41342                   2
##  6  11318492    9.44e9 2.02e7          0                   0                   0
##  7  11318492    9.44e9 2.02e7          0                   0                   0
##  8  11318492    9.44e9 2.02e7          0                   0                   0
##  9  11318492    9.45e9 2.02e7          0                   0                   0
## 10  11318492    9.45e9 2.02e7          0                   0                   0
## # ℹ 127 more rows
## # ℹ 28 more variables: avg_valid_play_duration <dbl>, total_cost_amt <dbl>,
## #   total_cost_user_num <dbl>, avg_total_cost_amt <dbl>, comment_cnt <dbl>,
## #   comment_user_num <dbl>, avg_comment_cnt <dbl>, like_cnt <dbl>,
## #   like_user_num <dbl>, avg_like_cnt <dbl>, share_success_cnt <dbl>,
## #   share_success_user_num <dbl>, avg_share_success_cnt <dbl>,
## #   follow_author_cnt <dbl>, gender <chr>, age_range <chr>, …

# Largest per-author session number, i.e. the highest session count of any single author.
max(data$live_id_order)

## [1] 137

Data 结构：多人PK功能上线日期：20220425；date(前后各4天，一共9天的数据)：20220421–20220429；Author_id数：719070；直播记录数(数据行数)：958541；有PK的直播记录数：145132。对于全部Author来说，9天内直播次数最多的Author有137场直播；Author对应的直播场次(live_id_order)：1–137

2.2 处理categorical variable

处理类别变量函数

# Encode a categorical column as 0-based numeric codes.
#
# Adds a column named "<column>_number" to `data`. By default the codes follow
# the order in which categories first appear in the column (first category
# seen -> 0, second -> 1, ...), so the mapping depends on row order and is NOT
# tied to particular labels; NA is treated as a category of its own. Pass
# `levels` to pin the mapping explicitly, e.g. levels = c("F", "M") gives
# "F" -> 0 and "M" -> 1; values not listed in `levels` are then coded NA.
#
# Args:
#   data:   data frame / tibble containing `column`.
#   column: name of the column to encode (a single string).
#   levels: optional vector of category values in the desired code order.
# Returns:
#   `data` with the additional numeric column "<column>_number".
assign_numbers <- function(data, column, levels = NULL) {
  if (!is.character(column) || length(column) != 1L ||
      !column %in% names(data)) {
    stop(
      "`column` must be the name of a single existing column of `data`.",
      call. = FALSE
    )
  }
  values <- data[[column]]
  categories <- if (is.null(levels)) unique(values) else levels
  # match() is 1-based; subtract 1 so that the codes start at 0.
  data[[paste0(column, "_number")]] <- match(values, categories) - 1
  data
}

性别：gender

# Encode gender as numeric codes, then list the distinct codes produced.
data <- data %>% assign_numbers("gender")
unique(data[["gender_number"]])

## [1] 0 1 2

年龄：age_range

# Encode age_range as numeric codes, then list the distinct codes produced.
data <- data %>% assign_numbers("age_range")
unique(data[["age_range_number"]])

## [1] 0 1 2 3 4 5 6 7

地区：fre_country_region

# Encode fre_country_region as numeric codes, then list the distinct codes.
data <- data %>% assign_numbers("fre_country_region")
unique(data[["fre_country_region_number"]])

## [1] 0 1 2

城市：fre_city_level

# Encode fre_city_level as numeric codes, then list the distinct codes.
data <- data %>% assign_numbers("fre_city_level")
unique(data[["fre_city_level_number"]])

## [1] 0 1 2 3 4 5 6

作者类型：author_type

# Encode author_type as numeric codes, then list the distinct codes produced.
data <- data %>% assign_numbers("author_type")
unique(data[["author_type_number"]])

## [1] 0 1 2 3

直播内容类型：live_content_category

# Encode live_content_category as numeric codes, then list the distinct codes.
data <- data %>% assign_numbers("live_content_category")
unique(data[["live_content_category_number"]])

## [1] 0 1

3.Panel data regression - TWFE

3.1 固定两个: author_id fixed, live_sequence fixed

library(plm)
# Build the panel data frame: author_id is the individual index and
# live_id_order (the per-author session number) is the time index.
pdata <- pdata.frame(
  data,
  index = c("author_id", "live_id_order")
)

(1) Y:直播间打赏-total_cost_amt

# Two-way within (fixed-effects) regression of total_cost_amt on is_pk_live.
model <- plm(
  total_cost_amt ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = total_cost_amt ~ is_pk_live, data = pdata, effect = "twoways", 
##     model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median 3rd Qu.    Max. 
## -833647       0       0       0  833647 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  590.741     41.848  14.117 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    6.3962e+12
## Residual Sum of Squares: 6.3908e+12
## R-Squared:      0.00083193
## Adj. R-Squared: -3.0017
## F-statistic: 199.275 on 1 and 239334 DF, p-value: < 2.22e-16

显著+

(2) Y:直播间人均观看时长-avg_valid_play_duration

# Two-way within regression of avg_valid_play_duration on is_pk_live.
model <- plm(
  avg_valid_play_duration ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_valid_play_duration ~ is_pk_live, data = pdata, 
##     effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -1.0899e+07 -3.6380e-12  0.0000e+00  1.8190e-12  1.2614e+07 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  18130.3     1076.6   16.84 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4.235e+15
## Residual Sum of Squares: 4.23e+15
## R-Squared:      0.0011835
## Adj. R-Squared: -3.0003
## F-statistic: 283.59 on 1 and 239334 DF, p-value: < 2.22e-16

显著+

(3) Y:直播间人均互动

人均评论:avg_comment_cnt

# Two-way within regression of avg_comment_cnt on is_pk_live.
model <- plm(
  avg_comment_cnt ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_comment_cnt ~ is_pk_live, data = pdata, effect = "twoways", 
##     model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median 3rd Qu.    Max. 
## -155.49    0.00    0.00    0.00  183.09 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live 1.786546   0.028278  63.179 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    2966800
## Residual Sum of Squares: 2918100
## R-Squared:      0.016404
## Adj. R-Squared: -2.9393
## F-statistic: 3991.58 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

人均点赞;avg_like_cnt

# Two-way within regression of avg_like_cnt on is_pk_live.
model <- plm(
  avg_like_cnt ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_like_cnt ~ is_pk_live, data = pdata, effect = "twoways", 
##     model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -1.9857e+05 -6.6613e-16  0.0000e+00  1.1102e-15  3.9600e+05 
## 
## Coefficients:
##            Estimate Std. Error t-value Pr(>|t|)  
## is_pk_live  16.4931     8.1063  2.0346  0.04189 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    2.3981e+11
## Residual Sum of Squares: 2.3981e+11
## R-Squared:      1.7296e-05
## Adj. R-Squared: -3.005
## F-statistic: 4.13956 on 1 and 239334 DF, p-value: 0.041893

结论：在5%水平显著+(p = 0.042)，在1%水平不显著

人均转发：avg_share_success_cnt

# Two-way within regression of avg_share_success_cnt on is_pk_live.
model <- plm(
  avg_share_success_cnt ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_share_success_cnt ~ is_pk_live, data = pdata, 
##     effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median 3rd Qu.    Max. 
## -42.402   0.000   0.000   0.000 165.599 
## 
## Coefficients:
##             Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live 0.1375878  0.0055478  24.801 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    112610
## Residual Sum of Squares: 112320
## R-Squared:      0.0025633
## Adj. R-Squared: -2.9948
## F-statistic: 615.071 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

(4) Y:直播间涨粉：follow_author_cnt

# Two-way within regression of follow_author_cnt on is_pk_live.
model <- plm(
  follow_author_cnt ~ is_pk_live,
  data = pdata,
  model = "within",
  effect = "twoways"
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = follow_author_cnt ~ is_pk_live, data = pdata, effect = "twoways", 
##     model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median 3rd Qu.    Max. 
##  -14314       0       0       0   23955 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live   9.9152     1.2740  7.7829 7.116e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    5924400000
## Residual Sum of Squares: 5922900000
## R-Squared:      0.00025303
## Adj. R-Squared: -3.004
## F-statistic: 60.5736 on 1 and 239334 DF, p-value: 7.1159e-15

结论：显著+

总结：固定Author和live sequence，regression的结果为：直播间打赏：显著+；人均(viewer)观看时长：显著+；人均(viewer)直播间评论：显著+；人均(viewer)直播间点赞：在5%水平显著+(p = 0.042)，在1%水平不显著；人均(viewer)直播间转发：显著+；直播间内涨粉数：显著+

3.2 固定3个：author_id fixed, live_sequence (live_id_order) fixed, date fixed

(1) Y:直播间打赏-total_cost_amt

library(fixest)
# Regress total_cost_amt on is_pk_live, absorbing author_id, live_id_order
# and p_date fixed effects.
model <- feols(
  total_cost_amt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: total_cost_amt
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  591.642    38.9796 15.1782 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2,582.0     Adj. R2: 0.366598
##                 Within R2: 8.342e-4

结论：显著+

(2) Y:直播间人均观看时长-avg_valid_play_duration

library(fixest)
# Regress avg_valid_play_duration on is_pk_live, absorbing author_id,
# live_id_order and p_date fixed effects.
model <- feols(
  avg_valid_play_duration ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: avg_valid_play_duration
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  18104.1    681.786  26.554 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 66,426.9     Adj. R2: 0.199344
##                  Within R2: 0.00118

结论：显著+

(3) Y:直播间人均互动

人均评论:avg_comment_cnt

library(fixest)
# Regress avg_comment_cnt on is_pk_live, absorbing author_id, live_id_order
# and p_date fixed effects.
model <- feols(
  avg_comment_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: avg_comment_cnt
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live   1.7844   0.031901 55.9348 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.74469     Adj. R2: 0.309601
##                 Within R2: 0.016363

结论：显著+

人均点赞;avg_like_cnt

library(fixest)
# Regress avg_like_cnt on is_pk_live, absorbing author_id, live_id_order
# and p_date fixed effects.
model <- feols(
  avg_like_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: avg_like_cnt
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  16.6003   0.907549 18.2914 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 500.2     Adj. R2: -1.50329
##               Within R2: 1.752e-5

结论：显著+

人均转发：avg_share_success_cnt

library(fixest)
# Regress avg_share_success_cnt on is_pk_live, absorbing author_id,
# live_id_order and p_date fixed effects.
model <- feols(
  avg_share_success_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: avg_share_success_cnt
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  0.13735   0.005219 26.3187 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.342294     Adj. R2: 0.136187
##                  Within R2: 0.002554

结论：显著+

(4) Y:直播间涨粉：follow_author_cnt

library(fixest)
# Regress follow_author_cnt on is_pk_live, absorbing author_id, live_id_order
# and p_date fixed effects.
model <- feols(
  follow_author_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data
)
summary(model)

## OLS estimation, Dep. Var.: follow_author_cnt
## Observations: 958,541
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value   Pr(>|t|)    
## is_pk_live   9.8983    1.65489 5.98126 2.2153e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 78.6     Adj. R2: 0.303568
##              Within R2: 2.521e-4

结论：显著+

总结：固定Author,live sequence和date,regression的结果为：直播间打赏：显著+ 人均(viewer)观看时长：显著+ 人均(viewer)直播间评论：显著+ 人均(viewer)直播间点赞：显著+ 人均(viewer)直播间转发：显著+ 直播间内涨粉数：显著+

4.Entropy balancing

计算 Entropy weight

library(ebal)
library(tidyverse)

# Outcome variable.
Y <- data %>% pull(total_cost_amt)

# Treatment indicator (is_pk_live, coded 0/1).
D <- data %>% pull(is_pk_live)

# Matching covariates: author features.
# The *_number columns are arbitrary integer labels for nominal categories, so
# balancing their means is not meaningful. Expand each of them into dummy
# columns instead (model.matrix drops one reference level per factor, and the
# intercept column is removed, which keeps X free of a constant column as
# ebalance() requires). The numeric features enter unchanged.
# NOTE(review): categories that are very rare in one group can make the
# balancing constraints hard to satisfy; check the convergence message.
X <- model.matrix(
  ~ factor(gender_number) + factor(age_range_number) +
    factor(fre_country_region_number) + factor(fre_city_level_number) +
    factor(author_type_number) +
    fans_user_num + follow_user_num + reg_day_cnt,
  data = data
)[, -1, drop = FALSE]
# model.matrix() silently drops rows containing NA; X must stay aligned with D.
stopifnot(nrow(X) == length(D))

# Compute the entropy-balancing weights.
eb <- ebalance(D, X)

## Converged within tolerance

# Build the weighted data set.
# eb$w is attached to the control rows in the order they appear in `data`, so
# it must contain exactly one weight per control row; fail loudly otherwise.
stopifnot(length(eb$w) == sum(data$is_pk_live == 0))

# Treated rows keep a weight of 1. Rows are selected through the is_pk_live
# column itself rather than a separate global vector, so the selection can
# never drift out of step with `data`.
data_treat <- data %>%
  filter(is_pk_live == 1) %>%
  mutate(weights = 1)

# Control rows receive the weights computed by ebalance().
data_con <- data %>%
  filter(is_pk_live == 0) %>%
  mutate(weights = eb$w)

# Stack treated and control rows into one data set.
data_weighted <- bind_rows(data_treat, data_con)

head(data_weighted)

## # A tibble: 6 × 41
##    author_id   live_id p_date is_pk_live valid_play_duration valid_play_user_num
##        <dbl>     <dbl>  <dbl>      <dbl>               <dbl>               <dbl>
## 1 1797821702    9.47e9 2.02e7          1             1015002                  17
## 2  230462094    9.45e9 2.02e7          1              604570                  23
## 3  688725060    9.45e9 2.02e7          1            13182275                 274
## 4  586073164    9.47e9 2.02e7          1         18683824825               85252
## 5   57903715    9.45e9 2.02e7          1            44059975                 298
## 6  665432520    9.46e9 2.02e7          1             3727673                  89
## # ℹ 35 more variables: avg_valid_play_duration <dbl>, total_cost_amt <dbl>,
## #   total_cost_user_num <dbl>, avg_total_cost_amt <dbl>, comment_cnt <dbl>,
## #   comment_user_num <dbl>, avg_comment_cnt <dbl>, like_cnt <dbl>,
## #   like_user_num <dbl>, avg_like_cnt <dbl>, share_success_cnt <dbl>,
## #   share_success_user_num <dbl>, avg_share_success_cnt <dbl>,
## #   follow_author_cnt <dbl>, gender <chr>, age_range <chr>,
## #   fre_country_region <chr>, fre_city_level <chr>, fans_user_num <dbl>, …

4.1 固定两个:author_id fixed, live_sequence fixed

# Panel version of the weighted data, indexed by author_id and live_id_order.
pdata_weighted <- pdata.frame(
  data_weighted,
  index = c("author_id", "live_id_order")
)

(1) Y:直播间打赏-total_cost_amt

# Weighted two-way within regression of total_cost_amt on is_pk_live, using
# the weights column of pdata_weighted.
model <- plm(
  total_cost_amt ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = total_cost_amt ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -833647       0       0       0       0  833647 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live   563.41      48.73  11.562 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    6.3962e+12
## Residual Sum of Squares: 5.406e+12
## R-Squared:      0.00083193
## Adj. R-Squared: -3.0017
## F-statistic: 133.681 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

(2) Y:直播间人均观看时长-avg_valid_play_duration

# Weighted two-way within regression of avg_valid_play_duration on is_pk_live,
# using the weights column of pdata_weighted.
model <- plm(
  avg_valid_play_duration ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_valid_play_duration ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5267936        0        0        4        0  5267709 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  17554.8      626.1  28.038 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4.235e+15
## Residual Sum of Squares: 8.9243e+14
## R-Squared:      0.0011835
## Adj. R-Squared: -3.0003
## F-statistic: 786.154 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

(3) Y:直播间人均互动

人均评论:avg_comment_cnt

# Weighted two-way within regression of avg_comment_cnt on is_pk_live, using
# the weights column of pdata_weighted.
model <- plm(
  avg_comment_cnt ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_comment_cnt ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -60.2     0.0     0.0     0.0     0.0   108.9 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  1.75777    0.02056  85.493 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    2966800
## Residual Sum of Squares: 962390
## R-Squared:      0.016404
## Adj. R-Squared: -2.9393
## F-statistic: 7309.11 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

人均点赞;avg_like_cnt

# Weighted two-way within regression of avg_like_cnt on is_pk_live, using the
# weights column of pdata_weighted.
model <- plm(
  avg_like_cnt ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_like_cnt ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -71461       0       0       0       0  142525 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  16.3850     3.7239     4.4 1.083e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    2.3981e+11
## Residual Sum of Squares: 3.1571e+10
## R-Squared:      1.7296e-05
## Adj. R-Squared: -3.005
## F-statistic: 19.3599 on 1 and 239334 DF, p-value: 1.083e-05

结论：显著+

人均转发：avg_share_success_cnt

# Weighted two-way within regression of avg_share_success_cnt on is_pk_live,
# using the weights column of pdata_weighted.
model <- plm(
  avg_share_success_cnt ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = avg_share_success_cnt ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -20.8     0.0     0.0     0.0     0.0    68.8 
## 
## Coefficients:
##             Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live 0.1325892  0.0039765  33.343 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    112610
## Residual Sum of Squares: 35999
## R-Squared:      0.0025633
## Adj. R-Squared: -2.9948
## F-statistic: 1111.77 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

(4) Y:直播间涨粉：follow_author_cnt

# Weighted two-way within regression of follow_author_cnt on is_pk_live, using
# the weights column of pdata_weighted.
model <- plm(
  follow_author_cnt ~ is_pk_live,
  data = pdata_weighted,
  model = "within",
  effect = "twoways",
  weights = pdata_weighted$weights
)
summary(model)

## Twoways effects Within Model
## 
## Call:
## plm(formula = follow_author_cnt ~ is_pk_live, data = pdata_weighted, 
##     weights = pdata_weighted$weights, effect = "twoways", model = "within")
## 
## Unbalanced Panel: n = 719070, T = 1-137, N = 958541
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -7381       0       0       0       0   17079 
## 
## Coefficients:
##            Estimate Std. Error t-value  Pr(>|t|)    
## is_pk_live  9.91970    0.97149  10.211 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    5924400000
## Residual Sum of Squares: 2148600000
## R-Squared:      0.00025303
## Adj. R-Squared: -3.004
## F-statistic: 104.261 on 1 and 239334 DF, p-value: < 2.22e-16

结论：显著+

结论：加上Entropy balance weight以后,固定Author和live sequence,regression的结果为：直播间打赏：显著+ 人均(viewer)观看时长：显著+ 人均(viewer)直播间评论：显著+ 人均(viewer)直播间点赞：显著+ 人均(viewer)直播间转发：显著+ 直播间内涨粉数：显著+

4.2 固定三个：author_id fixed, live_sequence (live_id_order) fixed, date fixed

(1) Y:直播间打赏-total_cost_amt

library(fixest)
# Weighted regression of total_cost_amt on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  total_cost_amt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: total_cost_amt
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  643.242    49.3473  13.035 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 2,348.3     Adj. R2: 0.442648
##                 Within R2: 4.121e-4

(2) Y:直播间人均观看时长-avg_valid_play_duration

library(fixest)
# Weighted regression of avg_valid_play_duration on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  avg_valid_play_duration ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: avg_valid_play_duration
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  17896.3    748.221 23.9185 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 29,620.3     Adj. R2: 0.308593
##                  Within R2: 0.002002

(3) Y:直播间人均互动

人均评论:avg_comment_cnt

library(fixest)
# Weighted regression of avg_comment_cnt on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  avg_comment_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: avg_comment_cnt
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  1.83197   0.035143 52.1295 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.944646     Adj. R2: 0.475642
##                  Within R2: 0.020248

人均点赞;avg_like_cnt

library(fixest)
# Weighted regression of avg_like_cnt on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  avg_like_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: avg_like_cnt
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  16.3475   0.946534 17.2709 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 181.2     Adj. R2: -1.35508
##               Within R2: 4.472e-5

人均转发：avg_share_success_cnt

library(fixest)
# Weighted regression of avg_share_success_cnt on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  avg_share_success_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: avg_share_success_cnt
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value  Pr(>|t|)    
## is_pk_live  0.13639   0.005391 25.3007 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.185865     Adj. R2: 0.320162
##                  Within R2: 0.00295

(4) Y:直播间涨粉：follow_author_cnt

# Weighted regression of follow_author_cnt on is_pk_live with author_id,
# live_id_order and p_date fixed effects; weights come from the weights column.
model <- feols(
  follow_author_cnt ~ is_pk_live | author_id + live_id_order + p_date,
  data = data_weighted,
  weights = ~ weights
)
summary(model)

## OLS estimation, Dep. Var.: follow_author_cnt
## Observations: 958,541
## Weights: weights
## Fixed-effects: author_id: 719,070,  live_id_order: 137,  p_date: 9
## Standard-errors: Clustered (author_id) 
##            Estimate Std. Error t value   Pr(>|t|)    
## is_pk_live  10.0129    1.98109 5.05424 4.3222e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 43.8     Adj. R2: 0.668629
##              Within R2: 2.868e-4

结论：加上Entropy balance weight以后,固定Author,live sequence和date,regression的结果为：直播间打赏：显著+ 人均(viewer)观看时长：显著+ 人均(viewer)直播间评论：显著+ 人均(viewer)直播间点赞：显著+ 人均(viewer)直播间转发：显著+ 直播间内涨粉数：显著+

PK_analysis

Qinlu

2024-08-17

1.read data

2.处理变量

2.1 对于每个author按照p_date的顺序对于live_id进行排序

2.2 处理categorical variable

3.Panel data regression - TWFE

3.1 固定两个: author_id fixed, live_sequence fixed

(1) Y:直播间打赏-total_cost_amt

(2) Y:直播间人均观看时长-avg_valid_play_duration

(3) Y:直播间人均互动

(4) Y:直播间涨粉：follow_author_cnt

3.2 固定3个：author_id fixed, live_sequence (live_id_order) fixed, date fixed

(1) Y:直播间打赏-total_cost_amt

(3) Y:直播间人均互动

(4) Y:直播间涨粉：follow_author_cnt

4.Entropy balancing

计算 Entropy weight

4.1 固定两个:author_id fixed, live_sequence fixed

(1) Y:直播间打赏-total_cost_amt

(2) Y:直播间人均观看时长-avg_valid_play_duration

(3) Y:直播间人均互动

(4) Y:直播间涨粉：follow_author_cnt

4.2 固定三个：author_id fixed, live_sequence (live_id_order) fixed, date fixed

(1) Y:直播间打赏-total_cost_amt

(2) Y:直播间人均观看时长-avg_valid_play_duration

(3) Y:直播间人均互动

(4) Y:直播间涨粉：follow_author_cnt