# Set the working directory so the relative CSV path used by read.csv() resolves.
# NOTE(review): hard-coded, machine-specific absolute path; the script fails on
# any machine without this directory. Consider a project-relative path instead.
setwd("C:/Users/ASUS/Desktop/快手")

数据:2022.01.01–2024.09.01 期间全部的 random PK 配对(pair)记录

# Load the cleaned random-PK pair table and print per-column summary statistics.
data <- read.csv(file = "Within_Random_Pk_final_clean.csv")
summary(data)
##      pk_id               p_date         other_author_id       author_id        
##  Min.   :2.470e+09   Min.   :20220101   Min.   :1.313e+03   Min.   :1.313e+03  
##  1st Qu.:2.928e+09   1st Qu.:20220716   1st Qu.:6.435e+08   1st Qu.:6.476e+08  
##  Median :5.028e+09   Median :20230617   Median :1.123e+09   Median :1.125e+09  
##  Mean   :4.456e+09   Mean   :20229670   Mean   :1.327e+09   Mean   :1.329e+09  
##  3rd Qu.:5.450e+09   3rd Qu.:20240127   3rd Qu.:1.893e+09   3rd Qu.:1.897e+09  
##  Max.   :5.886e+09   Max.   :20240901   Max.   :4.345e+09   Max.   :4.343e+09  
##                                                                                
##     live_id          valid_play_duration valid_play_user_num
##  Min.   :8.958e+09   Min.   :0.000e+00   Min.   :     0.0   
##  1st Qu.:9.760e+09   1st Qu.:1.225e+05   1st Qu.:     9.0   
##  Median :1.113e+10   Median :1.120e+06   Median :    40.0   
##  Mean   :1.092e+10   Mean   :5.116e+07   Mean   :   601.1   
##  3rd Qu.:1.195e+10   3rd Qu.:6.723e+06   3rd Qu.:   160.0   
##  Max.   :1.276e+10   Max.   :9.158e+10   Max.   :555052.0   
##                                                             
##  avg_valid_play_duration total_cost_amt     total_cost_user_num
##  Min.   :      0         Min.   :     0.0   Min.   :   0.000   
##  1st Qu.:  10022         1st Qu.:     0.0   1st Qu.:   0.000   
##  Median :  23524         Median :     0.0   Median :   0.000   
##  Mean   :  41674         Mean   :   156.5   Mean   :   1.581   
##  3rd Qu.:  49260         3rd Qu.:     5.0   3rd Qu.:   1.000   
##  Max.   :8290602         Max.   :330122.0   Max.   :1218.000   
##                                                                
##  avg_total_cost_amt  comment_cnt        comment_user_num    avg_comment_cnt  
##  Min.   :    0.00   Min.   :     0.00   Min.   :    0.000   Min.   :  0.000  
##  1st Qu.:    0.00   1st Qu.:     0.00   1st Qu.:    0.000   1st Qu.:  0.000  
##  Median :    0.00   Median :     2.00   Median :    1.000   Median :  1.333  
##  Mean   :   35.54   Mean   :    37.16   Mean   :    9.214   Mean   :  2.504  
##  3rd Qu.:    3.00   3rd Qu.:    19.00   3rd Qu.:    5.000   3rd Qu.:  3.400  
##  Max.   :35599.64   Max.   :223684.00   Max.   :13554.000   Max.   :190.998  
##                                                                              
##     like_cnt         like_user_num       avg_like_cnt      share_success_cnt  
##  Min.   :      0.0   Min.   :   0.000   Min.   :     0.0   Min.   :    0.000  
##  1st Qu.:      0.0   1st Qu.:   0.000   1st Qu.:     0.0   1st Qu.:    0.000  
##  Median :     11.0   Median :   2.000   Median :     4.0   Median :    0.000  
##  Mean   :    730.7   Mean   :   9.546   Mean   :    43.6   Mean   :    3.276  
##  3rd Qu.:    157.0   3rd Qu.:   6.000   3rd Qu.:    26.8   3rd Qu.:    0.000  
##  Max.   :1026119.0   Max.   :8047.000   Max.   :107281.7   Max.   :25255.000  
##                                                                               
##  share_success_user_num avg_share_success_cnt follow_author_cnt 
##  Min.   :    0.000      Min.   :  0.0000      Min.   :   0.000  
##  1st Qu.:    0.000      1st Qu.:  0.0000      1st Qu.:   0.000  
##  Median :    0.000      Median :  0.0000      Median :   0.000  
##  Mean   :    1.924      Mean   :  0.2342      Mean   :   5.142  
##  3rd Qu.:    0.000      3rd Qu.:  0.0000      3rd Qu.:   1.000  
##  Max.   :11274.000      Max.   :208.9979      Max.   :8101.000  
##                                                                 
##  cancel_follow_author_cnt follow_user_cnt   cancel_follow_user_cnt
##  Min.   :   0.000         Min.   :   0.00   Min.   : 0.00000      
##  1st Qu.:   0.000         1st Qu.:   0.00   1st Qu.: 0.00000      
##  Median :   0.000         Median :   0.00   Median : 0.00000      
##  Mean   :   1.462         Mean   :   0.78   Mean   : 0.05411      
##  3rd Qu.:   0.000         3rd Qu.:   0.00   3rd Qu.: 0.00000      
##  Max.   :4251.000         Max.   :6141.00   Max.   :66.00000      
##                                                                   
##  join_fans_group_cnt live_new_user_num   report_live_cnt    report_user_cnt    
##  Min.   :  0.000     Min.   :  0.00000   Min.   :  0.0000   Min.   :  0.00000  
##  1st Qu.:  0.000     1st Qu.:  0.00000   1st Qu.:  0.0000   1st Qu.:  0.00000  
##  Median :  0.000     Median :  0.00000   Median :  0.0000   Median :  0.00000  
##  Mean   :  0.323     Mean   :  0.05062   Mean   :  0.2084   Mean   :  0.01576  
##  3rd Qu.:  0.000     3rd Qu.:  0.00000   3rd Qu.:  0.0000   3rd Qu.:  0.00000  
##  Max.   :919.000     Max.   :114.00000   Max.   :917.0000   Max.   :109.00000  
##                                                                                
##  is_follow_each_other  pk_duration     pk_play_user_num    pk_play_duration   
##  Min.   :0.000        Min.   :    10   Min.   :     0.00   Min.   :0.000e+00  
##  1st Qu.:0.000        1st Qu.: 12446   1st Qu.:     1.00   1st Qu.:3.746e+03  
##  Median :0.000        Median : 48760   Median :     5.00   Median :4.042e+04  
##  Mean   :0.004        Mean   :126488   Mean   :    38.97   Mean   :6.088e+05  
##  3rd Qu.:0.000        3rd Qu.:254913   3rd Qu.:    23.00   3rd Qu.:4.092e+05  
##  Max.   :1.000        Max.   :446413   Max.   :115900.00   Max.   :1.419e+09  
##  NA's   :6983                                                                 
##  avg_pk_play_duration pk_follow_author_cnt pk_cancel_follow_author_cnt
##  Min.   :     0       Min.   :   0.0000    Min.   :  0.00000          
##  1st Qu.:  2133       1st Qu.:   0.0000    1st Qu.:  0.00000          
##  Median :  7110       Median :   0.0000    Median :  0.00000          
##  Mean   : 13047       Mean   :   0.1816    Mean   :  0.06546          
##  3rd Qu.: 16535       3rd Qu.:   0.0000    3rd Qu.:  0.00000          
##  Max.   :718371       Max.   :2003.0000    Max.   :136.00000          
##                                                                       
##  pk_follow_user_cnt pk_cancel_follow_user_cnt  pk_like_cnt      
##  Min.   : 0.00000   Min.   :0.000000          Min.   :   0.000  
##  1st Qu.: 0.00000   1st Qu.:0.000000          1st Qu.:   0.000  
##  Median : 0.00000   Median :0.000000          Median :   0.000  
##  Mean   : 0.04517   Mean   :0.002961          Mean   :   8.021  
##  3rd Qu.: 0.00000   3rd Qu.:0.000000          3rd Qu.:   1.000  
##  Max.   :90.00000   Max.   :5.000000          Max.   :5960.000  
##                                                                 
##  pk_like_user_num  avg_pk_like_cnt   pk_comment_cnt     pk_comment_user_num
##  Min.   :  0.000   Min.   :  0.000   Min.   :   0.000   Min.   :  0.0000   
##  1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:   0.000   1st Qu.:  0.0000   
##  Median :  0.000   Median :  0.000   Median :   0.000   Median :  0.0000   
##  Mean   :  0.621   Mean   :  3.686   Mean   :   1.107   Mean   :  0.4891   
##  3rd Qu.:  1.000   3rd Qu.:  1.000   3rd Qu.:   0.000   3rd Qu.:  0.0000   
##  Max.   :291.000   Max.   :453.665   Max.   :1304.000   Max.   :243.0000   
##                                                                            
##  avg_pk_comment_cnt pk_report_cnt      pk_report_user_cnt avg_pk_report_cnt
##  Min.   : 0.0000    Min.   : 0.00000   Min.   :0.00e+00   Min.   :      0  
##  1st Qu.: 0.0000    1st Qu.: 0.00000   1st Qu.:0.00e+00   1st Qu.:      0  
##  Median : 0.0000    Median : 0.00000   Median :0.00e+00   Median :      0  
##  Mean   : 0.5544    Mean   : 0.01089   Mean   :5.93e-04   Mean   :   1066  
##  3rd Qu.: 0.0000    3rd Qu.: 0.00000   3rd Qu.:0.00e+00   3rd Qu.:      0  
##  Max.   :45.9995    Max.   :71.00000   Max.   :1.40e+01   Max.   :7100000  
##                                                                            
##  pk_total_cost_amt   pk_total_cost_cnt avg_pk_total_cost_amt
##  Min.   :     0.00   Min.   :  0.000   Min.   :    0.000    
##  1st Qu.:     0.00   1st Qu.:  0.000   1st Qu.:    0.000    
##  Median :     0.00   Median :  0.000   Median :    0.000    
##  Mean   :    29.29   Mean   :  1.566   Mean   :    9.553    
##  3rd Qu.:     0.00   3rd Qu.:  0.000   3rd Qu.:    0.000    
##  Max.   :130013.00   Max.   :947.000   Max.   :28887.711    
##                                                             
##  follow_author_fans_count unfollow_author_fans_count
##  Min.   :   0.00          Min.   :   0.0000         
##  1st Qu.:   0.00          1st Qu.:   0.0000         
##  Median :   0.00          Median :   0.0000         
##  Mean   :   1.66          Mean   :   0.7025         
##  3rd Qu.:   0.00          3rd Qu.:   0.0000         
##  Max.   :5209.00          Max.   :2109.0000         
##                                                     
##  already_follow_other_fans_count    gender          fans_user_num     
##  Min.   :  0.00000               Length:187126      Min.   :       0  
##  1st Qu.:  0.00000               Class :character   1st Qu.:     414  
##  Median :  0.00000               Mode  :character   Median :    1198  
##  Mean   :  0.01516                                  Mean   :   17161  
##  3rd Qu.:  0.00000                                  3rd Qu.:    4708  
##  Max.   :125.00000                                  Max.   :11401415  
##                                                                       
##   fans_range        fans_group_fans_num author_type        live_operation_tag
##  Length:187126      Min.   :     0.0    Length:187126      Length:187126     
##  Class :character   1st Qu.:     1.0    Class :character   Class :character  
##  Mode  :character   Median :     5.0    Mode  :character   Mode  :character  
##                     Mean   :   254.6                                         
##                     3rd Qu.:    65.0                                         
##                     Max.   :332787.0                                         
##                                                                              
##  live_stream_category author_income_range  age_range         fre_country_region
##  Length:187126        Length:187126       Length:187126      Length:187126     
##  Class :character     Class :character    Class :character   Class :character  
##  Mode  :character     Mode  :character    Mode  :character   Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##  fre_city_level      reg_day_cnt   follow_user_num before_fans_count 
##  Length:187126      Min.   :   0   Min.   :   0    Min.   :       0  
##  Class :character   1st Qu.:1036   1st Qu.: 139    1st Qu.:     342  
##  Mode  :character   Median :1624   Median : 592    Median :     986  
##                     Mean   :1565   Mean   :1269    Mean   :   14119  
##                     3rd Qu.:2109   3rd Qu.:1835    3rd Qu.:    3881  
##                     Max.   :4387   Max.   :5023    Max.   :10848875  
##                                                                      
##  common_user_count    live_id.y         other_gender       other_fans_user_num
##  Min.   :     0.0   Min.   :8.959e+09   Length:187126      Min.   :       0   
##  1st Qu.:     0.0   1st Qu.:9.760e+09   Class :character   1st Qu.:     413   
##  Median :     0.0   Median :1.113e+10   Mode  :character   Median :    1190   
##  Mean   :   103.8   Mean   :1.092e+10                      Mean   :   17269   
##  3rd Qu.:     0.0   3rd Qu.:1.195e+10                      3rd Qu.:    4675   
##  Max.   :986301.0   Max.   :1.276e+10                      Max.   :37195072   
##                                                                               
##  other_fans_range   other_fans_group_fans_num other_author_type 
##  Length:187126      Min.   :      0           Length:187126     
##  Class :character   1st Qu.:      1           Class :character  
##  Mode  :character   Median :      5           Mode  :character  
##                     Mean   :    272                             
##                     3rd Qu.:     65                             
##                     Max.   :1065934                             
##                                                                 
##  other_live_operation_tag other_live_stream_category other_author_income_range
##  Length:187126            Length:187126              Length:187126            
##  Class :character         Class :character           Class :character         
##  Mode  :character         Mode  :character           Mode  :character         
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  other_age_range    other_fre_country_region other_fre_city_level
##  Length:187126      Length:187126            Length:187126       
##  Class :character   Class :character         Class :character    
##  Mode  :character   Mode  :character         Mode  :character    
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##  other_reg_day_cnt other_follow_user_num other_before_fans_count
##  Min.   :   0      Min.   :   0          Min.   :       0       
##  1st Qu.:1039      1st Qu.: 138          1st Qu.:     339       
##  Median :1625      Median : 592          Median :     982       
##  Mean   :1567      Mean   :1264          Mean   :   14276       
##  3rd Qu.:2110      3rd Qu.:1833          3rd Qu.:    3855       
##  Max.   :4577      Max.   :5022          Max.   :36176448       
##                                                                 
##  other_common_user_count live_id_order    new_author_index
##  Min.   :     0.0        Min.   :  1.00   Min.   :1       
##  1st Qu.:     0.0        1st Qu.:  1.00   1st Qu.:1       
##  Median :     0.0        Median :  1.00   Median :1       
##  Mean   :   103.8        Mean   :  2.18   Mean   :1       
##  3rd Qu.:     0.0        3rd Qu.:  1.00   3rd Qu.:1       
##  Max.   :986301.0        Max.   :218.00   Max.   :1       
## 
# Print the structure of the data frame: dimensions plus each column's type
# and first few values.
str(data)
## 'data.frame':    187126 obs. of  85 variables:
##  $ pk_id                          : num  2.47e+09 2.47e+09 2.47e+09 2.47e+09 2.47e+09 ...
##  $ p_date                         : int  20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 ...
##  $ other_author_id                : num  2.06e+09 8.90e+08 2.67e+09 2.10e+09 1.33e+09 ...
##  $ author_id                      : num  9.15e+08 1.27e+09 2.66e+09 2.05e+07 1.45e+09 ...
##  $ live_id                        : num  8.96e+09 8.96e+09 8.96e+09 8.96e+09 8.96e+09 ...
##  $ valid_play_duration            : num  4525836 49630 306248 4608075 486888 ...
##  $ valid_play_user_num            : int  155 1 9 30 13 9 16 53 38 14 ...
##  $ avg_valid_play_duration        : num  29199 49630 34028 153602 37453 ...
##  $ total_cost_amt                 : int  10 0 0 0 0 0 208 208 0 0 ...
##  $ total_cost_user_num            : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ avg_total_cost_amt             : num  10 0 0 0 0 ...
##  $ comment_cnt                    : int  32 0 0 0 0 4 16 21 4 4 ...
##  $ comment_user_num               : int  5 0 0 0 0 1 4 5 1 3 ...
##  $ avg_comment_cnt                : num  6.4 0 0 0 0 ...
##  $ like_cnt                       : int  306 0 0 0 0 0 12 20 0 2 ...
##  $ like_user_num                  : int  9 0 0 0 0 0 3 1 0 1 ...
##  $ avg_like_cnt                   : num  34 0 0 0 0 ...
##  $ share_success_cnt              : int  3 0 0 1 0 0 0 0 0 0 ...
##  $ share_success_user_num         : int  3 0 0 1 0 0 0 0 0 0 ...
##  $ avg_share_success_cnt          : num  1 0 0 1 0 ...
##  $ follow_author_cnt              : int  4 0 0 0 1 0 0 0 0 0 ...
##  $ cancel_follow_author_cnt       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ follow_user_cnt                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cancel_follow_user_cnt         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ join_fans_group_cnt            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ live_new_user_num              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ report_live_cnt                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ report_user_cnt                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_follow_each_other           : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ pk_duration                    : int  427197 390710 373737 334503 371514 310907 323632 333151 326283 370738 ...
##  $ pk_play_user_num               : int  3 2 2 11 9 4 1 6 13 5 ...
##  $ pk_play_duration               : int  67778 22072 30992 5038 344217 98164 39802 214399 290473 612846 ...
##  $ avg_pk_play_duration           : num  22593 11036 15496 458 38246 ...
##  $ pk_follow_author_cnt           : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pk_cancel_follow_author_cnt    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pk_follow_user_cnt             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pk_cancel_follow_user_cnt      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pk_like_cnt                    : int  0 0 0 0 0 0 0 2 0 0 ...
##  $ pk_like_user_num               : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ avg_pk_like_cnt                : num  0 0 0 0 0 ...
##  $ pk_comment_cnt                 : int  0 0 0 0 0 1 0 0 1 1 ...
##  $ pk_comment_user_num            : int  0 0 0 0 0 1 0 0 1 1 ...
##  $ avg_pk_comment_cnt             : num  0 0 0 0 0 ...
##  $ pk_report_cnt                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pk_report_user_cnt             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ avg_pk_report_cnt              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pk_total_cost_amt              : int  0 0 0 0 0 0 0 208 0 0 ...
##  $ pk_total_cost_cnt              : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ avg_pk_total_cost_amt          : num  0 0 0 0 0 ...
##  $ follow_author_fans_count       : int  3 0 0 0 0 0 0 0 0 0 ...
##  $ unfollow_author_fans_count     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ already_follow_other_fans_count: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender                         : chr  "F" "M" "F" "F" ...
##  $ fans_user_num                  : int  899 872 677 8199 1000 15916 183 3118 138061 1615 ...
##  $ fans_range                     : chr  "100-1k" "100-1k" "100-1k" "1k-1w" ...
##  $ fans_group_fans_num            : int  32 1 38 4588 2 877 14 15 1044 5 ...
##  $ author_type                    : chr  "秀场主播" "秀场主播" "秀场主播" "秀场主播" ...
##  $ live_operation_tag             : chr  "闲聊互动" "UNKNOWN" "颜值" "闲聊互动" ...
##  $ live_stream_category           : chr  "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2109=帅哥闲聊互动,3431=其他帅哥闲聊互动" "" "" ...
##  $ author_income_range            : chr  "(50,500]" "(10,50]" "(50,500]" "(500,1000]" ...
##  $ age_range                      : chr  "50+" "18-23" "18-23" "24-30" ...
##  $ fre_country_region             : chr  "南方" "南方" "南方" "北方" ...
##  $ fre_city_level                 : chr  "一线城市" "新一线城市" "二线城市" "三线城市" ...
##  $ reg_day_cnt                    : int  1363 1037 37 2633 881 662 1400 2162 2711 1196 ...
##  $ follow_user_num                : int  112 428 53 49 679 3518 44 1 878 3026 ...
##  $ before_fans_count              : int  901 873 684 8296 999 15900 189 3126 138040 1630 ...
##  $ common_user_count              : int  0 0 1 7 3 0 0 1 231 0 ...
##  $ live_id.y                      : num  8.96e+09 8.96e+09 8.96e+09 8.96e+09 8.96e+09 ...
##  $ other_gender                   : chr  "F" "M" "F" "F" ...
##  $ other_fans_user_num            : int  3476 1275 1040 20216 479 845 760 12912 17324 804 ...
##  $ other_fans_range               : chr  "1k-1w" "1k-1w" "1k-1w" "1w-10w" ...
##  $ other_fans_group_fans_num      : int  38 4 48 111 0 66 28 247 180 2 ...
##  $ other_author_type              : chr  "秀场主播" "秀场主播" "秀场主播" "秀场主播" ...
##  $ other_live_operation_tag       : chr  "才艺技能展示" "闲聊互动" "才艺技能展示" "颜值" ...
##  $ other_live_stream_category     : chr  "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2108=美女闲聊互动,3428=其他美女闲聊互动" "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2108=美女闲聊互动,3426=颜值美女闲聊互动" ...
##  $ other_author_income_range      : chr  "(50,500]" "(10,50]" "(500,1000]" "(500,1000]" ...
##  $ other_age_range                : chr  "24-30" "18-23" "31-40" "50+" ...
##  $ other_fre_country_region       : chr  "南方" "南方" "北方" "北方" ...
##  $ other_fre_city_level           : chr  "二线城市" "二线城市" "三线城市" "新一线城市" ...
##  $ other_reg_day_cnt              : int  504 1387 31 456 987 1857 2172 1992 2080 1826 ...
##  $ other_follow_user_num          : int  8 306 534 138 253 126 76 88 66 475 ...
##  $ other_before_fans_count        : int  3473 1273 1040 20220 497 849 772 13003 17335 807 ...
##  $ other_common_user_count        : int  0 0 1 7 3 0 0 1 231 0 ...
##  $ live_id_order                  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ new_author_index               : int  1 1 1 1 1 1 1 1 1 1 ...

自变量

### Competition
# (1) Opponent's fan count
# table(data$other_before_fans_count)

# (2) Opponent's fan count relative to the author's own fan count.
#     The +1 in the denominator avoids division by zero (the minimum of
#     before_fans_count is 0).
data[["relative_fans"]] <-
  data[["other_before_fans_count"]] / (data[["before_fans_count"]] + 1)

# (3) Fan overlap: common_user_count scaled by the author's own fan count,
#     and by the combined fan count of both sides.
data[["fans_overlap_from_author"]] <-
  data[["common_user_count"]] / (data[["before_fans_count"]] + 1)
data[["fans_overlap_from_pair"]] <-
  data[["common_user_count"]] /
    (data[["before_fans_count"]] + data[["other_before_fans_count"]] + 1)

# (4) Both sides have large fan bases: strong competition.
# fans_pair_range is 1 when both authors have more than 100,000 fans, else 0.
data <- data %>%
  mutate(
    fans_pair_range = case_when(
      (before_fans_count > 100000) & (other_before_fans_count > 100000) ~ 1,
      TRUE ~ 0
    )
  )

# Four-way size category of the pair (own side first, opponent second), with
# "Big" meaning strictly more than 10,000 fans:
#   3 = Big vs Big, 2 = Big vs Small, 1 = Small vs Big, 0 = Small vs Small.
# "Small" is `<= 10000` so the four conditions are exhaustive. With strict `<`
# on both sides a count of exactly 10,000 matched no condition and produced NA,
# which silently dropped that row from tables and regressions.
data <- data %>%
  mutate(
    fans_piar_cat = case_when(
      (before_fans_count > 10000) & (other_before_fans_count > 10000) ~ 3,
      (before_fans_count > 10000) & (other_before_fans_count <= 10000) ~ 2,
      (before_fans_count <= 10000) & (other_before_fans_count > 10000) ~ 1,
      (before_fans_count <= 10000) & (other_before_fans_count <= 10000) ~ 0
    )
  )

# Convert the numeric code to a labelled factor; level 0 is the reference level.
# `levels` is spelled out in full rather than relying on partial matching of
# `level`.
# NOTE(review): the first label keeps its original trailing space
# ("Small vs Small ") in case other code matches on the exact string.
data <- data %>%
  mutate(
    fans_piar_cat = factor(
      fans_piar_cat,
      levels = c(0, 1, 2, 3),
      labels = c("Small vs Small ", "Small vs Big", "Big vs Small", "Big vs Big")
    )
  )

### Cooperation
# (1) Split by live-stream type: 1 when both sides carry the same
#     live_operation_tag, 0 otherwise (including when the comparison is NA).
# NOTE(review): "UNKNOWN" appears as a tag value, so two "UNKNOWN" authors are
# counted as the same type here. Confirm that this is intended.
data <- mutate(
  data,
  is_cooperative = case_when(
    live_operation_tag == other_live_operation_tag ~ 1,
    TRUE ~ 0
  )
)

# Sanity-check the constructed variables: pair-category counts, cooperative
# flag counts, and the distribution of before_fans_count.
# NOTE(review): the fans_piar_cat counts recorded below sum to 187,125 while
# `data` has 187,126 rows. table() omits NA by default, so one row has no
# category assigned.
table(data$fans_piar_cat)
## 
## Small vs Small     Small vs Big    Big vs Small      Big vs Big 
##          149898           10906           10701           15620
table(data$is_cooperative)
## 
##      0      1 
## 121925  65201
summary(data$ before_fans_count)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0      342      986    14119     3881 10848875

因变量

# table(data$total_cost_amt) # 119056 rows are 0
# Per-fan amount: total_cost_amt divided by the author's fan count, with +1 in
# the denominator so authors with zero fans do not divide by zero.
data[["avg_fan_total_cost_amt"]] <-
  data[["total_cost_amt"]] / (data[["before_fans_count"]] + 1)

2.1 DV: 打赏:log(avg_fan_total_cost_amt+1),其中 avg_fan_total_cost_amt = total_cost_amt/(before_fans_count+1)

(1) 自变量:fans_piar_cat(3:大大,2:大小,1:小大,0:小小;前者为本人,后者为对方),以及它与 is_cooperative 的交互项

# Regress log(1 + per-fan amount) on the pair-size category, the cooperative
# flag and their interaction, with p_date fixed effects and standard errors
# clustered by author_id. `a * b` expands to a + b + a:b, the same terms as
# the explicit sum.
model <- feols(
  log(avg_fan_total_cost_amt + 1) ~ factor(fans_piar_cat) * is_cooperative | p_date,
  data = data,
  vcov = ~author_id
)
## NOTE: 1 observation removed because of NA values (RHS: 1).
summary(model)
## OLS estimation, Dep. Var.: log(avg_fan_total_cost_amt + 1)
## Observations: 187,125
## Fixed-effects: p_date: 975
## Standard-errors: Clustered (author_id) 
##                                                   Estimate Std. Error   t value
## factor(fans_piar_cat)Small vs Big                 0.028320   0.002953   9.58926
## factor(fans_piar_cat)Big vs Small                -0.024452   0.001229 -19.89522
## factor(fans_piar_cat)Big vs Big                  -0.025267   0.001082 -23.34969
## is_cooperative                                    0.009500   0.001349   7.04310
## factor(fans_piar_cat)Small vs Big:is_cooperative  0.005994   0.005015   1.19523
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.011357   0.001836  -6.18599
## factor(fans_piar_cat)Big vs Big:is_cooperative   -0.007411   0.001793  -4.13344
##                                                    Pr(>|t|)    
## factor(fans_piar_cat)Small vs Big                 < 2.2e-16 ***
## factor(fans_piar_cat)Big vs Small                 < 2.2e-16 ***
## factor(fans_piar_cat)Big vs Big                   < 2.2e-16 ***
## is_cooperative                                   1.8878e-12 ***
## factor(fans_piar_cat)Small vs Big:is_cooperative 2.3200e-01    
## factor(fans_piar_cat)Big vs Small:is_cooperative 6.1867e-10 ***
## factor(fans_piar_cat)Big vs Big:is_cooperative   3.5755e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.219571     Adj. R2: 0.005954
##                  Within R2: 0.003705

3.plot

# Keep pairs where both sides have strictly between 100 and 100,000 fans
# (before_fans_count / other_before_fans_count), filtering one side at a time.
df <- data[with(data, before_fans_count > 100 & before_fans_count < 100000), ]
df <- df[with(df, other_before_fans_count > 100 & other_before_fans_count < 100000), ]

# Short aliases for the two fan counts (A = own side, B = opponent) and their
# log(1 + x) transforms.
df <- mutate(
  df,
  A_fan_count = before_fans_count,
  B_fan_count = other_before_fans_count,
  log_A_fan_count = log(before_fans_count + 1),
  log_B_fan_count = log(other_before_fans_count + 1)
)

Model-free

# Step 1: Aggregate the average money for A on a regular grid of
# log(A_fan_count) x log(B_fan_count).
# Grouping by the raw integer fan counts yields almost one group per row, and a
# plotly surface trace needs `z` as a matrix over an x/y grid, not long-format
# columns. So the log fan counts are binned first.
surface_bins <- 30
log_a <- log(df$A_fan_count)
log_b <- log(df$B_fan_count)
a_breaks <- seq(min(log_a), max(log_a), length.out = surface_bins + 1)
b_breaks <- seq(min(log_b), max(log_b), length.out = surface_bins + 1)

agg_df <- df %>%
  mutate(
    a_bin = cut(log(A_fan_count), breaks = a_breaks,
                include.lowest = TRUE, labels = FALSE),
    b_bin = cut(log(B_fan_count), breaks = b_breaks,
                include.lowest = TRUE, labels = FALSE)
  ) %>%
  # Defensive: a row outside the breaks would give an NA matrix index below.
  filter(!is.na(a_bin), !is.na(b_bin)) %>%
  group_by(a_bin, b_bin) %>%
  summarize(
    avg_money_A = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    .groups = "drop"
  )

# Rows of z index y (B) and columns index x (A); empty cells stay NA.
surface_z <- matrix(NA_real_, nrow = surface_bins, ncol = surface_bins)
surface_z[cbind(agg_df$b_bin, agg_df$a_bin)] <- agg_df$avg_money_A

# Step 2: Create the 3D surface plot, with bin midpoints as axis coordinates
fig <- plot_ly(
  x = (head(a_breaks, -1) + tail(a_breaks, -1)) / 2,
  y = (head(b_breaks, -1) + tail(b_breaks, -1)) / 2,
  z = surface_z,
  type = "surface",
  colorscale = "Viridis"
)

# Step 3: Customize the layout for better visualization
fig <- fig %>%
  layout(title = '3D Surface Plot of Avg Money for A',
         scene = list(
           xaxis = list(title = "Log(A's Fan Count)"),
           yaxis = list(title = "Log(B's Fan Count)"),
           zaxis = list(title = "Avg Money for A")
         ))

# Display the plot
fig
# 3D scatter of every pair in df: log fan counts of A and B on the x/y axes,
# per-fan amount on z, with marker colour mapped to the same z value.
fig <- plot_ly(
  data = df,
  x = ~log(A_fan_count),
  y = ~log(B_fan_count),
  z = ~avg_fan_total_cost_amt,
  type = "scatter3d",
  mode = "markers",
  marker = list(
    size = 2,
    color = ~avg_fan_total_cost_amt,
    colorscale = "Viridis",
    showscale = TRUE
  )
) %>%
  layout(
    title = "3D Scatter Plot of Avg Money for A",
    scene = list(
      xaxis = list(title = "Log(A's Fan Count)"),
      yaxis = list(title = "Log(B's Fan Count)"),
      zaxis = list(title = "Avg Money for A")
    )
  )

# Display the plot
fig
# Tabulate the fan-range labels of both sides within the filtered sample.
# NOTE(review): df keeps only rows with before_fans_count strictly between 100
# and 100,000, yet "0-100" and "10w-100w" still appear. fans_range is therefore
# not derived from before_fans_count (presumably from fans_user_num; confirm).
table(df$fans_range)
## 
##    0-100   100-1k 10w-100w    1k-1w   1w-10w 
##      336    60466      732    69652    22682
table(df$other_fans_range)
## 
##    0-100   100-1k 10w-100w    1k-1w   1w-10w 
##      349    60439      734    69536    22810
# Step 1: Set the correct (ascending) order for the fan-range categories
df <- df %>%
  mutate(fans_range = factor(fans_range, levels = c("0-100", "100-1k", "1k-1w", "1w-10w", "10w-100w")),
         other_fans_range = factor(other_fans_range, levels = c("0-100", "100-1k", "1k-1w", "1w-10w", "10w-100w")))

# Step 2: Aggregate the data to calculate the mean and 95% confidence intervals
# The standard error is computed BEFORE avg_fan_total_cost_amt is overwritten
# with the group mean. summarize() lets later expressions see earlier results,
# so calling sd() after the overwrite sees a single value and returns NA.
# The t quantile uses n - 1 degrees of freedom; groups with fewer than two
# observations get NA bounds.
agg_df <- df %>%
  group_by(fans_range, other_fans_range) %>%
  summarize(
    n_obs = sum(!is.na(avg_fan_total_cost_amt)),
    se = sd(avg_fan_total_cost_amt, na.rm = TRUE) / sqrt(n_obs),
    t_crit = if (n_obs > 1) qt(0.975, df = n_obs - 1) else NA_real_,
    avg_fan_total_cost_amt = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    ci_lower = avg_fan_total_cost_amt - t_crit * se,
    ci_upper = avg_fan_total_cost_amt + t_crit * se
  )
## `summarise()` has grouped output by 'fans_range'. You can override using the
## `.groups` argument.
# Step 3: Plot the data as a heatmap and print the mean and CI inside each tile
# The y axis is categorical, so error bars whose ymin/ymax are on the outcome
# scale cannot be drawn on it. The interval is shown as a text label instead.
ggplot(agg_df, aes(x = fans_range, y = other_fans_range, fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  geom_label(
    aes(label = sprintf("%.3g\n[%.3g, %.3g]", avg_fan_total_cost_amt, ci_lower, ci_upper)),
    fill = "white",
    size = 2.5
  ) +
  scale_fill_viridis_c() +
  labs(title = "Average Fan Total Cost Amount by Fan Ranges",
       x = "Fans Range",
       y = "Other Fans Range",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(dplyr)
library(ggplot2)

# Step 1: log10(1 + x)-transform both fan counts, then split each transformed
# variable into 20 equal-width bins. labels = FALSE stores the integer bin
# index (1..20) rather than an interval label.
df <- df %>%
  mutate(
    log_fans_user_num = log10(fans_user_num + 1),
    log_other_fans_user_num = log10(other_fans_user_num + 1)
  ) %>%
  mutate(
    log_fans_bins = cut(log_fans_user_num, breaks = 20, labels = FALSE),
    log_other_fans_bins = cut(log_other_fans_user_num, breaks = 20, labels = FALSE)
  )

# Step 2: Aggregate the data to calculate the mean and 95% confidence intervals
# The standard error is computed BEFORE avg_fan_total_cost_amt is overwritten
# with the group mean. summarize() lets later expressions see earlier results,
# so calling sd() after the overwrite sees a single value and returns NA.
# The t quantile uses n - 1 degrees of freedom; cells with fewer than two
# observations get NA bounds.
agg_df <- df %>%
  group_by(log_fans_bins, log_other_fans_bins) %>%
  summarize(
    n_obs = sum(!is.na(avg_fan_total_cost_amt)),
    se = sd(avg_fan_total_cost_amt, na.rm = TRUE) / sqrt(n_obs),
    t_crit = if (n_obs > 1) qt(0.975, df = n_obs - 1) else NA_real_,
    avg_fan_total_cost_amt = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    ci_lower = avg_fan_total_cost_amt - t_crit * se,
    ci_upper = avg_fan_total_cost_amt + t_crit * se,
    count = n()
  )
## `summarise()` has grouped output by 'log_fans_bins'. You can override using the
## `.groups` argument.
# Calculate the total number of data points for the percentage plot
total_data_points <- nrow(df)

# Step 3: Plot the heatmap with log-transformed bins
# A plain log fill transform maps cells whose mean is 0 (and the 0 break) to
# -Inf, so those tiles lose their colour. The pseudo-log transform is defined
# at 0 and behaves like log10 above `sigma`, which is set to the smallest
# positive break.
ggplot(agg_df, aes(x = log_fans_bins, y = log_other_fans_bins, fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  scale_fill_gradientn(colors = c("white", "red"),
                       trans = scales::pseudo_log_trans(sigma = 1e-6, base = 10),
                       breaks = c(0, 0.000001, 0.00001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 10),  # Manually specify breaks
                       labels = scales::comma_format()) +  # Adjust formatting for better readability
  labs(title = "Heatmap of Avg Fan Total Cost Amt by Log-Transformed Fan Bins",
       x = "Log(Binned Fans User Num)",
       y = "Log(Binned Other Fans User Num)",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_fill_gradientn(colors = c("white", "red"), trans = "log", :
## log-2.718282 transformation introduced infinite values.

# Step 3: Plot the heatmap with log-transformed bins but original-scale labels
# Fill: the pseudo-log transform is defined at 0, so cells with a zero mean keep
# a colour (a plain log transform maps them to -Inf).
# Axes: cut(breaks = 20) makes 20 equal-width bins between the minimum and
# maximum of log10(x + 1), so bin k is labelled with the approximate fan count
# at its midpoint. The earlier formula assumed bins start at 10^0 and span 19
# steps, which does not match those bins.
ggplot(agg_df, aes(x = as.numeric(log_fans_bins), y = as.numeric(log_other_fans_bins), fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  scale_fill_gradientn(colors = c("white", "red"),
                       trans = scales::pseudo_log_trans(sigma = 1e-6, base = 10),
                       breaks = c(0, 0.000001, 0.00001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 10),  # Manually specify breaks
                       labels = scales::comma_format()) +  # Adjust formatting for better readability
  scale_x_continuous(
    breaks = seq(1, 20),
    labels = function(x) {
      rng <- range(log10(df$fans_user_num + 1))
      format(round(10^(rng[1] + (x - 0.5) * diff(rng) / 20) - 1), scientific = FALSE, trim = TRUE)
    }
  ) +
  scale_y_continuous(
    breaks = seq(1, 20),
    labels = function(y) {
      rng <- range(log10(df$other_fans_user_num + 1))
      format(round(10^(rng[1] + (y - 0.5) * diff(rng) / 20) - 1), scientific = FALSE, trim = TRUE)
    }
  ) +
  labs(title = "Heatmap of Avg Fan Total Cost Amt by Log-Transformed Fan Bins",
       x = "Fans User Num (Original Scale)",
       y = "Other Fans User Num (Original Scale)",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_fill_gradientn(colors = c("white", "red"), trans = "log", :
## log-2.718282 transformation introduced infinite values.

# Step 4: Create a heatmap showing the percentage of the total data in each block
agg_df <- agg_df %>%
  mutate(pct_of_total = (count / total_data_points) * 100)

# Axes: cut(breaks = 20) makes 20 equal-width bins between the minimum and
# maximum of log10(x + 1), so bin k is labelled with the approximate fan count
# at its midpoint. The earlier formula assumed bins start at 10^0 and span 19
# steps, which does not match those bins.
ggplot(agg_df, aes(x = as.numeric(log_fans_bins), y = as.numeric(log_other_fans_bins), fill = pct_of_total)) +
  geom_tile() +
  scale_fill_viridis_c() +
  scale_x_continuous(
    breaks = seq(1, 20),
    labels = function(x) {
      rng <- range(log10(df$fans_user_num + 1))
      format(round(10^(rng[1] + (x - 0.5) * diff(rng) / 20) - 1), scientific = FALSE, trim = TRUE)
    }
  ) +
  scale_y_continuous(
    breaks = seq(1, 20),
    labels = function(y) {
      rng <- range(log10(df$other_fans_user_num + 1))
      format(round(10^(rng[1] + (y - 0.5) * diff(rng) / 20) - 1), scientific = FALSE, trim = TRUE)
    }
  ) +
  labs(title = "Percentage of Data in Each Log-Transformed Bin",
       x = "Fans User Num (Original Scale)",
       y = "Other Fans User Num (Original Scale)",
       fill = "Percentage of Total Data") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))