# Point the session at the project folder so the relative CSV path used below resolves.
setwd(dir = "C:/Users/ASUS/Desktop/快手")
数据:2022.01.01–2024.09.01 期间全部的 random PK pair(Data: all random PK pairs from 2022-01-01 to 2024-09-01)
# Load the cleaned random-PK pair table and print summary statistics for every column.
data <- read.csv(file = "Within_Random_Pk_final_clean.csv")
summary(data)
## pk_id p_date other_author_id author_id
## Min. :2.470e+09 Min. :20220101 Min. :1.313e+03 Min. :1.313e+03
## 1st Qu.:2.928e+09 1st Qu.:20220716 1st Qu.:6.435e+08 1st Qu.:6.476e+08
## Median :5.028e+09 Median :20230617 Median :1.123e+09 Median :1.125e+09
## Mean :4.456e+09 Mean :20229670 Mean :1.327e+09 Mean :1.329e+09
## 3rd Qu.:5.450e+09 3rd Qu.:20240127 3rd Qu.:1.893e+09 3rd Qu.:1.897e+09
## Max. :5.886e+09 Max. :20240901 Max. :4.345e+09 Max. :4.343e+09
##
## live_id valid_play_duration valid_play_user_num
## Min. :8.958e+09 Min. :0.000e+00 Min. : 0.0
## 1st Qu.:9.760e+09 1st Qu.:1.225e+05 1st Qu.: 9.0
## Median :1.113e+10 Median :1.120e+06 Median : 40.0
## Mean :1.092e+10 Mean :5.116e+07 Mean : 601.1
## 3rd Qu.:1.195e+10 3rd Qu.:6.723e+06 3rd Qu.: 160.0
## Max. :1.276e+10 Max. :9.158e+10 Max. :555052.0
##
## avg_valid_play_duration total_cost_amt total_cost_user_num
## Min. : 0 Min. : 0.0 Min. : 0.000
## 1st Qu.: 10022 1st Qu.: 0.0 1st Qu.: 0.000
## Median : 23524 Median : 0.0 Median : 0.000
## Mean : 41674 Mean : 156.5 Mean : 1.581
## 3rd Qu.: 49260 3rd Qu.: 5.0 3rd Qu.: 1.000
## Max. :8290602 Max. :330122.0 Max. :1218.000
##
## avg_total_cost_amt comment_cnt comment_user_num avg_comment_cnt
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.00 Median : 2.00 Median : 1.000 Median : 1.333
## Mean : 35.54 Mean : 37.16 Mean : 9.214 Mean : 2.504
## 3rd Qu.: 3.00 3rd Qu.: 19.00 3rd Qu.: 5.000 3rd Qu.: 3.400
## Max. :35599.64 Max. :223684.00 Max. :13554.000 Max. :190.998
##
## like_cnt like_user_num avg_like_cnt share_success_cnt
## Min. : 0.0 Min. : 0.000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.0 1st Qu.: 0.000
## Median : 11.0 Median : 2.000 Median : 4.0 Median : 0.000
## Mean : 730.7 Mean : 9.546 Mean : 43.6 Mean : 3.276
## 3rd Qu.: 157.0 3rd Qu.: 6.000 3rd Qu.: 26.8 3rd Qu.: 0.000
## Max. :1026119.0 Max. :8047.000 Max. :107281.7 Max. :25255.000
##
## share_success_user_num avg_share_success_cnt follow_author_cnt
## Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.000 Median : 0.0000 Median : 0.000
## Mean : 1.924 Mean : 0.2342 Mean : 5.142
## 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.: 1.000
## Max. :11274.000 Max. :208.9979 Max. :8101.000
##
## cancel_follow_author_cnt follow_user_cnt cancel_follow_user_cnt
## Min. : 0.000 Min. : 0.00 Min. : 0.00000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.00000
## Median : 0.000 Median : 0.00 Median : 0.00000
## Mean : 1.462 Mean : 0.78 Mean : 0.05411
## 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.00000
## Max. :4251.000 Max. :6141.00 Max. :66.00000
##
## join_fans_group_cnt live_new_user_num report_live_cnt report_user_cnt
## Min. : 0.000 Min. : 0.00000 Min. : 0.0000 Min. : 0.00000
## 1st Qu.: 0.000 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.00000
## Median : 0.000 Median : 0.00000 Median : 0.0000 Median : 0.00000
## Mean : 0.323 Mean : 0.05062 Mean : 0.2084 Mean : 0.01576
## 3rd Qu.: 0.000 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.00000
## Max. :919.000 Max. :114.00000 Max. :917.0000 Max. :109.00000
##
## is_follow_each_other pk_duration pk_play_user_num pk_play_duration
## Min. :0.000 Min. : 10 Min. : 0.00 Min. :0.000e+00
## 1st Qu.:0.000 1st Qu.: 12446 1st Qu.: 1.00 1st Qu.:3.746e+03
## Median :0.000 Median : 48760 Median : 5.00 Median :4.042e+04
## Mean :0.004 Mean :126488 Mean : 38.97 Mean :6.088e+05
## 3rd Qu.:0.000 3rd Qu.:254913 3rd Qu.: 23.00 3rd Qu.:4.092e+05
## Max. :1.000 Max. :446413 Max. :115900.00 Max. :1.419e+09
## NA's :6983
## avg_pk_play_duration pk_follow_author_cnt pk_cancel_follow_author_cnt
## Min. : 0 Min. : 0.0000 Min. : 0.00000
## 1st Qu.: 2133 1st Qu.: 0.0000 1st Qu.: 0.00000
## Median : 7110 Median : 0.0000 Median : 0.00000
## Mean : 13047 Mean : 0.1816 Mean : 0.06546
## 3rd Qu.: 16535 3rd Qu.: 0.0000 3rd Qu.: 0.00000
## Max. :718371 Max. :2003.0000 Max. :136.00000
##
## pk_follow_user_cnt pk_cancel_follow_user_cnt pk_like_cnt
## Min. : 0.00000 Min. :0.000000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.:0.000000 1st Qu.: 0.000
## Median : 0.00000 Median :0.000000 Median : 0.000
## Mean : 0.04517 Mean :0.002961 Mean : 8.021
## 3rd Qu.: 0.00000 3rd Qu.:0.000000 3rd Qu.: 1.000
## Max. :90.00000 Max. :5.000000 Max. :5960.000
##
## pk_like_user_num avg_pk_like_cnt pk_comment_cnt pk_comment_user_num
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.0000
## Mean : 0.621 Mean : 3.686 Mean : 1.107 Mean : 0.4891
## 3rd Qu.: 1.000 3rd Qu.: 1.000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :291.000 Max. :453.665 Max. :1304.000 Max. :243.0000
##
## avg_pk_comment_cnt pk_report_cnt pk_report_user_cnt avg_pk_report_cnt
## Min. : 0.0000 Min. : 0.00000 Min. :0.00e+00 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.:0.00e+00 1st Qu.: 0
## Median : 0.0000 Median : 0.00000 Median :0.00e+00 Median : 0
## Mean : 0.5544 Mean : 0.01089 Mean :5.93e-04 Mean : 1066
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.:0.00e+00 3rd Qu.: 0
## Max. :45.9995 Max. :71.00000 Max. :1.40e+01 Max. :7100000
##
## pk_total_cost_amt pk_total_cost_cnt avg_pk_total_cost_amt
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.000
## Mean : 29.29 Mean : 1.566 Mean : 9.553
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :130013.00 Max. :947.000 Max. :28887.711
##
## follow_author_fans_count unfollow_author_fans_count
## Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 0.00 Median : 0.0000
## Mean : 1.66 Mean : 0.7025
## 3rd Qu.: 0.00 3rd Qu.: 0.0000
## Max. :5209.00 Max. :2109.0000
##
## already_follow_other_fans_count gender fans_user_num
## Min. : 0.00000 Length:187126 Min. : 0
## 1st Qu.: 0.00000 Class :character 1st Qu.: 414
## Median : 0.00000 Mode :character Median : 1198
## Mean : 0.01516 Mean : 17161
## 3rd Qu.: 0.00000 3rd Qu.: 4708
## Max. :125.00000 Max. :11401415
##
## fans_range fans_group_fans_num author_type live_operation_tag
## Length:187126 Min. : 0.0 Length:187126 Length:187126
## Class :character 1st Qu.: 1.0 Class :character Class :character
## Mode :character Median : 5.0 Mode :character Mode :character
## Mean : 254.6
## 3rd Qu.: 65.0
## Max. :332787.0
##
## live_stream_category author_income_range age_range fre_country_region
## Length:187126 Length:187126 Length:187126 Length:187126
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## fre_city_level reg_day_cnt follow_user_num before_fans_count
## Length:187126 Min. : 0 Min. : 0 Min. : 0
## Class :character 1st Qu.:1036 1st Qu.: 139 1st Qu.: 342
## Mode :character Median :1624 Median : 592 Median : 986
## Mean :1565 Mean :1269 Mean : 14119
## 3rd Qu.:2109 3rd Qu.:1835 3rd Qu.: 3881
## Max. :4387 Max. :5023 Max. :10848875
##
## common_user_count live_id.y other_gender other_fans_user_num
## Min. : 0.0 Min. :8.959e+09 Length:187126 Min. : 0
## 1st Qu.: 0.0 1st Qu.:9.760e+09 Class :character 1st Qu.: 413
## Median : 0.0 Median :1.113e+10 Mode :character Median : 1190
## Mean : 103.8 Mean :1.092e+10 Mean : 17269
## 3rd Qu.: 0.0 3rd Qu.:1.195e+10 3rd Qu.: 4675
## Max. :986301.0 Max. :1.276e+10 Max. :37195072
##
## other_fans_range other_fans_group_fans_num other_author_type
## Length:187126 Min. : 0 Length:187126
## Class :character 1st Qu.: 1 Class :character
## Mode :character Median : 5 Mode :character
## Mean : 272
## 3rd Qu.: 65
## Max. :1065934
##
## other_live_operation_tag other_live_stream_category other_author_income_range
## Length:187126 Length:187126 Length:187126
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## other_age_range other_fre_country_region other_fre_city_level
## Length:187126 Length:187126 Length:187126
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## other_reg_day_cnt other_follow_user_num other_before_fans_count
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:1039 1st Qu.: 138 1st Qu.: 339
## Median :1625 Median : 592 Median : 982
## Mean :1567 Mean :1264 Mean : 14276
## 3rd Qu.:2110 3rd Qu.:1833 3rd Qu.: 3855
## Max. :4577 Max. :5022 Max. :36176448
##
## other_common_user_count live_id_order new_author_index
## Min. : 0.0 Min. : 1.00 Min. :1
## 1st Qu.: 0.0 1st Qu.: 1.00 1st Qu.:1
## Median : 0.0 Median : 1.00 Median :1
## Mean : 103.8 Mean : 2.18 Mean :1
## 3rd Qu.: 0.0 3rd Qu.: 1.00 3rd Qu.:1
## Max. :986301.0 Max. :218.00 Max. :1
##
# Show the column types and the first few values of each variable.
utils::str(data)
## 'data.frame': 187126 obs. of 85 variables:
## $ pk_id : num 2.47e+09 2.47e+09 2.47e+09 2.47e+09 2.47e+09 ...
## $ p_date : int 20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 20220101 ...
## $ other_author_id : num 2.06e+09 8.90e+08 2.67e+09 2.10e+09 1.33e+09 ...
## $ author_id : num 9.15e+08 1.27e+09 2.66e+09 2.05e+07 1.45e+09 ...
## $ live_id : num 8.96e+09 8.96e+09 8.96e+09 8.96e+09 8.96e+09 ...
## $ valid_play_duration : num 4525836 49630 306248 4608075 486888 ...
## $ valid_play_user_num : int 155 1 9 30 13 9 16 53 38 14 ...
## $ avg_valid_play_duration : num 29199 49630 34028 153602 37453 ...
## $ total_cost_amt : int 10 0 0 0 0 0 208 208 0 0 ...
## $ total_cost_user_num : int 1 0 0 0 0 0 1 1 0 0 ...
## $ avg_total_cost_amt : num 10 0 0 0 0 ...
## $ comment_cnt : int 32 0 0 0 0 4 16 21 4 4 ...
## $ comment_user_num : int 5 0 0 0 0 1 4 5 1 3 ...
## $ avg_comment_cnt : num 6.4 0 0 0 0 ...
## $ like_cnt : int 306 0 0 0 0 0 12 20 0 2 ...
## $ like_user_num : int 9 0 0 0 0 0 3 1 0 1 ...
## $ avg_like_cnt : num 34 0 0 0 0 ...
## $ share_success_cnt : int 3 0 0 1 0 0 0 0 0 0 ...
## $ share_success_user_num : int 3 0 0 1 0 0 0 0 0 0 ...
## $ avg_share_success_cnt : num 1 0 0 1 0 ...
## $ follow_author_cnt : int 4 0 0 0 1 0 0 0 0 0 ...
## $ cancel_follow_author_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ follow_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancel_follow_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ join_fans_group_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ live_new_user_num : int 0 0 0 0 0 0 0 0 0 0 ...
## $ report_live_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ report_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ is_follow_each_other : int NA NA NA NA 0 NA NA NA NA NA ...
## $ pk_duration : int 427197 390710 373737 334503 371514 310907 323632 333151 326283 370738 ...
## $ pk_play_user_num : int 3 2 2 11 9 4 1 6 13 5 ...
## $ pk_play_duration : int 67778 22072 30992 5038 344217 98164 39802 214399 290473 612846 ...
## $ avg_pk_play_duration : num 22593 11036 15496 458 38246 ...
## $ pk_follow_author_cnt : int 0 0 0 0 1 0 0 0 0 0 ...
## $ pk_cancel_follow_author_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pk_follow_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pk_cancel_follow_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pk_like_cnt : int 0 0 0 0 0 0 0 2 0 0 ...
## $ pk_like_user_num : int 0 0 0 0 0 0 0 1 0 0 ...
## $ avg_pk_like_cnt : num 0 0 0 0 0 ...
## $ pk_comment_cnt : int 0 0 0 0 0 1 0 0 1 1 ...
## $ pk_comment_user_num : int 0 0 0 0 0 1 0 0 1 1 ...
## $ avg_pk_comment_cnt : num 0 0 0 0 0 ...
## $ pk_report_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pk_report_user_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ avg_pk_report_cnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pk_total_cost_amt : int 0 0 0 0 0 0 0 208 0 0 ...
## $ pk_total_cost_cnt : int 0 0 0 0 0 0 0 1 0 0 ...
## $ avg_pk_total_cost_amt : num 0 0 0 0 0 ...
## $ follow_author_fans_count : int 3 0 0 0 0 0 0 0 0 0 ...
## $ unfollow_author_fans_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ already_follow_other_fans_count: int 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : chr "F" "M" "F" "F" ...
## $ fans_user_num : int 899 872 677 8199 1000 15916 183 3118 138061 1615 ...
## $ fans_range : chr "100-1k" "100-1k" "100-1k" "1k-1w" ...
## $ fans_group_fans_num : int 32 1 38 4588 2 877 14 15 1044 5 ...
## $ author_type : chr "秀场主播" "秀场主播" "秀场主播" "秀场主播" ...
## $ live_operation_tag : chr "闲聊互动" "UNKNOWN" "颜值" "闲聊互动" ...
## $ live_stream_category : chr "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2109=帅哥闲聊互动,3431=其他帅哥闲聊互动" "" "" ...
## $ author_income_range : chr "(50,500]" "(10,50]" "(50,500]" "(500,1000]" ...
## $ age_range : chr "50+" "18-23" "18-23" "24-30" ...
## $ fre_country_region : chr "南方" "南方" "南方" "北方" ...
## $ fre_city_level : chr "一线城市" "新一线城市" "二线城市" "三线城市" ...
## $ reg_day_cnt : int 1363 1037 37 2633 881 662 1400 2162 2711 1196 ...
## $ follow_user_num : int 112 428 53 49 679 3518 44 1 878 3026 ...
## $ before_fans_count : int 901 873 684 8296 999 15900 189 3126 138040 1630 ...
## $ common_user_count : int 0 0 1 7 3 0 0 1 231 0 ...
## $ live_id.y : num 8.96e+09 8.96e+09 8.96e+09 8.96e+09 8.96e+09 ...
## $ other_gender : chr "F" "M" "F" "F" ...
## $ other_fans_user_num : int 3476 1275 1040 20216 479 845 760 12912 17324 804 ...
## $ other_fans_range : chr "1k-1w" "1k-1w" "1k-1w" "1w-10w" ...
## $ other_fans_group_fans_num : int 38 4 48 111 0 66 28 247 180 2 ...
## $ other_author_type : chr "秀场主播" "秀场主播" "秀场主播" "秀场主播" ...
## $ other_live_operation_tag : chr "才艺技能展示" "闲聊互动" "才艺技能展示" "颜值" ...
## $ other_live_stream_category : chr "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2108=美女闲聊互动,3428=其他美女闲聊互动" "1002=才艺技能展示,2020=唱歌,3045=热歌金曲" "1009=闲聊互动,2108=美女闲聊互动,3426=颜值美女闲聊互动" ...
## $ other_author_income_range : chr "(50,500]" "(10,50]" "(500,1000]" "(500,1000]" ...
## $ other_age_range : chr "24-30" "18-23" "31-40" "50+" ...
## $ other_fre_country_region : chr "南方" "南方" "北方" "北方" ...
## $ other_fre_city_level : chr "二线城市" "二线城市" "三线城市" "新一线城市" ...
## $ other_reg_day_cnt : int 504 1387 31 456 987 1857 2172 1992 2080 1826 ...
## $ other_follow_user_num : int 8 306 534 138 253 126 76 88 66 475 ...
## $ other_before_fans_count : int 3473 1273 1040 20220 497 849 772 13003 17335 807 ...
## $ other_common_user_count : int 0 0 1 7 3 0 0 1 231 0 ...
## $ live_id_order : int 1 1 1 1 1 1 1 1 1 1 ...
## $ new_author_index : int 1 1 1 1 1 1 1 1 1 1 ...
### Competition
# (1) Opponent's fan count
# table(data$other_before_fans_count)
# (2) Opponent's fan count relative to the author's own fan count
#     (the +1 in the denominator avoids division by zero).
data[["relative_fans"]] <- with(
  data,
  other_before_fans_count / (before_fans_count + 1)
)
# (3) Fan overlap: common_user_count as a share of the author's own fans,
#     and as a share of the pair's combined fan counts.
data[["fans_overlap_from_author"]] <- with(
  data,
  common_user_count / (before_fans_count + 1)
)
data[["fans_overlap_from_pair"]] <- with(
  data,
  common_user_count / (before_fans_count + other_before_fans_count + 1)
)
# (4) Both authors have many fans: strong competition.
# fans_pair_range: 1 when both sides have more than 100,000 fans, otherwise 0.
data <- data %>%
  mutate(fans_pair_range = case_when(
    (before_fans_count > 100000) & (other_before_fans_count > 100000) ~ 1,
    TRUE ~ 0
  ))
# fans_piar_cat: 2x2 size category of the pair, where "big" means strictly
# more than 10,000 fans. The "small" side uses <= so that a count of exactly
# 10,000 is classified; with strict < on both sides such a row matched no
# branch, became NA, and was silently dropped by later regressions.
data <- data %>%
  mutate(fans_piar_cat = case_when(
    (before_fans_count > 10000) & (other_before_fans_count > 10000) ~ 3,
    (before_fans_count > 10000) & (other_before_fans_count <= 10000) ~ 2,
    (before_fans_count <= 10000) & (other_before_fans_count > 10000) ~ 1,
    (before_fans_count <= 10000) & (other_before_fans_count <= 10000) ~ 0
  ))
# `levels` is spelled out in full instead of relying on partial matching of
# `level`. The labels are kept byte-identical (including the trailing space in
# "Small vs Small ") because they turn into level / coefficient names that
# other code may reference.
data <- data %>%
  mutate(fans_piar_cat = factor(
    fans_piar_cat,
    levels = c(0, 1, 2, 3),
    labels = c("Small vs Small ", "Small vs Big", "Big vs Small", "Big vs Big")
  ))
### Cooperation
# (1) Split by live-stream type: 1 when both authors carry the same
#     live_operation_tag, otherwise 0 (a missing comparison also yields 0).
data <- mutate(
  data,
  is_cooperative = if_else(
    live_operation_tag == other_live_operation_tag, 1, 0,
    missing = 0
  )
)
# Number of pairs in each fan-size category.
table(data[["fans_piar_cat"]])
##
## Small vs Small Small vs Big Big vs Small Big vs Big
## 149898 10906 10701 15620
# Number of pairs without (0) and with (1) a matching live_operation_tag.
table(data[["is_cooperative"]])
##
## 0 1
## 121925 65201
# Distribution of before_fans_count.
summary(data[["before_fans_count"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 342 986 14119 3881 10848875
# table(data$total_cost_amt)  # 119056 rows are 0
# total_cost_amt per fan (the +1 in the denominator avoids division by zero).
data[["avg_fan_total_cost_amt"]] <- with(
  data,
  total_cost_amt / (before_fans_count + 1)
)
# Regress log(1 + per-fan cost) on the pair size category, the cooperation
# dummy and their interaction, with p_date fixed effects and standard errors
# clustered by author_id.
model <- feols(
  log(avg_fan_total_cost_amt + 1) ~
    factor(fans_piar_cat) + is_cooperative +
    factor(fans_piar_cat) * is_cooperative | p_date,
  data = data,
  vcov = ~author_id
)
## NOTE: 1 observation removed because of NA values (RHS: 1).
summary(model)
## OLS estimation, Dep. Var.: log(avg_fan_total_cost_amt + 1)
## Observations: 187,125
## Fixed-effects: p_date: 975
## Standard-errors: Clustered (author_id)
## Estimate Std. Error t value
## factor(fans_piar_cat)Small vs Big 0.028320 0.002953 9.58926
## factor(fans_piar_cat)Big vs Small -0.024452 0.001229 -19.89522
## factor(fans_piar_cat)Big vs Big -0.025267 0.001082 -23.34969
## is_cooperative 0.009500 0.001349 7.04310
## factor(fans_piar_cat)Small vs Big:is_cooperative 0.005994 0.005015 1.19523
## factor(fans_piar_cat)Big vs Small:is_cooperative -0.011357 0.001836 -6.18599
## factor(fans_piar_cat)Big vs Big:is_cooperative -0.007411 0.001793 -4.13344
## Pr(>|t|)
## factor(fans_piar_cat)Small vs Big < 2.2e-16 ***
## factor(fans_piar_cat)Big vs Small < 2.2e-16 ***
## factor(fans_piar_cat)Big vs Big < 2.2e-16 ***
## is_cooperative 1.8878e-12 ***
## factor(fans_piar_cat)Small vs Big:is_cooperative 2.3200e-01
## factor(fans_piar_cat)Big vs Small:is_cooperative 6.1867e-10 ***
## factor(fans_piar_cat)Big vs Big:is_cooperative 3.5755e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.219571 Adj. R2: 0.005954
## Within R2: 0.003705
# Keep only pairs where both fan counts lie strictly between 100 and 100,000.
df <- data[with(data, before_fans_count > 100 & before_fans_count < 100000), ]
df <- df[with(df, other_before_fans_count > 100 & other_before_fans_count < 100000), ]
# Short aliases for the two fan counts, plus their log(1 + count) versions.
df <- mutate(
  df,
  A_fan_count = before_fans_count,
  B_fan_count = other_before_fans_count,
  log_A_fan_count = log(A_fan_count + 1),
  log_B_fan_count = log(B_fan_count + 1)
)
# Step 1: Average per-fan cost for every (A_fan_count, B_fan_count) combination.
# `.groups = "drop"` returns an ungrouped tibble (and silences the summarise()
# grouping message), so plotly receives plain long-format data.
agg_df <- df %>%
  group_by(A_fan_count, B_fan_count) %>%
  summarize(
    avg_money_A = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    .groups = "drop"
  )
# Step 2: Create the 3D surface plot.
# A plotly "surface" trace needs `z` as a 2-D matrix over an x/y grid, but
# agg_df is long-format (one row per point). The points are therefore
# triangulated with a "mesh3d" trace and coloured through `intensity`.
fig <- plot_ly(agg_df,
               x = ~log(A_fan_count),
               y = ~log(B_fan_count),
               z = ~avg_money_A,
               intensity = ~avg_money_A,
               type = 'mesh3d',
               colorscale = 'Viridis')
# Step 3: Customize the layout for better visualization
fig <- fig %>%
  layout(title = '3D Surface Plot of Avg Money for A',
         scene = list(
           xaxis = list(title = "Log(A's Fan Count)"),
           yaxis = list(title = "Log(B's Fan Count)"),
           zaxis = list(title = "Avg Money for A")
         ))
# Display the plot
fig
# 3D scatter of every retained pair: log fan counts of both sides on x / y,
# per-fan cost on z, with marker colour following the same z value.
fig <- plot_ly(
  df,
  x = ~log(A_fan_count),
  y = ~log(B_fan_count),
  z = ~avg_fan_total_cost_amt,
  type = 'scatter3d',
  mode = 'markers',
  marker = list(
    size = 2,
    color = ~avg_fan_total_cost_amt,
    colorscale = 'Viridis',
    showscale = TRUE
  )
)
# Title and axis captions.
fig <- layout(
  fig,
  title = '3D Scatter Plot of Avg Money for A',
  scene = list(
    xaxis = list(title = "Log(A's Fan Count)"),
    yaxis = list(title = "Log(B's Fan Count)"),
    zaxis = list(title = "Avg Money for A")
  )
)
# Display the plot
fig
# Frequency of each fans_range label in the filtered sample.
table(df[["fans_range"]])
##
## 0-100 100-1k 10w-100w 1k-1w 1w-10w
## 336 60466 732 69652 22682
# Same tabulation for the opponent's other_fans_range label.
table(df[["other_fans_range"]])
##
## 0-100 100-1k 10w-100w 1k-1w 1w-10w
## 349 60439 734 69536 22810
# Step 1: Set the correct order for the categories (smallest to largest range).
df <- df %>%
  mutate(fans_range = factor(fans_range, levels = c("0-100", "100-1k", "1k-1w", "1w-10w", "10w-100w")),
         other_fans_range = factor(other_fans_range, levels = c("0-100", "100-1k", "1k-1w", "1w-10w", "10w-100w")))
# Step 2: Aggregate the data to calculate the mean and a 95% t-interval.
# The sample size and standard deviation are computed BEFORE the mean is
# written back to `avg_fan_total_cost_amt`: inside summarize() a later
# expression sees the freshly created scalar, so sd() of it was always NA.
# The t-quantile uses n - 1 degrees of freedom (floored at 1 so qt() stays
# quiet for single-observation cells, whose sd and CI are NA anyway).
# `.groups = "drop_last"` keeps the original grouping of the result.
agg_df <- df %>%
  group_by(fans_range, other_fans_range) %>%
  summarize(
    n_obs = sum(!is.na(avg_fan_total_cost_amt)),
    sd_cost = sd(avg_fan_total_cost_amt, na.rm = TRUE),
    avg_fan_total_cost_amt = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    ci_half_width = qt(0.975, df = pmax(n_obs - 1, 1)) * sd_cost / sqrt(n_obs),
    ci_lower = avg_fan_total_cost_amt - ci_half_width,
    ci_upper = avg_fan_total_cost_amt + ci_half_width,
    .groups = "drop_last"
  ) %>%
  dplyr::select(-n_obs, -sd_cost, -ci_half_width)
# Step 3: Plot the data as a heatmap and print the mean with its CI in each tile.
# The CI bounds are in cost units, so they cannot be drawn as error bars on the
# categorical y axis; they are shown as a text label instead.
ggplot(agg_df, aes(x = fans_range, y = other_fans_range, fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  geom_label(
    aes(label = sprintf("%.3g\n[%.3g, %.3g]", avg_fan_total_cost_amt, ci_lower, ci_upper)),
    fill = "white",
    size = 3
  ) +
  scale_fill_viridis_c() +
  labs(title = "Average Fan Total Cost Amount by Fan Ranges",
       x = "Fans Range",
       y = "Other Fans Range",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(ggplot2)
# Step 1: Log-transform and bin the variables into 20 categories with new names.
# cut(..., breaks = 20, labels = FALSE) returns the integer index (1..20) of
# 20 equal-width bins spanning the observed range of the log10 values.
df <- df %>%
  mutate(log_fans_user_num = log10(fans_user_num + 1),
         log_other_fans_user_num = log10(other_fans_user_num + 1),
         log_fans_bins = cut(log_fans_user_num, breaks = 20, labels = FALSE),
         log_other_fans_bins = cut(log_other_fans_user_num, breaks = 20, labels = FALSE))
# Step 2: Aggregate the data to calculate the mean, a 95% t-interval and the
# number of rows per cell.
# The sample size and standard deviation are computed BEFORE the mean is
# written back to `avg_fan_total_cost_amt`: inside summarize() a later
# expression sees the freshly created scalar, so sd() of it was always NA.
# The t-quantile uses n - 1 degrees of freedom (floored at 1 so qt() stays
# quiet for single-observation cells, whose sd and CI are NA anyway).
# `.groups = "drop_last"` keeps the original grouping of the result.
agg_df <- df %>%
  group_by(log_fans_bins, log_other_fans_bins) %>%
  summarize(
    n_obs = sum(!is.na(avg_fan_total_cost_amt)),
    sd_cost = sd(avg_fan_total_cost_amt, na.rm = TRUE),
    avg_fan_total_cost_amt = mean(avg_fan_total_cost_amt, na.rm = TRUE),
    ci_half_width = qt(0.975, df = pmax(n_obs - 1, 1)) * sd_cost / sqrt(n_obs),
    ci_lower = avg_fan_total_cost_amt - ci_half_width,
    ci_upper = avg_fan_total_cost_amt + ci_half_width,
    count = n(),
    .groups = "drop_last"
  ) %>%
  dplyr::select(-n_obs, -sd_cost, -ci_half_width)
## `summarise()` has grouped output by 'log_fans_bins'. You can override using the
## `.groups` argument.
# Calculate the total number of data points for the percentage plot
total_data_points <- nrow(df)
# Step 3: Plot the heatmap with log-transformed bins.
# The fill scale is log-transformed, so a break at 0 (log(0) = -Inf) can never
# be drawn and has been removed from the break list. Cells whose mean is
# exactly 0 still transform to -Inf and are drawn in the scale's NA colour.
ggplot(agg_df, aes(x = log_fans_bins, y = log_other_fans_bins, fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  scale_fill_gradientn(colors = c("white", "red"),
                       trans = "log",
                       breaks = c(0.000001, 0.00001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 10),
                       labels = scales::comma_format()) +
  labs(title = "Heatmap of Avg Fan Total Cost Amt by Log-Transformed Fan Bins",
       x = "Log(Binned Fans User Num)",
       y = "Log(Binned Other Fans User Num)",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_fill_gradientn(colors = c("white", "red"), trans = "log", :
## log-2.718282 transformation introduced infinite values.
# Original-scale fan count at the centre of bin `bin`, for bins produced by
# cut(log10(counts + 1), breaks = n_bins): cut() splits the observed range
# [min, max] of the log values into n_bins equal-width intervals, so the centre
# of bin i is min + (i - 0.5) * (max - min) / n_bins on the log10 scale.
fan_bin_midpoint <- function(bin, counts, n_bins = 20) {
  log_rng <- range(log10(counts + 1), na.rm = TRUE)
  round(10^(log_rng[1] + (bin - 0.5) * diff(log_rng) / n_bins) - 1)
}
# Step 3: Plot the heatmap with log-transformed bins but original-scale labels.
# Tick labels use the real bin centres; the previous formula assumed bins that
# start at 1 fan and span max / 19, which is not how cut() builds them.
# The fill scale is log-transformed, so the impossible break at 0 is removed.
ggplot(agg_df, aes(x = as.numeric(log_fans_bins), y = as.numeric(log_other_fans_bins), fill = avg_fan_total_cost_amt)) +
  geom_tile() +
  scale_fill_gradientn(colors = c("white", "red"),
                       trans = "log",
                       breaks = c(0.000001, 0.00001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 10),
                       labels = scales::comma_format()) +
  scale_x_continuous(breaks = seq(1, 20), labels = function(x) fan_bin_midpoint(x, df$fans_user_num)) +
  scale_y_continuous(breaks = seq(1, 20), labels = function(y) fan_bin_midpoint(y, df$other_fans_user_num)) +
  labs(title = "Heatmap of Avg Fan Total Cost Amt by Log-Transformed Fan Bins",
       x = "Fans User Num (Original Scale)",
       y = "Other Fans User Num (Original Scale)",
       fill = "Avg Fan Total Cost Amt") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in scale_fill_gradientn(colors = c("white", "red"), trans = "log", :
## log-2.718282 transformation introduced infinite values.
# Original-scale fan count at the centre of bin `bin`, for bins produced by
# cut(log10(counts + 1), breaks = n_bins): cut() splits the observed range
# [min, max] of the log values into n_bins equal-width intervals, so the centre
# of bin i is min + (i - 0.5) * (max - min) / n_bins on the log10 scale.
fan_bin_midpoint <- function(bin, counts, n_bins = 20) {
  log_rng <- range(log10(counts + 1), na.rm = TRUE)
  round(10^(log_rng[1] + (bin - 0.5) * diff(log_rng) / n_bins) - 1)
}
# Step 4: Create a heatmap showing the percentage of the total data in each block
agg_df <- agg_df %>%
  mutate(pct_of_total = (count / total_data_points) * 100)
# Tick labels use the real bin centres; the previous formula assumed bins that
# start at 1 fan and span max / 19, which is not how cut() builds them.
ggplot(agg_df, aes(x = as.numeric(log_fans_bins), y = as.numeric(log_other_fans_bins), fill = pct_of_total)) +
  geom_tile() +
  scale_fill_viridis_c() +
  scale_x_continuous(breaks = seq(1, 20), labels = function(x) fan_bin_midpoint(x, df$fans_user_num)) +
  scale_y_continuous(breaks = seq(1, 20), labels = function(y) fan_bin_midpoint(y, df$other_fans_user_num)) +
  labs(title = "Percentage of Data in Each Log-Transformed Bin",
       x = "Fans User Num (Original Scale)",
       y = "Other Fans User Num (Original Scale)",
       fill = "Percentage of Total Data") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))