# Switch the working directory so the relative CSV path below resolves.
setwd("C:/Users/ASUS/Desktop/快手")
# Read the author live-stream data set.
data <- read_csv("Author_live_data.csv")
# Strip the leading "author_live_downsample." prefix from every column name.
new_column_names <- names(data)
new_column_names <- sub("^author_live_downsample\\.", "", new_column_names)
colnames(data) <- new_column_names
查看数据结构
colnames(data)
## [1] "author_id" "live_id"
## [3] "p_date" "is_pk_live"
## [5] "valid_play_duration" "valid_play_user_num"
## [7] "avg_valid_play_duration" "total_cost_amt"
## [9] "total_cost_user_num" "avg_total_cost_amt"
## [11] "comment_cnt" "comment_user_num"
## [13] "avg_comment_cnt" "like_cnt"
## [15] "like_user_num" "avg_like_cnt"
## [17] "share_success_cnt" "share_success_user_num"
## [19] "avg_share_success_cnt" "follow_author_cnt"
## [21] "gender" "age_range"
## [23] "fre_country_region" "fre_city_level"
## [25] "fans_user_num" "follow_user_num"
## [27] "reg_day_cnt" "is_big_v"
## [29] "author_type" "live_type"
## [31] "is_shop_car_live" "is_paid_show_live"
## [33] "live_content_category"
head(data)
## # A tibble: 6 × 33
## author_id live_id p_date is_pk_live valid_play_duration valid_play_user_num
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1926380389 9.44e9 2.02e7 0 11945 5
## 2 1804013348 9.47e9 2.02e7 0 15520318 246
## 3 1797821702 9.47e9 2.02e7 1 1015002 17
## 4 2081027986 9.44e9 2.02e7 0 17058 3
## 5 931438415 9.44e9 2.02e7 0 574724 31
## 6 230462094 9.45e9 2.02e7 1 604570 23
## # ℹ 27 more variables: avg_valid_play_duration <dbl>, total_cost_amt <dbl>,
## # total_cost_user_num <dbl>, avg_total_cost_amt <dbl>, comment_cnt <dbl>,
## # comment_user_num <dbl>, avg_comment_cnt <dbl>, like_cnt <dbl>,
## # like_user_num <dbl>, avg_like_cnt <dbl>, share_success_cnt <dbl>,
## # share_success_user_num <dbl>, avg_share_success_cnt <dbl>,
## # follow_author_cnt <dbl>, gender <chr>, age_range <chr>,
## # fre_country_region <chr>, fre_city_level <chr>, fans_user_num <dbl>, …
summary(data)
## author_id live_id p_date is_pk_live
## Min. :3.000e+00 Min. :9.427e+09 Min. :20220421 Min. :0.0000
## 1st Qu.:5.112e+08 1st Qu.:9.443e+09 1st Qu.:20220423 1st Qu.:0.0000
## Median :9.324e+08 Median :9.452e+09 Median :20220425 Median :0.0000
## Mean :1.095e+09 Mean :9.452e+09 Mean :20220425 Mean :0.1514
## 3rd Qu.:1.499e+09 3rd Qu.:9.461e+09 3rd Qu.:20220427 3rd Qu.:0.0000
## Max. :2.859e+09 Max. :9.470e+09 Max. :20220429 Max. :1.0000
## valid_play_duration valid_play_user_num avg_valid_play_duration
## Min. :0.000e+00 Min. : 0 Min. : 0
## 1st Qu.:3.133e+04 1st Qu.: 4 1st Qu.: 6119
## Median :7.069e+05 Median : 22 Median : 25499
## Mean :1.104e+08 Mean : 784 Mean : 62373
## 3rd Qu.:6.667e+06 3rd Qu.: 100 3rd Qu.: 67457
## Max. :1.861e+12 Max. :5845708 Max. :21966802
## total_cost_amt total_cost_user_num avg_total_cost_amt comment_cnt
## Min. : -575.0 Min. : 0.000 Min. : -574.99 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0
## Median : 0.0 Median : 0.000 Median : 0.00 Median : 1.0
## Mean : 238.8 Mean : 2.585 Mean : 23.59 Mean : 50.4
## 3rd Qu.: 1.0 3rd Qu.: 1.000 3rd Qu.: 1.00 3rd Qu.: 14.0
## Max. :2665249.0 Max. :18918.000 Max. :99999.00 Max. :800939.0
## comment_user_num avg_comment_cnt like_cnt like_user_num
## Min. : 0.00 Min. : 0.000 Min. : 0 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0 1st Qu.: 0.00
## Median : 1.00 Median : 1.000 Median : 2 Median : 1.00
## Mean : 11.62 Mean : 2.209 Mean : 1364 Mean : 15.56
## 3rd Qu.: 4.00 3rd Qu.: 3.000 3rd Qu.: 92 3rd Qu.: 5.00
## Max. :74953.00 Max. :382.996 Max. :16165318 Max. :113578.00
## avg_like_cnt share_success_cnt share_success_user_num
## Min. : 0.0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.5 Median : 0.00 Median : 0.000
## Mean : 32.2 Mean : 3.78 Mean : 1.835
## 3rd Qu.: 16.6 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :594909.1 Max. :36956.00 Max. :14619.000
## avg_share_success_cnt follow_author_cnt gender age_range
## Min. : 0.0000 Min. : 0.00 Length:958541 Length:958541
## 1st Qu.: 0.0000 1st Qu.: 0.00 Class :character Class :character
## Median : 0.0000 Median : 0.00 Mode :character Mode :character
## Mean : 0.2367 Mean : 9.41
## 3rd Qu.: 0.0000 3rd Qu.: 1.00
## Max. :207.9979 Max. :42538.00
## fre_country_region fre_city_level fans_user_num follow_user_num
## Length:958541 Length:958541 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 267 1st Qu.: 107.0
## Mode :character Mode :character Median : 777 Median : 423.0
## Mean : 20572 Mean : 930.6
## 3rd Qu.: 2809 3rd Qu.:1225.0
## Max. :40103191 Max. :5033.0
## reg_day_cnt is_big_v author_type live_type
## Min. : 0 Min. :0.0000000 Length:958541 Min. :1.000
## 1st Qu.: 951 1st Qu.:0.0000000 Class :character 1st Qu.:1.000
## Median :1459 Median :0.0000000 Mode :character Median :1.000
## Mean :1374 Mean :0.0002326 Mean :1.461
## 3rd Qu.:1809 3rd Qu.:0.0000000 3rd Qu.:2.000
## Max. :3870 Max. :1.0000000 Max. :3.000
## is_shop_car_live is_paid_show_live live_content_category
## Min. :0.00000 Min. :0 Length:958541
## 1st Qu.:0.00000 1st Qu.:0 Class :character
## Median :0.00000 Median :0 Mode :character
## Mean :0.05704 Mean :0
## 3rd Qu.:0.00000 3rd Qu.:0
## Max. :1.00000 Max. :0
# Number of rows (one is_pk_live entry per row).
print(nrow(data))
## [1] 958541
# Number of distinct author_id values.
print(length(unique(data[["author_id"]])))
## [1] 719070
# Number of rows flagged as PK live streams (is_pk_live equal to 1).
sum(data[["is_pk_live"]] == 1)
## [1] 145132
Data结构:总记录数(行数):958541;Author_id数(去重):719070;有PK的记录数:145132
处理类别变量函数
#' Add a 0-based integer code column for a categorical column
#'
#' Every distinct value of `data[[column]]` is numbered in order of first
#' appearance (first value seen -> 0, next new value -> 1, ...). The codes
#' therefore depend on row order and do not follow any fixed label mapping
#' such as "F" -> 0 / "M" -> 1. `NA` is treated as a category of its own.
#'
#' @param data A data frame (or tibble).
#' @param column Name of the column to encode.
#' @return `data` with one extra numeric column named `<column>_number`.
assign_numbers <- function(data, column) {
  values <- data[[column]]
  # A missing column yields NULL here; fail with a clear message instead of
  # the obscure "replacement has 0 rows" error raised by the assignment below.
  if (is.null(values)) {
    stop("Column '", column, "' not found in `data`", call. = FALSE)
  }
  # match() against unique() gives 1-based positions in first-appearance order.
  codes <- match(values, unique(values))
  # Shift to 0-based codes.
  data[[paste0(column, "_number")]] <- codes - 1
  data
}
性别:gender
# Encode gender: adds gender_number with 0-based codes in order of first appearance.
data <- assign_numbers(data, "gender")
# List the distinct codes; the output below shows three, so gender is not binary here.
unique(data$gender_number)
## [1] 0 1 2
年龄:age_range
# Encode age_range into age_range_number (codes follow first appearance, not age order).
data <- assign_numbers(data, "age_range")
unique(data$age_range_number)
## [1] 0 1 2 3 4 5 6 7
地区:fre_country_region
# Encode fre_country_region into fre_country_region_number.
data <- assign_numbers(data, "fre_country_region")
unique(data$fre_country_region_number)
## [1] 0 1 2
城市:fre_city_level
# Encode fre_city_level into fre_city_level_number (codes follow first appearance).
data <- assign_numbers(data, "fre_city_level")
unique(data$fre_city_level_number)
## [1] 0 1 2 3 4 5 6
作者类型:author_type
# Encode author_type into author_type_number.
data <- assign_numbers(data, "author_type")
unique(data$author_type_number)
## [1] 0 1 2 3
直播内容类型:live_content_category
# Encode live_content_category into live_content_category_number.
data <- assign_numbers(data, "live_content_category")
unique(data$live_content_category_number)
## [1] 0 1
library(ebal)
library(tidyverse)

# Outcome variable: total_cost_amt.
Y <- pull(data, total_cost_amt)

# Treatment indicator: is_pk_live.
D <- pull(data, is_pk_live)

# Matching covariates (author features), as a numeric matrix.
X <- as.matrix(
  select(
    data,
    gender_number, age_range_number, fre_country_region_number,
    fre_city_level_number, fans_user_num, follow_user_num,
    reg_day_cnt, author_type_number
  )
)

# Compute the balancing weights.
eb <- ebalance(D, X)
## Converged within tolerance

# Treated rows (D == 1) keep a weight of 1.
data_treat <- filter(data, D == 1)
data_treat <- mutate(data_treat, weights = 1)

# Control rows (D == 0) take the weights returned by ebalance().
data_con <- filter(data, D == 0)
data_con <- mutate(data_con, weights = eb$w)

# Stack treated rows first, then control rows.
data_weighted <- bind_rows(data_treat, data_con)
head(data_weighted)
## # A tibble: 6 × 41
## author_id live_id p_date is_pk_live valid_play_duration valid_play_user_num
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1797821702 9.47e9 2.02e7 1 1015002 17
## 2 230462094 9.45e9 2.02e7 1 604570 23
## 3 688725060 9.45e9 2.02e7 1 13182275 274
## 4 586073164 9.47e9 2.02e7 1 18683824825 85252
## 5 57903715 9.45e9 2.02e7 1 44059975 298
## 6 665432520 9.46e9 2.02e7 1 3727673 89
## # ℹ 35 more variables: avg_valid_play_duration <dbl>, total_cost_amt <dbl>,
## # total_cost_user_num <dbl>, avg_total_cost_amt <dbl>, comment_cnt <dbl>,
## # comment_user_num <dbl>, avg_comment_cnt <dbl>, like_cnt <dbl>,
## # like_user_num <dbl>, avg_like_cnt <dbl>, share_success_cnt <dbl>,
## # share_success_user_num <dbl>, avg_share_success_cnt <dbl>,
## # follow_author_cnt <dbl>, gender <chr>, age_range <chr>,
## # fre_country_region <chr>, fre_city_level <chr>, fans_user_num <dbl>, …