library(ggplot2)
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the workbook, keep only its first eight columns (read_excel reported an
# unnamed ninth column, auto-named `...9`, which this drops), and recode `what`
# to an English label: "非AI书" becomes "non-ai", every other value "ai".
df <- read_excel("Downloads/AI书 & 非AI书(10.1-10.21).xlsx") %>%
  select(1:8) %>%
  mutate(what = ifelse(what != "非AI书", "ai", "non-ai"))
## New names:
## • `` -> `...9`
然后看一下数据情况
library(dplyr)
# Descriptive statistics per `what` group: mean, sd, median, min and max of
# show_pv, click_pv and ctr, with missing values ignored.
# across() names each result "<column>_<stat>"; the "_pv" infix is stripped
# afterwards so the columns read show_mean, ..., click_mean, ..., ctr_max.
df_summary <- df %>%
  group_by(what) %>%
  summarise(
    across(
      c(show_pv, click_pv, ctr),
      list(
        mean = function(v) mean(v, na.rm = TRUE),
        sd = function(v) sd(v, na.rm = TRUE),
        median = function(v) median(v, na.rm = TRUE),
        min = function(v) min(v, na.rm = TRUE),
        max = function(v) max(v, na.rm = TRUE)
      )
    )
  ) %>%
  rename_with(function(nm) sub("_pv_", "_", nm, fixed = TRUE))
# Transpose for display: one row per statistic, one column per group.
# `what` is a character column, so t() yields a character matrix.
t(df_summary)
## [,1] [,2]
## what "ai" "non-ai"
## show_mean "30042.36" "25517.64"
## show_sd "346462.9" "305441.6"
## show_median "1281.5" " 419.0"
## show_min "0" "0"
## show_max " 8266336" "23044253"
## click_mean "1349.892" "1566.818"
## click_sd "14815.51" "18050.82"
## click_median "12" " 5"
## click_min "0" "0"
## click_max " 312236" "1281045"
## ctr_mean "0.01462412" "0.03175295"
## ctr_sd "0.03780759" "0.06805258"
## ctr_median "0.006993331" "0.008289583"
## ctr_min "0" "0"
## ctr_max "0.6666667" "2.0000000"
箱线图中有两个非常有用的特性:可变宽度和凹槽。
可变宽度:这个特性让箱线图的宽度随样本量变化——在 ggplot2 中,箱体宽度与各组样本量的平方根成比例。简单来说,样本量多的组宽度会更大,样本量少的则更窄。这可以帮助你直观地看到每个组的数据量大小。
凹槽:箱线图中的凹槽用于对比中位数的统计显著性。凹槽的范围约为 中位数 ± 1.58 × IQR / √n,可近似看作中位数的 95% 置信区间。如果两个箱线图的凹槽不重叠,通常可以认为它们的中位数在统计上有显著差异。凹槽的宽度随着样本量和数据的变异性变化,有助于判断数据组之间的差异。
# Notched, variable-width boxplot of show_pv for each `what` group.
# The y-axis is zoomed to [0, 95th percentile] with coord_cartesian() instead
# of ylim(): ylim() discards rows above the limit before the boxplot
# statistics are computed, which distorts the medians, quartiles and notches.
# coord_cartesian() only restricts the visible region, so every row still
# contributes to the statistics.
ggplot(df, aes(x = what, y = show_pv, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  # na.rm = TRUE so a missing show_pv value cannot make quantile() fail.
  coord_cartesian(
    ylim = c(0, quantile(df$show_pv, 0.95, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "Show PV Boxplot", x = "Category", y = "Show PV")
# The zoom drops no rows, so the "Removed 859 rows containing non-finite
# outside the scale range" warning that ylim() produced is no longer expected.
# Notched, variable-width boxplot of click_pv for each `what` group.
# The view is capped at the 85th percentile of click_pv. The original inline
# comment said 95%, but the code used 0.85; the value is kept and the comment
# corrected.
# coord_cartesian() is used instead of ylim(): ylim() discards rows above the
# limit before the boxplot statistics are computed, which distorts the
# medians, quartiles and notches. coord_cartesian() only restricts the visible
# region, so every row still contributes to the statistics.
ggplot(df, aes(x = what, y = click_pv, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  # na.rm = TRUE so a missing click_pv value cannot make quantile() fail.
  coord_cartesian(
    ylim = c(0, quantile(df$click_pv, 0.85, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "Click PV Boxplot", x = "Category", y = "Click PV")
# The zoom drops no rows, so the "Removed 2575 rows containing non-finite
# outside the scale range" warning that ylim() produced is no longer expected.
# Notched, variable-width boxplot of ctr for each `what` group.
# The y-axis is zoomed to [0, 95th percentile] with coord_cartesian() instead
# of ylim(): ylim() discards rows above the limit before the boxplot
# statistics are computed, which distorts the medians, quartiles and notches.
# coord_cartesian() only restricts the visible region, so every row still
# contributes to the statistics.
ggplot(df, aes(x = what, y = ctr, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  # na.rm = TRUE so a missing ctr value cannot make quantile() fail.
  coord_cartesian(
    ylim = c(0, quantile(df$ctr, 0.95, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "CTR Boxplot", x = "Category", y = "CTR")
# The zoom drops no rows, so the "Removed 858 rows containing non-finite
# outside the scale range" warning that ylim() produced is no longer expected.
# Two-sided Wilcoxon rank-sum test of show_pv between the two `what` groups,
# with continuity correction (the function defaults, written out explicitly).
wilcox.test(
  show_pv ~ what,
  data = df,
  alternative = "two.sided",
  correct = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: show_pv by what
## W = 7999579, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Two-sided Wilcoxon rank-sum test of click_pv between the two `what` groups,
# with continuity correction (the function defaults, written out explicitly).
wilcox.test(
  click_pv ~ what,
  data = df,
  alternative = "two.sided",
  correct = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: click_pv by what
## W = 7299582, p-value = 1.875e-06
## alternative hypothesis: true location shift is not equal to 0
# Two-sided Wilcoxon rank-sum test of ctr between the two `what` groups,
# with continuity correction (the function defaults, written out explicitly).
wilcox.test(
  ctr ~ what,
  data = df,
  alternative = "two.sided",
  correct = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: ctr by what
## W = 6093618, p-value = 2.95e-05
## alternative hypothesis: true location shift is not equal to 0