library(ggplot2)
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the raw workbook into `df`. The path is relative, so it is resolved
# against the current working directory. read_excel() auto-names the unnamed
# ninth column `...9` (see the "New names" message it prints).
df <- read_excel(path = "Downloads/AI书 & 非AI书(10.1-10.21).xlsx")
## New names:
## • `` -> `...9`
# Keep only the first eight columns, then recode the book-type label in
# `what`: "非AI书" becomes "non-ai" and every other non-missing value
# becomes "ai" (missing values stay NA).
df <- df %>%
  select(1:8) %>%
  mutate(what = ifelse(what != "非AI书", "ai", "non-ai"))

接下来按类别(ai / non-ai)查看各指标的描述性统计情况。

library(dplyr)

# Descriptive statistics per book category (`what`): mean, standard
# deviation, median, minimum and maximum of show_pv, click_pv and ctr, all
# computed with missing values removed.
# The `.names` spec strips the "_pv" suffix from the source column so the
# result columns are named show_mean, ..., click_mean, ..., ctr_max.
df_summary <- df %>%
  group_by(what) %>%
  summarise(
    across(
      c(show_pv, click_pv, ctr),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{sub('_pv$', '', .col)}_{.fn}"
    )
  )

# Transpose so each statistic is a row and each category a column. Because
# `what` is a character column, t() yields a character matrix, which is why
# the numbers are printed as quoted strings.
t(df_summary)
##              [,1]          [,2]         
## what         "ai"          "non-ai"     
## show_mean    "30042.36"    "25517.64"   
## show_sd      "346462.9"    "305441.6"   
## show_median  "1281.5"      " 419.0"     
## show_min     "0"           "0"          
## show_max     " 8266336"    "23044253"   
## click_mean   "1349.892"    "1566.818"   
## click_sd     "14815.51"    "18050.82"   
## click_median "12"          " 5"         
## click_min    "0"           "0"          
## click_max    " 312236"     "1281045"    
## ctr_mean     "0.01462412"  "0.03175295" 
## ctr_sd       "0.03780759"  "0.06805258" 
## ctr_median   "0.006993331" "0.008289583"
## ctr_min      "0"           "0"          
## ctr_max      "0.6666667"   "2.0000000"

箱线图中有两个非常有用的特性:可变宽度和凹槽。

可变宽度(`varwidth = TRUE`):这个特性让箱体的宽度与各组样本量的平方根成比例。简单来说,样本量多的组箱体更宽,样本量少的则更窄。这可以帮助你直观地比较每个组的数据量大小。

凹槽(`notch = TRUE`):箱线图中的凹槽用于比较各组的中位数。凹槽的范围约为 中位数 ± 1.58 × IQR / √n,大致相当于用于比较中位数的 95% 置信区间。如果两个箱线图的凹槽不重叠,通常可以认为它们的中位数在统计上有显著差异。凹槽的宽度随着样本量和数据的变异性变化:样本量越大、IQR 越小,凹槽越窄,这有助于判断数据组之间的差异。

# Notched, variable-width boxplot of show_pv for each book category.
# The y-axis is zoomed to [0, 95th percentile of show_pv] with
# coord_cartesian() rather than ylim(): ylim() discards out-of-range rows
# before stat_boxplot() runs (emitting a "Removed N rows" warning), which
# distorts the medians, quartiles, notches and box widths. coord_cartesian()
# only changes the visible window, so the statistics use the full data.
# na.rm = TRUE keeps the cutoff computable if show_pv contains missing values.
ggplot(df, aes(x = what, y = show_pv, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  coord_cartesian(
    ylim = c(0, quantile(df$show_pv, 0.95, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "Show PV Boxplot", x = "Category", y = "Show PV")
## Warning: Removed 859 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Notched, variable-width boxplot of click_pv for each book category.
# The y-axis is zoomed to [0, 85th percentile of click_pv].
# NOTE(review): the cutoff here is the 85th percentile, while the other
# boxplots use the 95th; confirm the tighter zoom is intended.
# coord_cartesian() is used rather than ylim(): ylim() discards out-of-range
# rows before stat_boxplot() runs (emitting a "Removed N rows" warning), which
# distorts the medians, quartiles, notches and box widths. coord_cartesian()
# only changes the visible window, so the statistics use the full data.
# na.rm = TRUE keeps the cutoff computable if click_pv contains missing values.
ggplot(df, aes(x = what, y = click_pv, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  coord_cartesian(
    ylim = c(0, quantile(df$click_pv, 0.85, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "Click PV Boxplot", x = "Category", y = "Click PV")
## Warning: Removed 2575 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Notched, variable-width boxplot of ctr for each book category.
# The y-axis is zoomed to [0, 95th percentile of ctr] with coord_cartesian()
# rather than ylim(): ylim() discards out-of-range rows before stat_boxplot()
# runs (emitting a "Removed N rows" warning), which distorts the medians,
# quartiles, notches and box widths. coord_cartesian() only changes the
# visible window, so the statistics use the full data.
# na.rm = TRUE keeps the cutoff computable if ctr contains missing values.
ggplot(df, aes(x = what, y = ctr, fill = what)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE, varwidth = TRUE) +
  coord_cartesian(
    ylim = c(0, quantile(df$ctr, 0.95, na.rm = TRUE, names = FALSE))
  ) +
  labs(title = "CTR Boxplot", x = "Category", y = "CTR")
## Warning: Removed 858 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Two-sided Wilcoxon rank-sum test: does show_pv differ between the two
# categories in `what`?
df %>%
  wilcox.test(show_pv ~ what, data = .)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  show_pv by what
## W = 7999579, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Two-sided Wilcoxon rank-sum test: does click_pv differ between the two
# categories in `what`?
df %>%
  wilcox.test(click_pv ~ what, data = .)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  click_pv by what
## W = 7299582, p-value = 1.875e-06
## alternative hypothesis: true location shift is not equal to 0
# Two-sided Wilcoxon rank-sum test: does ctr differ between the two
# categories in `what`?
df %>%
  wilcox.test(ctr ~ what, data = .)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ctr by what
## W = 6093618, p-value = 2.95e-05
## alternative hypothesis: true location shift is not equal to 0