1. 数据导入与数据处理

# 载入所需包 
library(pipeR)
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
library(ggplot2)
# Read the raw fixture table into a data.table; the preview printed below
# shows the columns season, league, round, date, home.team, score, away.team,
# result and odds.
data <- fread("FiveMajorEuropeanFootballLeague.txt")    

数据概览:

# Preview the first rows of the raw table before any cleaning.
head(data) 
      season  league round            date home.team score away.team
1: 2004-2005 Serie A     1 2004/9/11 23:59     切 沃  2:02     国 米
2: 2004-2005 Serie A     1  2004/9/12 2:30    AC米兰  2:02    利沃诺
3: 2004-2005 Serie A     1 2004/9/12 21:00    布雷西  0:03     尤 文
4: 2004-2005 Serie A     1 2004/9/12 21:00    巴勒莫  1:00    锡耶纳
5: 2004-2005 Serie A     1 2004/9/12 21:00    雷吉纳  0:00    乌迪内
6: 2004-2005 Serie A     1 2004/9/12 21:00    帕尔马  0:00    梅西纳
   result          odds
1:     平  5.023.201.68
2:     平 1.175.8812.93
3:     负  4.523.161.76
4:     胜  1.723.154.88
5:     平  2.682.872.62
6:     平  1.543.416.21

变量odds把“胜平负”三个赔率连写在同一个字符串中(例如“5.023.201.68”即5.02、3.20、1.68),需要拆分为三个变量;变量score显示方式有误(客队得分前多了一个0,例如实际比分2:2被显示为“2:02”),需要处理,并拆分为主队得分和客队得分两个变量。

# Split the concatenated odds string (e.g. "5.023.201.68") into its three
# decimal odds: home win, draw, home loss.
# The pieces are validated fixture by fixture: a row that does not yield
# exactly three odds becomes NA, instead of shifting the odds of every later
# row (which is what flattening all pieces with unlist() would do).
odds.pieces <- str_extract_all(data$odds,
                               pattern = "([0-9][0-9]|[0-9])\\.[0-9][0-9]")
odds.matrix <- vapply(
  odds.pieces,
  function(piece) if (length(piece) == 3) piece else rep(NA_character_, 3),
  character(3)
)
# vapply() returns one column per fixture, so transpose to one row per fixture.
odds.separated <- data.frame(t(odds.matrix))
colnames(odds.separated) <- c("win.odds","draw.odds","lose.odds") 

# Split the score string into home and away goals.
# The raw file zero-pads the away goals ("2:02" means 2-2). Splitting on the
# literal ":0" only works while that padding zero is present; a two-digit away
# score such as "1:10" would not split at all and would misalign every later
# row. Split on ":" instead and strip the padding zero afterwards.
score.pieces <- str_split_fixed(data$score, pattern = ":", n = 2)
# "02" -> "2", "00" -> "0"; "10" and "0" are left untouched.
score.pieces[, 2] <- sub("^0([0-9])", "\\1", score.pieces[, 2])
score.separated <- data.frame(score.pieces)
colnames(score.separated) <- c("home.score","away.score") 
# Attach the parsed odds to the fixture table as numeric columns
# (home win / draw / home loss), then drop the raw string column.
# Going through as.character() first keeps the conversion correct even if
# data.frame() stored the text as factors.
data[, c("win.odds", "draw.odds", "lose.odds") :=
       lapply(odds.separated[c("win.odds", "draw.odds", "lose.odds")],
              function(column) as.numeric(as.character(column)))]
data[, odds := NULL]

# Same for the goals scored by the home and away side, stored as integers.
data[, c("home.score", "away.score") :=
       lapply(score.separated[c("home.score", "away.score")],
              function(column) as.integer(as.character(column)))]
data[, score := NULL]

head(data)  # preview of the cleaned table
      season  league round            date home.team away.team result
1: 2004-2005 Serie A     1 2004/9/11 23:59     切 沃     国 米     平
2: 2004-2005 Serie A     1  2004/9/12 2:30    AC米兰    利沃诺     平
3: 2004-2005 Serie A     1 2004/9/12 21:00    布雷西     尤 文     负
4: 2004-2005 Serie A     1 2004/9/12 21:00    巴勒莫    锡耶纳     胜
5: 2004-2005 Serie A     1 2004/9/12 21:00    雷吉纳    乌迪内     平
6: 2004-2005 Serie A     1 2004/9/12 21:00    帕尔马    梅西纳     平
   win.odds draw.odds lose.odds home.score away.score
1:     5.02      3.20      1.68          2          2
2:     1.17      5.88     12.93          2          2
3:     4.52      3.16      1.76          0          3
4:     1.72      3.15      4.88          1          0
5:     2.68      2.87      2.62          0          0
6:     1.54      3.41      6.21          0          0

2. 自定义函数

DecimalToPercentage <- function(x){
  # Format a proportion as a percentage label, e.g. 0.1234 -> "12.34%".
  # Args:
  #   x: numeric vector of proportions.
  # Returns: character vector, one label per element of x, with the
  #   percentage rounded to two decimal places.
  percent <- round(x * 100, 2)
  paste0(as.character(percent), "%")
}
  
PlotLeagueTop10 <- function(leaguename){
  # Bar chart of the ten teams with the highest overall draw ratio in one
  # league. A team's overall ratio is the mean of its per-season ratios.
  # Args:
  #   leaguename: league name; used to subset the global DrawRatio table,
  #               which is expected to be keyed by league.
  # Returns: the ggplot object.
  team.ratio <- DrawRatio[leaguename,
                          list(total.draw.ratio = mean(season.draw.ratio)),
                          by = team]

  # Team names ordered from highest to lowest ratio, first ten only. The same
  # vector drives the x axis and the fill scale so the colours follow the bars.
  top10 <- rev(levels(reorder(team.ratio$team,
                              team.ratio$total.draw.ratio)))[1:10]

  ggplot(team.ratio, aes(x = team, y = total.draw.ratio, fill = team)) +
    geom_bar(stat = "identity", width = 0.8) +
    geom_text(aes(label = DecimalToPercentage(total.draw.ratio)),
              vjust = 1.2, color = "black", size = 3) +
    scale_x_discrete(limits = top10) +
    labs(title = paste0(leaguename, "最近十赛季总平局率top 10"), x = "球队", y = "平局率") +
    scale_fill_discrete(limits = top10, guide = "none") +
    theme(axis.title.y = element_text(angle = 0))  # horizontal y-axis title
}

PlotSeasonTendency <- function(leaguename){
  # Box plot of the per-team season draw ratios of one league, one box per
  # season, with a smoothed trend line and a marker at each season's mean.
  # Args:
  #   leaguename: league name; used to subset the global DrawRatio table,
  #               which is expected to be keyed by league.
  # Returns: the ggplot object.
  league.rows <- DrawRatio[leaguename]

  # NOTE(review): recent ggplot2 releases deprecate `fun.y` (in favour of
  # `fun`) and `size` for lines (in favour of `linewidth`); kept as written so
  # the function still runs on older ggplot2 - confirm the target version.
  ggplot(league.rows, aes(x = season, y = season.draw.ratio)) +
    geom_boxplot() +
    # group = 1 fits a single trend across all seasons
    geom_smooth(aes(group = 1), se = FALSE, size = 1, color = "blue") +
    # white diamond at the mean of each season
    stat_summary(fun.y = "mean", geom = "point", shape = 23, size = 3, fill = "white") +
    labs(title = paste0(leaguename, "最近十赛季平局率变化趋势"), x = "赛季", y = "平局率") +
    theme(axis.title.y = element_text(angle = 0))
}

PlotDrawIncome <- function(leaguename){
  # Bar chart of the net revenue per season from betting on a draw in every
  # fixture of one league.
  # Args:
  #   leaguename: league name; used to subset the global DrawIncome table,
  #               which is expected to be keyed by league.
  # Returns: the ggplot object.
  season.revenue <- DrawIncome[leaguename,
                               list(revenuebyseason = sum(revenue)),
                               by = list(season, league)]

  ggplot(season.revenue, aes(x = season, y = revenuebyseason)) +
    geom_bar(stat = "identity", fill = "#FF8C00", width = 0.8) +
    geom_text(aes(label = round(revenuebyseason, 2)),
              vjust = 1.2, color = "black", size = 3.5) +
    labs(title = paste0(leaguename, "最近十赛季全选平局收益"), x = "赛季", y = "收益") +
    theme(axis.title.y = element_text(angle = 0))
}

CalculateIncome <- function(x){
  # Payout for a set of odds: every pair of odds is multiplied together,
  # each product is doubled, and the results are summed.
  # Args:
  #   x: numeric vector of odds.
  # Returns: the total payout; 0 when x has fewer than two elements, since no
  #   pair can be formed.
  if (length(x) <= 1) {
    return(0)
  }
  pair.products <- combn(x, 2, FUN = prod)  # one product per pair of odds
  sum(2 * pair.products)
}

3. 五大联赛横向分析

# Overall draw ratio of each league, drawn as bars sorted from the highest
# ratio to the lowest. local() keeps the intermediates out of the workspace
# while still returning the plot so it is printed.
local({
  league.ratio <- data[, list(draw.ratio = sum(result == "平")/.N),
                         by = league]
  # League names ordered by descending draw ratio, used as the x-axis order.
  league.order <- rev(levels(reorder(league.ratio$league,
                                     league.ratio$draw.ratio)))

  ggplot(league.ratio, aes(x = league, y = draw.ratio, fill = league)) +
    geom_bar(stat = "identity", width = 0.7) +
    geom_text(aes(label = DecimalToPercentage(draw.ratio)),
              vjust = 1.2, color = "black", size = 5) +
    labs(title = "最近十赛季五大联赛总平局率", x = "联赛", y = "平局率") +
    scale_fill_brewer(palette = "Set1", guide = "none") +  # palette, no legend
    scale_x_discrete(limits = league.order) +
    theme(axis.title.y = element_text(angle = 0))  # horizontal y-axis title
})

# Draw ratio of each league per season, drawn as one labelled line per
# league. local() keeps the intermediate table out of the workspace while
# still returning the plot so it is printed.
local({
  season.ratio <- data[, list(draw.ratio = sum(result == "平")/.N),
                         by = list(season, league)]

  ggplot(season.ratio,
         aes(x = season, y = draw.ratio, group = league, color = league)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    geom_text(aes(label = DecimalToPercentage(draw.ratio)),
              vjust = 1, color = "black", size = 3.5) +
    labs(title = "最近十赛季五大联赛平局率趋势", x = "赛季", y = "平局率") +
    # fixed league order so each league keeps the same palette colour
    scale_color_brewer(palette = "Set1", limits = c("Bundesliga","La Liga","Ligue 1","Premier League","Serie A")) +
    theme(axis.title.y = element_text(angle = 0))
})


4. 五大联赛纵向分析

# Data preparation: per-season draw ratio of every team, computed separately
# for its home fixtures and its away fixtures, then combined.
HomeDrawRatio <- data[, list(home.draw.ratio = sum(result == "平")/.N),
                        by = list(season, league, home.team)]  # draw ratio at home

AwayDrawRatio <- data[, list(away.draw.ratio = sum(result == "平")/.N),
                        by = list(season, league, away.team)]  # draw ratio away

# Key both tables on (season, league, team) so they can be joined on the key.
setkey(HomeDrawRatio, season, league, home.team) 
setkey(AwayDrawRatio, season, league, away.team)

# Keyed join: one row per row of AwayDrawRatio, with the matching home ratio.
DrawRatio <- HomeDrawRatio[AwayDrawRatio]
# Rename by reference. Assigning through colnames<- makes R copy the table,
# which can trigger data.table's ".internal.selfref" warning on the := below.
setnames(DrawRatio, "home.team", "team")
# Season draw ratio = unweighted mean of the home and away ratios.
DrawRatio[, season.draw.ratio := (home.draw.ratio + away.draw.ratio)/2]
setkey(DrawRatio, league)  # the plotting functions subset DrawRatio by league
head(DrawRatio)
      season     league   team home.draw.ratio away.draw.ratio
1: 2004-2005 Bundesliga  拜 仁       0.1176471      0.17647059
2: 2004-2005 Bundesliga 比勒费       0.1764706      0.23529412
3: 2004-2005 Bundesliga  波 鸿       0.2941176      0.17647059
4: 2004-2005 Bundesliga 不来梅       0.2352941      0.05882353
5: 2004-2005 Bundesliga  多 特       0.2941176      0.29411765
6: 2004-2005 Bundesliga 弗赖堡       0.3529412      0.17647059
   season.draw.ratio
1:         0.1470588
2:         0.2058824
3:         0.2352941
4:         0.1470588
5:         0.2941176
6:         0.2647059

4.1 意甲

# Serie A: ten teams with the highest overall draw ratio.
PlotLeagueTop10("Serie A")

# Serie A: per-season distribution and trend of team draw ratios.
PlotSeasonTendency("Serie A")


4.2 西甲

# La Liga: ten teams with the highest overall draw ratio.
PlotLeagueTop10("La Liga")

# La Liga: per-season distribution and trend of team draw ratios.
PlotSeasonTendency("La Liga")


4.3 德甲

# Bundesliga: ten teams with the highest overall draw ratio.
PlotLeagueTop10("Bundesliga")

# Bundesliga: per-season distribution and trend of team draw ratios.
PlotSeasonTendency("Bundesliga")


4.4 英超

# Premier League: ten teams with the highest overall draw ratio.
PlotLeagueTop10("Premier League")

# Premier League: per-season distribution and trend of team draw ratios.
PlotSeasonTendency("Premier League")


4.5 法甲

# Ligue 1: ten teams with the highest overall draw ratio.
PlotLeagueTop10("Ligue 1")

# Ligue 1: per-season distribution and trend of team draw ratios.
PlotSeasonTendency("Ligue 1")


5. 五大联赛平局收益分析

# Data preparation: cost and payout of backing a draw in every fixture of a
# round, for each (season, league, round).
#   bookies: cost of the round - one bet of 2 per pair of fixtures.
#   income:  payout of the round - CalculateIncome() applied to the draw odds
#            of the fixtures that actually ended in a draw. It returns 0 when
#            fewer than two fixtures were drawn, so a round without draws
#            needs no special case. %in% (rather than ==) leaves out rows
#            whose result is NA.
DrawIncome <- data[, list(bookies = choose(.N, 2)*2,
                          income = CalculateIncome(draw.odds[result %in% "平"])),
                     by = list(season, league, round)]

# Defensive: income is still NA when a drawn fixture has missing odds; such
# rounds are counted as zero income.
DrawIncome[is.na(income), income := 0]
DrawIncome[, revenue := income - bookies]  # net result of the round

setkey(DrawIncome, league)  # PlotDrawIncome subsets DrawIncome by league
head(DrawIncome)
      season     league round bookies   income  revenue
1: 2004-2005 Bundesliga     1      72  62.8478  -9.1522
2: 2004-2005 Bundesliga     2      72  72.5752   0.5752
3: 2004-2005 Bundesliga     3      72  75.0622   3.0622
4: 2004-2005 Bundesliga     4      72  62.3324  -9.6676
5: 2004-2005 Bundesliga     5      72 219.8770 147.8770
6: 2004-2005 Bundesliga     6      72  21.3826 -50.6174

每个赛季每个联赛每轮比赛都全选平局,并将当轮所有比赛两两组合投注:每个组合的成本为2,若组合中的两场比赛均为平局,则该组合的奖金为两场平局赔率之积再乘以2。变量bookies为每轮购买成本(组合数乘以2),变量income为每轮奖金,变量revenue为每轮购买的最终收益(income减去bookies)。


5.1 意甲

# Serie A: net revenue per season from backing a draw in every fixture.
PlotDrawIncome("Serie A")

意甲近十赛季总收益为-6931.64


5.2 西甲

# La Liga: net revenue per season from backing a draw in every fixture.
PlotDrawIncome("La Liga")

西甲近十赛季总收益为-10482.49


5.3 德甲

# Bundesliga: net revenue per season from backing a draw in every fixture.
PlotDrawIncome("Bundesliga")

德甲近十赛季总收益为-5653.59


5.4 英超

# Premier League: net revenue per season from backing a draw in every fixture.
PlotDrawIncome("Premier League")

英超近十赛季总收益为-6128.99


5.5 法甲

# Ligue 1: net revenue per season from backing a draw in every fixture.
PlotDrawIncome("Ligue 1")

法甲近十赛季总收益为-3246.78