# 载入所需包
library(pipeR)
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
library(ggplot2)
# Read the match records from the text file into a data.table.
data <- fread("FiveMajorEuropeanFootballLeague.txt")
数据概览:
# Preview the first rows of the raw data.
head(data)
season league round date home.team score away.team
1: 2004-2005 Serie A 1 2004/9/11 23:59 切 沃 2:02 国 米
2: 2004-2005 Serie A 1 2004/9/12 2:30 AC米兰 2:02 利沃诺
3: 2004-2005 Serie A 1 2004/9/12 21:00 布雷西 0:03 尤 文
4: 2004-2005 Serie A 1 2004/9/12 21:00 巴勒莫 1:00 锡耶纳
5: 2004-2005 Serie A 1 2004/9/12 21:00 雷吉纳 0:00 乌迪内
6: 2004-2005 Serie A 1 2004/9/12 21:00 帕尔马 0:00 梅西纳
result odds
1: 平 5.023.201.68
2: 平 1.175.8812.93
3: 负 4.523.161.76
4: 胜 1.723.154.88
5: 平 2.682.872.62
6: 平 1.543.416.21
变量odds把“胜、平、负”三个赔率连在一起存成了一个字符串(例如“5.023.201.68”即5.02、3.20、1.68),需要拆成三个变量;变量score被误存为时间格式(例如“2:02”实际表示比分2:2),需要修正,并拆成主队得分、客队得分两个变量。
# Split the concatenated odds string into its three decimal odds. Every odd is
# written with one or two integer digits and exactly two decimals. Matching the
# whole string in one anchored pattern keeps each row aligned with its source
# row: a malformed value yields NA for that row instead of shifting the values
# of every following row.
odds.parts <- str_match(
  data$odds,
  "^([0-9]{1,2}\\.[0-9]{2})([0-9]{1,2}\\.[0-9]{2})([0-9]{1,2}\\.[0-9]{2})$"
)
odds.separated <- data.frame(odds.parts[, 2:4, drop = FALSE],
                             stringsAsFactors = FALSE)
colnames(odds.separated) <- c("win.odds", "draw.odds", "lose.odds")

# The score is stored like a clock time ("2:02" stands for 2-2), so the away
# score may carry a leading zero; as.integer() below strips it. Capturing both
# numbers also handles two-digit scores, which a split on ":0" cannot.
score.parts <- str_match(data$score, "^([0-9]+):([0-9]+)$")
score.separated <- data.frame(score.parts[, 2:3, drop = FALSE],
                              stringsAsFactors = FALSE)
colnames(score.separated) <- c("home.score", "away.score")

if (any(is.na(odds.parts[, 1])) || any(is.na(score.parts[, 1]))) {
  warning("Some odds or score values could not be parsed and were set to NA",
          call. = FALSE)
}

data[, win.odds := as.numeric(odds.separated$win.odds)]    # home-win odds
data[, draw.odds := as.numeric(odds.separated$draw.odds)]  # draw odds
data[, lose.odds := as.numeric(odds.separated$lose.odds)]  # home-loss odds
data[, odds := NULL]                                       # drop the raw column
data[, home.score := as.integer(score.separated$home.score)]  # home goals
data[, away.score := as.integer(score.separated$away.score)]  # away goals
data[, score := NULL]                                         # drop the raw column
head(data) # cleaning finished
season league round date home.team away.team result
1: 2004-2005 Serie A 1 2004/9/11 23:59 切 沃 国 米 平
2: 2004-2005 Serie A 1 2004/9/12 2:30 AC米兰 利沃诺 平
3: 2004-2005 Serie A 1 2004/9/12 21:00 布雷西 尤 文 负
4: 2004-2005 Serie A 1 2004/9/12 21:00 巴勒莫 锡耶纳 胜
5: 2004-2005 Serie A 1 2004/9/12 21:00 雷吉纳 乌迪内 平
6: 2004-2005 Serie A 1 2004/9/12 21:00 帕尔马 梅西纳 平
win.odds draw.odds lose.odds home.score away.score
1: 5.02 3.20 1.68 2 2
2: 1.17 5.88 12.93 2 2
3: 4.52 3.16 1.76 0 3
4: 1.72 3.15 4.88 1 0
5: 2.68 2.87 2.62 0 0
6: 1.54 3.41 6.21 0 0
DecimalToPercentage <- function(x, digits = 2) {
  # Format decimal fractions as percentage strings, e.g. 0.2534 -> "25.34%".
  # Args:
  #   x: numeric vector of fractions to convert
  #   digits: number of decimal places kept in the percentage (default 2)
  # Returns:
  #   character vector of the same length as x; NA inputs stay NA
  if (length(x) == 0L) {
    # paste0() would otherwise turn an empty input into the single string "%".
    return(character(0))
  }
  pct <- paste0(as.character(round(x * 100, digits)), "%")
  # Keep missing values missing instead of emitting the literal text "NA%".
  pct[is.na(x)] <- NA_character_
  pct
}
PlotLeagueTop10 <- function(leaguename) {
  # Bar chart of the teams with the highest overall draw ratio in one league
  # (at most ten teams, highest first).
  # Args:
  #   leaguename: league name, looked up through the key of DrawRatio
  # Returns:
  #   a ggplot object
  # Overall ratio = mean of a team's per-season draw ratios.
  team_ratio <- DrawRatio[leaguename,
                          list(total.draw.ratio = mean(season.draw.ratio)),
                          by = team]
  # Team names ordered from highest to lowest overall draw ratio.
  ranked_teams <- rev(levels(reorder(team_ratio$team,
                                     team_ratio$total.draw.ratio)))
  # head() never pads with NA, so a league with fewer than ten teams still
  # gets valid axis limits.
  top_teams <- head(ranked_teams, 10)
  # Keep only the plotted teams rather than relying on the scale limits to
  # drop the others.
  top_ratio <- team_ratio[team %in% top_teams]
  ggplot(top_ratio, aes(x = team, y = total.draw.ratio, fill = team)) +
    geom_bar(stat = "identity", width = 0.8) +
    geom_text(aes(label = DecimalToPercentage(total.draw.ratio)),
              vjust = 1.2, color = "black", size = 3) +
    scale_x_discrete(limits = top_teams) + # x axis sorted by draw ratio, descending
    labs(title = paste0(leaguename, "最近十赛季总平局率top 10"), x = "球队", y = "平局率") +
    scale_fill_discrete(limits = top_teams, guide = "none") + # fill order follows the x axis; legend hidden
    theme(axis.title.y = element_text(angle = 0))
}
PlotSeasonTendency <- function(leaguename) {
  # Box plot of the per-team season draw ratios of one league, one box per
  # season, with a smoothed trend line and a marker at each season's mean.
  # Args:
  #   leaguename: league name, looked up through the key of DrawRatio
  # Returns:
  #   a ggplot object
  league_ratio <- DrawRatio[leaguename]
  ggplot(league_ratio, aes(x = season, y = season.draw.ratio)) +
    geom_boxplot() +
    # group = 1 makes the smoother run across all seasons as one series.
    geom_smooth(aes(group = 1), se = FALSE, size = 1, color = "blue") +
    # `fun` is the current name of the argument formerly called `fun.y`,
    # which ggplot2 has deprecated.
    stat_summary(fun = "mean", geom = "point", shape = 23, size = 3, fill = "white") + # mean marker
    labs(title = paste0(leaguename, "最近十赛季平局率变化趋势"), x = "赛季", y = "平局率") +
    theme(axis.title.y = element_text(angle = 0))
}
PlotDrawIncome <- function(leaguename) {
  # Bar chart of the revenue of one league, summed per season, taken from the
  # `revenue` column of DrawIncome.
  # Args:
  #   leaguename: league name, looked up through the key of DrawIncome
  # Returns:
  #   a ggplot object
  season_revenue <- DrawIncome[leaguename,
                               list(revenuebyseason = sum(revenue)),
                               by = list(season, league)]
  revenue_bars <- geom_bar(stat = "identity", fill = "#FF8C00", width = 0.8)
  # Each bar is labelled with its value rounded to two decimals.
  revenue_labels <- geom_text(aes(label = round(revenuebyseason, 2)),
                              vjust = 1.2, color = "black", size = 3.5)
  plot_titles <- labs(title = paste0(leaguename, "最近十赛季全选平局收益"),
                      x = "赛季", y = "收益")
  ggplot(season_revenue, aes(x = season, y = revenuebyseason)) +
    revenue_bars +
    revenue_labels +
    plot_titles +
    theme(axis.title.y = element_text(angle = 0))
}
CalculateIncome <- function(x, stake = 2, k = 2) {
  # Total payout of betting on every k-element combination of a set of odds:
  # each combination pays the product of its odds times the stake.
  # Args:
  #   x: numeric vector of odds
  #   stake: amount multiplied into every combination's product (default 2)
  #   k: number of odds multiplied together per combination (default 2);
  #      expected to be a positive integer
  # Returns:
  #   a single number; 0 when x has fewer than k elements
  if (length(x) < k) {
    return(0)
  }
  # Enumerate index combinations rather than passing x itself to combn():
  # combn() treats a single positive number as seq_len(x).
  combo_products <- combn(seq_along(x), k, FUN = function(idx) prod(x[idx]))
  sum(combo_products) * stake
}
# Bar chart of each league's overall draw ratio (share of matches whose
# result is a draw), leagues sorted from highest to lowest ratio.
local({
  league_draw <- data[, list(draw.ratio = sum(result == "平") / .N),
                      by = league]
  # League names ordered by draw ratio, descending, for the x axis.
  league_order <- with(league_draw, rev(levels(reorder(league, draw.ratio))))
  ggplot(league_draw, aes(x = league, y = draw.ratio, fill = league)) +
    geom_bar(stat = "identity", width = 0.7) +
    geom_text(aes(label = DecimalToPercentage(draw.ratio)),
              vjust = 1.2, color = "black", size = 5) +
    labs(title = "最近十赛季五大联赛总平局率", x = "联赛", y = "平局率") +
    scale_fill_brewer(palette = "Set1", guide = "none") + # palette; legend hidden
    scale_x_discrete(limits = league_order) +
    theme(axis.title.y = element_text(angle = 0)) # horizontal y-axis title
})
# Line chart of the draw ratio per season, one coloured line per league, with
# every point labelled by its percentage.
local({
  season_draw <- data[, list(draw.ratio = sum(result == "平") / .N),
                      by = list(season, league)]
  # Fixed league order so that each league keeps the same palette colour.
  league_colours <- scale_color_brewer(
    palette = "Set1",
    limits = c("Bundesliga", "La Liga", "Ligue 1", "Premier League", "Serie A")
  )
  ggplot(season_draw,
         aes(x = season, y = draw.ratio, group = league, color = league)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    geom_text(aes(label = DecimalToPercentage(draw.ratio)),
              vjust = 1, color = "black", size = 3.5) +
    labs(title = "最近十赛季五大联赛平局率趋势", x = "赛季", y = "平局率") +
    league_colours +
    theme(axis.title.y = element_text(angle = 0))
})
# Data preparation: per-team draw ratios for every season and league.
# Share of drawn matches among each team's home matches.
HomeDrawRatio <- data[, list(home.draw.ratio = sum(result == "平") / .N),
                      by = list(season, league, home.team)]
# Share of drawn matches among each team's away matches.
AwayDrawRatio <- data[, list(away.draw.ratio = sum(result == "平") / .N),
                      by = list(season, league, away.team)]
setkey(HomeDrawRatio, season, league, home.team)
setkey(AwayDrawRatio, season, league, away.team)
# Keyed join: each AwayDrawRatio row is matched to the HomeDrawRatio row with
# the same season, league and team.
DrawRatio <- HomeDrawRatio[AwayDrawRatio]
# setnames() renames the column by reference, which is the data.table way to
# rename; assigning through colnames<- is not.
setnames(DrawRatio, "home.team", "team")
# Season draw ratio = unweighted mean of the home and away ratios.
DrawRatio[, season.draw.ratio := (home.draw.ratio + away.draw.ratio) / 2]
# Key by league so the plotting helpers can subset with DrawRatio[leaguename].
setkey(DrawRatio, league)
head(DrawRatio)
season league team home.draw.ratio away.draw.ratio
1: 2004-2005 Bundesliga 拜 仁 0.1176471 0.17647059
2: 2004-2005 Bundesliga 比勒费 0.1764706 0.23529412
3: 2004-2005 Bundesliga 波 鸿 0.2941176 0.17647059
4: 2004-2005 Bundesliga 不来梅 0.2352941 0.05882353
5: 2004-2005 Bundesliga 多 特 0.2941176 0.29411765
6: 2004-2005 Bundesliga 弗赖堡 0.3529412 0.17647059
season.draw.ratio
1: 0.1470588
2: 0.2058824
3: 0.2352941
4: 0.1470588
5: 0.2941176
6: 0.2647059
# For each league in turn, draw the top-10 draw-ratio bar chart followed by
# the season-by-season draw-ratio box plot. The calls are kept as separate
# top-level expressions so that each returned plot is printed.
PlotLeagueTop10("Serie A")
PlotSeasonTendency("Serie A")
PlotLeagueTop10("La Liga")
PlotSeasonTendency("La Liga")
PlotLeagueTop10("Bundesliga")
PlotSeasonTendency("Bundesliga")
PlotLeagueTop10("Premier League")
PlotSeasonTendency("Premier League")
PlotLeagueTop10("Ligue 1")
PlotSeasonTendency("Ligue 1")
# Data preparation: cost, payout and revenue of backing the draw in every
# match, computed per season, league and round.
#   bookies: cost of the round = number of match pairs times 2
#   income:  payout of the round, computed by CalculateIncome() from the draw
#            odds of the matches that actually ended in a draw
# Subsetting draw.odds directly avoids a nested grouped query that also
# computed payouts for the win and loss groups only to discard them;
# CalculateIncome() already returns 0 for rounds with fewer than two draws.
DrawIncome <- data[, list(bookies = choose(.N, 2) * 2,
                          income = CalculateIncome(draw.odds[result == "平"])),
                   by = list(season, league, round)]
DrawIncome[is.na(income), income := 0] # treat a missing payout as 0
DrawIncome[, revenue := income - bookies] # revenue = payout - cost
# Key by league so PlotDrawIncome() can subset with DrawIncome[leaguename].
setkey(DrawIncome, league)
head(DrawIncome)
season league round bookies income revenue
1: 2004-2005 Bundesliga 1 72 62.8478 -9.1522
2: 2004-2005 Bundesliga 2 72 72.5752 0.5752
3: 2004-2005 Bundesliga 3 72 75.0622 3.0622
4: 2004-2005 Bundesliga 4 72 62.3324 -9.6676
5: 2004-2005 Bundesliga 5 72 219.8770 147.8770
6: 2004-2005 Bundesliga 6 72 21.3826 -50.6174
每个赛季每个联赛每轮比赛都全选平局,变量bookies为每次购买成本,变量income为每次奖金,变量revenue为每次购买最终收益。
# Plot the per-season revenue of the all-draw strategy for each league; the
# text after each call states that league's total revenue.
PlotDrawIncome("Serie A")
意甲近十赛季总收益为-6931.64。
PlotDrawIncome("La Liga")
西甲近十赛季总收益为-10482.49。
PlotDrawIncome("Bundesliga")
德甲近十赛季总收益为-5653.59。
PlotDrawIncome("Premier League")
英超近十赛季总收益为-6128.99。
PlotDrawIncome("Ligue 1")
法甲近十赛季总收益为-3246.78。