Présentation du sujet et contexte de l’étude
Démarche utilisée
Difficultés
Limites des modèles et du pouvoir prédictif
Ce qui pourrait être amélioré à l’avenir
Lorsque nous parlons de Premier League, nous faisons référence au championnat national de football anglais.
C’est le championnat le plus connu au monde, il est considéré comme le plus compliqué car aucune équipe n’a le monopole.
Notre démarche est constituée de deux étapes :
from bs4 import BeautifulSoup
import urllib3
import re
import time
import requests
from collections import defaultdict
import numpy as np
import tqdm
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Sky Sports Premier League table page (the URL path ends in /2023).
urlpage_4 = 'https://www.skysports.com/premier-league-table/2023'
def get_page(urlpage_4, element, html_class):
    """Download a page and return the tags matching a tag name and class.

    Args:
        urlpage_4: URL of the page to fetch.
        element: Tag name to look for (for example ``'tr'``).
        html_class: Value of the HTML ``class`` attribute the tags must have.

    Returns:
        The result of ``BeautifulSoup.find_all`` for that tag and class.
    """
    # Fetch the raw page over HTTP.
    http = urllib3.PoolManager()
    response = http.request('GET', urlpage_4)
    # Parse the payload and collect every matching element.
    parsed_page = BeautifulSoup(response.data, 'html.parser')
    return parsed_page.find_all(element, class_=html_class)
# Collect every <tr class="row-body"> element of the table page and keep the
# result as a single string.
PL23 = str(get_page(urlpage_4, 'tr', 'row-body'))
# Next: define a function that gathers information about the teams.
def lien_PL23(PL23, team):
    """Extract one team's figures from the scraped league-table HTML.

    NOTE(review): the statements that defined ``start`` and ``position`` were
    missing from this function, so every call raised ``NameError``. They are
    reconstructed here from the ``team-name`` span the function already
    searches for: ``position`` is the 1-based order in which the team appears
    and ``start`` is the offset of its span. Confirm this matches the
    original intent.

    Args:
        PL23: The table rows as HTML (anything ``str()`` turns into the
            markup, e.g. the string built from ``get_page``).
        team: Team name; matched case-insensitively against the text of the
            ``<span class="team-name">`` elements.

    Returns:
        A dict with the keys ``match_played``, ``position``, ``points``,
        ``wins``, ``loses``, ``drawns``, ``goals_for`` and ``goals_against``.

    Raises:
        ValueError: If the team is not present, if its row has no closing
            ``</tr>``, or if fewer than six integer cells follow its name.
    """
    html = str(PL23)
    wanted = team.strip().casefold()

    # Walk the team-name spans once: the match index gives the rank and the
    # match offset gives where this team's slice of the row begins.
    start = None
    position = None
    spans = re.finditer(r'<span class="team-name">(.*?)</span>', html)
    for rank, span in enumerate(spans, start=1):
        if span.group(1).strip().casefold() == wanted:
            position = rank
            start = span.start()
            break
    if start is None:
        raise ValueError(f"Team {team!r} not found in the table HTML.")

    # The team's data runs from its name to the end of its table row.
    end = html.index("</tr>", start)
    team_data_20 = html[start:end]

    # Hard-coded in the original: a full season's number of matches.
    match_played = 38

    data = [int(s) for s in re.findall(r'<td.*?>(\d+)</td>', team_data_20)]
    if len(data) < 6:
        raise ValueError(
            f"Expected at least 6 numeric cells for {team!r}, found {len(data)}."
        )
    # NOTE(review): this cell-order-to-field mapping is kept exactly as the
    # original had it; verify it against the page's actual column order.
    points, wins, drawns, loses, goals_for, goals_against = data[:6]

    return {
        'match_played': match_played,
        'position': position,
        'points': points,
        'wins': wins,
        'loses': loses,
        'drawns': drawns,
        'goals_for': goals_for,
        'goals_against': goals_against,
    }
# Dictionary that will hold one single-row DataFrame per team.
team_stats_20 = {}
# NOTE(review): `list_team_20` and `PL19` are not defined in the visible part
# of this file, and the `stat23` defined further down takes
# (standings, year) -- confirm which helper and inputs this loop should use.
for team in list_team_20:
    # Get this team's statistics.
    team_stats = stat23(PL19, team)
    # Turn them into a one-row DataFrame tagged with the team and the year.
    team_stats_df = pd.DataFrame(team_stats, index=[0]).assign(team=team, year=2023)
    # Store the DataFrame under the team's name.
    team_stats_20[team] = team_stats_df
# Next: define a function that scrapes the table in a different way.
def scrape_PL(year, timeout=10):
    """Scrape the Sky Sports Premier League table for one season.

    Args:
        year: Value inserted at the end of the table URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive host.

    Returns:
        A list with one dict per table row, keyed by 'Team',
        'Matches Played', 'Wins', 'Draws', 'Losses', 'Goals For',
        'Goals Against', 'Goal Difference' and 'Points'; or ``None`` when the
        page does not answer with status 200 or the table is not found.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If a numeric cell does not contain an integer.
    """
    url = f"https://www.skysports.com/premier-league-table/{year}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {year}.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='standing-table__table')
    if table is None:
        print("Failed to find the standings table.")
        return None

    # Output keys for cells 2..9 of each row, in column order.
    numeric_keys = (
        'Matches Played',
        'Wins',
        'Draws',
        'Losses',
        'Goals For',
        'Goals Against',
        'Goal Difference',
        'Points',
    )

    standings_data = []
    # The first <tr> is skipped, as in the original (header row).
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        # Rows without the full set of cells used to raise IndexError;
        # skip them instead.
        if len(columns) < 10:
            continue
        record = {'Team': columns[1].text.strip()}
        for key, cell in zip(numeric_keys, columns[2:10]):
            record[key] = int(cell.text.strip())
        standings_data.append(record)
    return standings_data


def stat23(standings, year):
    """Build one DataFrame of per-team statistics from scraped standings.

    Args:
        standings: Iterable of dicts, each with at least a 'Team' key (the
            shape returned by ``scrape_PL``).
        year: Value stored in the 'year' column of every row.

    Returns:
        The concatenation of one single-row DataFrame per team, or ``None``
        when ``standings`` yields no team.
    """
    team_stats_20 = {}
    for team_data in standings:
        team_name = team_data['Team']
        # NOTE(review): `extract_team_stats_single` is not defined in the
        # visible part of this file -- confirm it exists and what it returns.
        stats = extract_team_stats_single(team_data, year)
        team_stats_df = pd.DataFrame(stats, index=[0])
        team_stats_df['team'] = team_name
        team_stats_df['year'] = year
        # Keyed by name, so a repeated team name keeps only its last row.
        team_stats_20[team_name] = team_stats_df

    if not team_stats_20:
        print("No data extracted.")
        return None
    return pd.concat(team_stats_20.values(), ignore_index=True)


## year team ... goals_for goals_against
## 0 2023 Liverpool ... 67 27
## 1 2023 Arsenal ... 70 24
## 2 2023 Manchester City ... 63 28
## 3 2023 Aston Villa ... 62 42
## 4 2023 Tottenham Hotspur ... 62 44
## 5 2023 Manchester United ... 40 40
## 6 2023 West Ham United ... 50 55
## 7 2023 Newcastle United ... 64 52
## 8 2023 Brighton and Hove Albion ... 51 46
## 9 2023 Wolverhampton Wanderers ... 43 47
## 10 2023 Bournemouth ... 44 53
## 11 2023 Chelsea ... 49 47
## 12 2023 Fulham ... 47 50
## 13 2023 Crystal Palace ... 34 50
## 14 2023 Brentford ... 42 55
## 15 2023 Everton * ... 31 42
## 16 2023 Nottingham Forest ** ... 39 53
## 17 2023 Luton Town ... 43 62
## 18 2023 Burnley ... 32 66
## 19 2023 Sheffield United ... 27 77
##
## [20 rows x 10 columns]
# URL of the site listing each team's budget.
url_7 = "https://sportune.20minutes.fr/sport-business/football/les-budgets-des-clubs-de-la-premier-league-2023-2024-312241/2"
# Function that scrapes this URL while sending a browser-like User-Agent.
def scrape_premier_league_budgets(url, timeout=10):
    """Scrape the club budget table(s) found at ``url`` into a DataFrame.

    Args:
        url: Address of the page holding the budget table(s).
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive host.

    Returns:
        A DataFrame whose columns are the texts of the ``<th>`` cells and
        whose rows are the texts of the ``<td>`` cells of each ``<tr>``, with
        the last collected row removed; or ``None`` when the page does not
        answer with status 200.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Browser-like User-Agent sent with the request.
    user_agent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
    res = requests.get(url, headers=user_agent, timeout=timeout)
    if res.status_code != 200:
        print("Failed to retrieve data.")
        return None

    soup = BeautifulSoup(res.text, "lxml")
    # Keep only the <tbody> ... </table> fragments of the parsed page.
    contents = re.findall('<tbody>.*?\n</tbody>\n</table>', str(soup), re.DOTALL)

    # Parse each fragment once and collect header cells and row cells together.
    all_headers = []
    all_rows = []
    for html_content in contents:
        html_soup = BeautifulSoup(html_content, 'html.parser')
        all_headers.extend(html_soup.find_all("th"))
        for row in html_soup.find_all("tr"):
            # A <tr> that holds no <td> yields an empty list here, exactly
            # as in the original.
            all_rows.append(
                [cell.get_text(strip=True) for cell in row.find_all("td")]
            )
    Titles = [i.text for i in all_headers]

    # The original discards the last collected row; kept as-is.
    if all_rows:
        all_rows.pop(-1)

    # Build the frame directly: concatenating onto an empty placeholder frame
    # is deprecated in recent pandas and added nothing.
    return pd.DataFrame(all_rows, columns=Titles)
## Club Budget year ... drawns goals_for goals_against
## 0 Manchester City 800 M€ 2023 ... 7 63 28
## 1 Manchester United 720 M€ 2023 ... 3 40 40
## 2 Liverpool 690 M€ 2023 ... 7 67 27
## 3 Chelsea 600 M€ 2023 ... 7 49 47
## 4 Arsenal 525 M€ 2023 ... 5 70 24
## 5 Aston Villa 220 M€ 2023 ... 5 62 42
## 6 Crystal Palace 200 M€ 2023 ... 9 34 50
## 7 Brentford 175 M€ 2023 ... 6 42 55
## 8 Bournemouth 160 M€ 2023 ... 8 44 53
## 9 Fulham 155 M€ 2023 ... 6 47 50
## 10 Sheffield United 145 M€ 2023 ... 6 27 77
## 11 Burnley 125 M€ 2023 ... 7 32 66
## 12 Luton Town 90 M€ 2023 ... 7 43 62
##
## [13 rows x 10 columns]
# Remove the 'M€' marker from the Budget strings, convert them to numbers and
# scale the result by 1,000,000.
# NOTE(review): `merged_df` is built outside the visible part of this file.
merged_df['Budget'] = pd.to_numeric(merged_df['Budget'].str.replace('M€', '')) * 1000000
# Display the updated DataFrame.
print(merged_df)
## year Club match_played ... goals_for goals_against Budget
## 0 2023 Liverpool 29 ... 67 27 690000000
## 1 2023 Arsenal 29 ... 70 24 525000000
## 2 2023 Manchester City 29 ... 63 28 800000000
## 3 2023 Aston Villa 30 ... 62 42 220000000
## 4 2023 Tottenham 30 ... 62 44 550000000
## 5 2023 Manchester United 29 ... 40 40 720000000
## 6 2023 West Ham 31 ... 50 55 290000000
## 7 2023 Newcastle 30 ... 64 52 305000000
## 8 2023 Brighton 29 ... 51 46 200000000
## 9 2023 Wolverhampton 30 ... 43 47 185000000
## 10 2023 Bournemouth 30 ... 44 53 160000000
## 11 2023 Chelsea 28 ... 49 47 600000000
## 12 2023 Fulham 31 ... 47 50 155000000
## 13 2023 Crystal Palace 30 ... 34 50 200000000
## 14 2023 Brentford 30 ... 42 55 175000000
## 15 2023 Everton 30 ... 31 42 230000000
## 16 2023 Nottingham Forest 31 ... 39 53 120000000
## 17 2023 Luton Town 30 ... 43 62 90000000
## 18 2023 Burnley 31 ... 32 66 125000000
## 19 2023 Sheffield United 29 ... 27 77 145000000
##
## [20 rows x 11 columns]
from sklearn.linear_model import LinearRegression
# Start of the linear regression: 'points' is the target; match results,
# goals and budget are the features.
X = merged_df[['wins', 'loses', 'drawns', 'goals_for', 'goals_against', 'Budget']]
y = merged_df['points']
# Keep 20% of the rows aside for testing; random_state=42 seeds the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
# We want to measure the model's performance on the test data.
## LinearRegression()
## In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# NOTE(review): the bare expression below looks like echoed notebook output;
# as code it only builds an estimator and discards it.
LinearRegression()
## Coefficient de détermination R^2 : 0.9866555232529355
## Generalized Linear Model Regression Results
## ==============================================================================
## Dep. Variable: points No. Observations: 20
## Model: GLM Df Residuals: 13
## Model Family: Poisson Df Model: 6
## Link Function: Log Scale: 1.0000
## Method: IRLS Log-Likelihood: -57.226
## Date: Wed, 03 Apr 2024 Deviance: 5.0185
## Time: 22:06:44 Pearson chi2: 4.88
## No. Iterations: 4 Pseudo R-squ. (CS): 0.9969
## Covariance Type: nonrobust
## =================================================================================
## coef std err z P>|z| [0.025 0.975]
## ---------------------------------------------------------------------------------
## const 2.5598 1.868 1.370 0.171 -1.101 6.221
## wins 0.0808 0.061 1.324 0.185 -0.039 0.200
## loses -0.0089 0.064 -0.139 0.889 -0.135 0.117
## drawns 0.0207 0.065 0.319 0.749 -0.106 0.148
## goals_for -0.0014 0.007 -0.210 0.834 -0.015 0.012
## goals_against 0.0042 0.007 0.587 0.557 -0.010 0.018
## Budget -6.905e-11 2.81e-10 -0.245 0.806 -6.21e-10 4.83e-10
## =================================================================================
# Number of matches to predict.
nb_matchs = 38
# Model coefficients.
# NOTE(review): `poisson_results` is fitted outside the visible part of this
# file.
coefficients = poisson_results.params
# Predicted points for each team.
predicted_points = poisson_results.predict(X)
# Extra points computed from the 'wins' and 'drawns' coefficients.
# NOTE(review): each term below is built only from a model coefficient and
# nb_matchs, so the same amount is added to every team's prediction and the
# ranking cannot change -- confirm this is intended.
predicted_wins_points = coefficients['wins'] * nb_matchs * 3
predicted_draws_points = coefficients['drawns'] * nb_matchs
predicted_losses_points = 0 # Losses contribute no points
# Add the extra points to the predicted points.
predicted_points += predicted_wins_points + predicted_draws_points + predicted_losses_points
# Rank the teams by predicted points, highest first.
predicted_points_ranking = predicted_points.sort_values(ascending=False)
## Team Predicted Points
## 0 Liverpool 81.660832
## 1 Arsenal 77.173682
## 2 Manchester City 75.662777
## 3 Aston Villa 71.973528
## 4 Tottenham 67.523994
## 5 Manchester United 54.511413
## 6 West Ham 53.150569
## 7 Newcastle 50.907270
## 9 Wolverhampton 49.194036
## 10 Bournemouth 48.995915
## 8 Brighton 48.860644
## 11 Chelsea 46.200437
## 12 Fulham 45.831204
## 15 Everton 38.847485
## 13 Crystal Palace 38.027723
## 16 Nottingham Forest 37.264880
## 14 Brentford 35.934365
## 17 Luton Town 33.092185
## 18 Burnley 31.560241
## 19 Sheffield United 30.514139
## ([<matplotlib.patches.Wedge object at 0x7f9de2e53c70>, <matplotlib.patches.Wedge object at 0x7f9db0ce4040>, <matplotlib.patches.Wedge object at 0x7f9db038a6a0>, <matplotlib.patches.Wedge object at 0x7f9dd1287fd0>, <matplotlib.patches.Wedge object at 0x7f9de3752070>, <matplotlib.patches.Wedge object at 0x7f9de37521f0>, <matplotlib.patches.Wedge object at 0x7f9de3725ca0>, <matplotlib.patches.Wedge object at 0x7f9de3725880>, <matplotlib.patches.Wedge object at 0x7f9de373c520>, <matplotlib.patches.Wedge object at 0x7f9dc0565f10>, <matplotlib.patches.Wedge object at 0x7f9de2e53370>, <matplotlib.patches.Wedge object at 0x7f9dc0546e50>, <matplotlib.patches.Wedge object at 0x7f9de3767a30>, <matplotlib.patches.Wedge object at 0x7f9db0cd1100>, <matplotlib.patches.Wedge object at 0x7f9db0cd1790>, <matplotlib.patches.Wedge object at 0x7f9db0cd1e20>, <matplotlib.patches.Wedge object at 0x7f9db0cd64f0>, <matplotlib.patches.Wedge object at 0x7f9db0cd6b80>, <matplotlib.patches.Wedge object at 0x7f9de376e250>, <matplotlib.patches.Wedge object at 0x7f9de376e8e0>], [Text(-0.9924703036396021, 0.47434449126506784, 'Liverpool'), Text(-1.0988933743603344, -0.049329015669869254, 'Arsenal'), Text(-0.9562170798069254, -0.5437360538768018, 'Manchester City'), Text(-0.6189714638830981, -0.9093263038636983, 'Aston Villa'), Text(-0.18250392958570028, -1.084754495582193, 'Tottenham'), Text(0.22966656669604427, -1.0757570674376495, 'Manchester United'), Text(0.5683288987129241, -0.9418079755914976, 'West Ham'), Text(0.8367957093067998, -0.7139838519782713, 'Newcastle'), Text(1.0143989308742982, -0.4254348469990214, 'Wolverhampton'), Text(1.095168072689031, -0.10298976921322552, 'Bournemouth'), Text(1.0761640754758193, 0.227751800553312, 'Brighton'), Text(0.9641574431877884, 0.5295284928552824, 'Chelsea'), Text(0.7769102981335405, 0.7787235636951365, 'Fulham'), Text(0.5490715891751276, 0.9531633595353423, 'Everton'), Text(0.3094047509951481, 1.0555892667423399, 'Crystal Palace'), 
Text(0.057738971740450694, 1.098483596209955, 'Nottingham Forest'), Text(-0.19003387108410233, 1.0834607181807705, 'Brentford'), Text(-0.4150321630138848, 1.018699319556078, 'Luton Town'), Text(-0.6089039587033693, 0.9160982311277354, 'Burnley'), Text(-0.7723501552759182, 0.7832466007875587, 'Sheffield United')], [Text(-0.5413474383488738, 0.25873335887185517, '8.0%'), Text(-0.5993963860147278, -0.02690673581992868, '7.6%'), Text(-0.5215729526219592, -0.2965833021146191, '7.4%'), Text(-0.3376207984816898, -0.4959961657438353, '7.1%'), Text(-0.0995475979558365, -0.5916842703175598, '6.6%'), Text(0.12527267274329684, -0.5867765822387179, '5.4%'), Text(0.3099975811161404, -0.5137134412317259, '5.2%'), Text(0.45643402325825433, -0.3894457374426934, '5.0%'), Text(0.5533085077496172, -0.23205537109037527, '4.8%'), Text(0.5973644032849259, -0.05617623775266846, '4.8%'), Text(0.5869985866231741, 0.12422825484726108, '4.8%'), Text(0.5259040599206117, 0.28883372337560853, '4.5%'), Text(0.4237692535273857, 0.4247583074700744, '4.5%'), Text(0.29949359409552406, 0.5199072870192776, '3.8%'), Text(0.1687662278155353, 0.5757759636776398, '3.7%'), Text(0.03149398458570037, 0.5991728706599754, '3.7%'), Text(-0.10365483877314673, 0.5909785735531474, '3.5%'), Text(-0.22638117982575529, 0.5556541743033152, '3.3%'), Text(-0.3321294320200196, 0.49968994425149194, '3.1%'), Text(-0.4212819028777735, 0.4272254186113956, '3.0%')])
## (-1.0999999945659023, 1.0999999984507463, -1.0999999971852474, 1.0999999995749006)
Cela peut s’expliquer par plusieurs facteurs :
Des caractéristiques intrinsèques aux joueurs sont inobservables.
Le nombre de variables pouvant être prises en compte est beaucoup trop grand.
La composition des équipes est changeante.
Pour une analyse plus poussée et, pourquoi pas, une prédiction plus
‘sensée’, il serait intéressant de :
Multiplier le nombre et la variété des données
Essayer d’autres modèles statistiques
Il serait intéressant d’automatiser le code de façon à ce qu’il intègre automatiquement les nouvelles données dès que celles-ci seraient disponibles.