3 dataframes total, filtering and cleaning
Load in raw csv files:
Filter out variables not related to our analysis
players <- playersraw[, (names(playersraw) %in% c("player_id", "name", "date_of_birth", "sub_position", "foot", "contract_expiration_date", "highest_market_value_in_eur"))]
games <- gamesraw[, (names(gamesraw) %in% c("game_id", "player_id", "date", "goals", "assists", "minutes_played"))]
player_market_values <- player_market_valuesraw[, !(names(player_market_valuesraw) %in% c("current_club_id"))]
players <- players %>% rename(position = sub_position, peak_value = highest_market_value_in_eur)
player_market_values <- player_market_values %>% rename(value = market_value_in_eur, league = player_club_domestic_competition_id)
Modify ‘games’ to include season: July-Dec of 1st year - Jan-Jun of 2nd year
games <- games %>%
mutate(
year = as.numeric(format(as.Date(date), "%Y")),
month = as.numeric(format(as.Date(date), "%m")),
season = ifelse(month >= 7,
paste0(year, "-", year + 1),
paste0(year - 1, "-", year))) %>%
select(-year, -month)
Get player stats by season; summarize the total goals, assists, and minutes played. Remove latest season (2024-25) stats as this season is incomplete.
player_season_stats <- games %>%
group_by(player_id, season) %>%
summarise(
season_goals = sum(goals),
season_assists = sum(assists),
season_minutes_played = sum(minutes_played)) %>%
filter(season != "2024-2025")
## `summarise()` has grouped output by 'player_id'. You can override using the
## `.groups` argument.
players_stats <- player_season_stats %>%
pivot_wider(names_from = season,
values_from = c(season_goals, season_assists, season_minutes_played),
names_sep = "_")
Sort player_market_values to get player values over time. Filter out any changes from 2024-25 season.
cutoff_date <- as.Date("2024-06-30")
player_market_values <- player_market_values %>%
filter(as.Date(date) <= cutoff_date) %>%
arrange(player_id) %>% group_by(player_id) %>%
mutate(observation = row_number()) %>%
ungroup()
players_market_value <- player_market_values %>%
pivot_wider(names_from = observation,
values_from = c(date, value, league))
Filter out past values, keeping only current values.
latest_player_values <- player_market_values %>%
group_by(player_id) %>%
slice_max(order_by = as.Date(date), n = 1) %>%
ungroup()
Combine player info, player seasonal stats, and player market value into one dataframe.
player_data <- players %>%
inner_join(players_stats, by = "player_id") %>%
inner_join(latest_player_values, by = "player_id")
Cleaning and filtering our combined data frame.
To filter for latest season: - Remove players who did not play at all in the latest season, 2023-24. - Remove players who did not have a market value update in the year 2024. - If a player has an expired contract (NA expiration value), set the expiry date to the last date of the latest full season (2024-06-30). - Additional: Filled in date of birth for player who was missing one.
# last season data filter
player_data <- player_data %>% filter(`season_minutes_played_2023-2024` > 0, year(as.Date(date)) == 2024) %>%
mutate(contract_expiration_date = if_else(is.na(contract_expiration_date),
as.Date("2024-06-30"), as.Date(contract_expiration_date)))
# adding date of birth for 1 player missing it:
player_data <- player_data %>%
mutate(date_of_birth = if_else(name == "Shinji Okazaki", as.Date("1986-04-16"), date_of_birth))
# # NA values checker
# na_all <- player_data %>% summarise(across(everything(), ~ sum(is.na(.))))
# print(na_all)
#
# na_count <- sum(is.na(player_data$date_of_birth))
# print(na_count)
#
# na_rows <- player_data %>%
# filter(is.na(date_of_birth))
# print(na_rows)
Filter for attackers who have at least 2000 career minutes. Create variables to track goals and assists per 90 minutes.
# should we include attacking midfield?
attackers <- player_data %>% filter(position %in% c("Centre-Forward", "Second Striker", "Attacking Midfield", "Left Winger", "Right Winger", "Winger"))
# Create total goals, assists, and minutes for all players. Remove players with less than 2000 minutes.
attackers <- attackers %>%
rowwise() %>%
mutate(total_goals = sum(c_across(starts_with("season_goals")), na.rm = TRUE),
total_assists = sum(c_across(starts_with("season_assists")), na.rm = TRUE),
total_minutes = sum(c_across(starts_with("season_minutes")), na.rm = TRUE)) %>%
filter(total_minutes >= 2000)
# Calculate per 90 minutes stats for all players.
attackers <- attackers %>%
mutate(career_goals_per90 = round((total_goals / total_minutes) * 90, 2),
career_assists_per90 = round((total_assists / total_minutes) * 90, 2))
Convert data so we get correct datatypes and values for age, position, contract length, player value. Change league to reflect the league’s world ranking coefficient. Ranking taken from here: https://www.transfermarkt.com/uefa/5jahreswertung/statistik/stat/plus/0?saison_id=2023 League names taken from here: https://www.kaggle.com/datasets/davidcariboo/player-scores/data?select=competitions.csv
# Convert date of birth into age
attackers <- attackers %>%
mutate(age = round((as.numeric(difftime(as.Date("2024-06-30"), date_of_birth, units = "days")) / 365.25), 2))
# Change position into a factor.
# Combine Left Winger/Right Winger, Second Striker/Attacking Midfielder
attackers <- attackers %>% mutate(position = case_when(
position %in% c("Left Winger", "Right Winger") ~ "Winger",
position %in% c("Second Striker", "Attacking Midfield") ~ "Attacking Midfielder",
position %in% c("Centre-Forward") ~ "Centre Forward")) %>%
mutate(position = as.factor(position))
# Convert contract expiry date into the number of years left.
attackers <- attackers %>%
mutate(contract_length = round((as.numeric(difftime(contract_expiration_date, as.Date("2024-06-30"), units = "days")) / 365.25), 2))
# Change player value units to be millions.
attackers <- attackers %>% mutate(value = value/1000000)
# Change leagues to reflect their world ranking coefficient.
# print(unique(attackers$league))
league_coefficients <- c("L1"=86.624, "RU1"=22.965, "ES1"=89.489, "NL1"=61.300,
"FR1"=66.831, "TR1"=38.600, "PO1"=56.316, "IT1"=90.284,
"GR1"=31.525, "GB1"=104.303, "DK1"=31.450,
"SC1"=36.050, "UKR1"=28.000, "BE1"=48.800)
attackers <- attackers %>%
mutate(league_coeff = league_coefficients[league])
Select relevant attributes for final dataframe.
# could also include: foot, peak_value,
data <- attackers %>% select(
player = name,
age,
position,
contract_length,
league_coeff,
season_goals = `season_goals_2023-2024`,
season_assists = `season_assists_2023-2024`,
season_minutes = `season_minutes_played_2023-2024`,
career_goals_per90,
career_assists_per90,
value)
# print(data %>% arrange(desc(value)))
Summarizing our data:
1496 observations, 11 variables.
Our preliminary model attempts to predict a player’s value given their age, position, the league they play in, and their latest season goals and assists total.
model <- lm(value ~ age + position + league_coeff + season_goals + season_assists, data)
summary(model)
##
## Call:
## lm(formula = value ~ age + position + league_coeff + season_goals +
## season_assists, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.359 -6.658 -1.182 4.611 128.392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.64285 2.80799 4.502 7.24e-06 ***
## age -0.88120 0.08930 -9.868 < 2e-16 ***
## positionCentre Forward -3.93898 1.00842 -3.906 9.80e-05 ***
## positionWinger -1.31364 0.91654 -1.433 0.152
## league_coeff 0.20959 0.01301 16.115 < 2e-16 ***
## season_goals 1.30014 0.07876 16.507 < 2e-16 ***
## season_assists 1.04012 0.12805 8.122 9.49e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.28 on 1489 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4814
## F-statistic: 232.3 on 6 and 1489 DF, p-value: < 2.2e-16
conduct a full analysis of the linear regression assumptions noting any violations and what led to your conclusions.