Gathering relevant information

3 dataframes total, filtering and cleaning

Load in raw csv files:

Filter out variables not related to our analysis

players <- playersraw[, (names(playersraw) %in% c("player_id", "name", "date_of_birth", "sub_position", "foot", "contract_expiration_date", "highest_market_value_in_eur"))]
games <- gamesraw[, (names(gamesraw) %in% c("game_id", "player_id", "date", "goals", "assists", "minutes_played"))]
player_market_values <- player_market_valuesraw[, !(names(player_market_valuesraw) %in% c("current_club_id"))]

players <- players %>% rename(position = sub_position, peak_value = highest_market_value_in_eur)
player_market_values <- player_market_values %>% rename(value = market_value_in_eur, league = player_club_domestic_competition_id)

Modify ‘games’ to include season: July-Dec of 1st year - Jan-Jun of 2nd year

games <- games %>%
  mutate(
    year = as.numeric(format(as.Date(date), "%Y")),
    month = as.numeric(format(as.Date(date), "%m")),
    season = ifelse(month >= 7, 
                    paste0(year, "-", year + 1),
                    paste0(year - 1, "-", year))) %>%
  select(-year, -month)

Get player stats by season; summarize the total goals, assists, and minutes played. Remove latest season (2024-25) stats as this season is incomplete.

player_season_stats <- games %>%
  group_by(player_id, season) %>%
  summarise(
    season_goals = sum(goals),
    season_assists = sum(assists),
    season_minutes_played = sum(minutes_played)) %>%
  filter(season != "2024-2025")
## `summarise()` has grouped output by 'player_id'. You can override using the
## `.groups` argument.
players_stats <- player_season_stats %>%
  pivot_wider(names_from = season,
    values_from = c(season_goals, season_assists, season_minutes_played),
    names_sep = "_")

Sort player_market_values to get player values over time. Filter out any changes from 2024-25 season.

cutoff_date <- as.Date("2024-06-30")
player_market_values <- player_market_values %>%
  filter(as.Date(date) <= cutoff_date) %>%
  arrange(player_id) %>% group_by(player_id) %>%
  mutate(observation = row_number()) %>%
  ungroup()

players_market_value <- player_market_values %>%
  pivot_wider(names_from = observation,
    values_from = c(date, value, league))

Filter out past values, keeping only current values.

latest_player_values <- player_market_values %>%
  group_by(player_id) %>%
  slice_max(order_by = as.Date(date), n = 1) %>%
  ungroup()

Combine player info, player seasonal stats, and player market value into one dataframe.

player_data <- players %>%
  inner_join(players_stats, by = "player_id") %>%
  inner_join(latest_player_values, by = "player_id")

Data cleaning

Cleaning and filtering our combined data frame.

To filter for latest season: - Remove players who did not play at all in the latest season, 2023-24. - Remove players who did not have a market value update in the year 2024. - If a player has an expired contract (NA expiration value), set the expiry date to the last date of the latest full season (2024-06-30). - Additional: Filled in date of birth for player who was missing one.

# last season data filter
player_data <- player_data %>% filter(`season_minutes_played_2023-2024` > 0, year(as.Date(date)) == 2024) %>%
    mutate(contract_expiration_date = if_else(is.na(contract_expiration_date), 
                                            as.Date("2024-06-30"), as.Date(contract_expiration_date)))

# adding date of birth for 1 player missing it:
player_data <- player_data %>%
  mutate(date_of_birth = if_else(name == "Shinji Okazaki", as.Date("1986-04-16"), date_of_birth))

# # NA values checker
# na_all <- player_data %>% summarise(across(everything(), ~ sum(is.na(.))))
# print(na_all)
# 
# na_count <- sum(is.na(player_data$date_of_birth))
# print(na_count)
# 
# na_rows <- player_data %>%
#   filter(is.na(date_of_birth))
# print(na_rows)

Filter for attackers who have at least 2000 career minutes. Create variables to track goals and assists per 90 minutes.

# should we include attacking midfield?
attackers <- player_data %>% filter(position %in% c("Centre-Forward", "Second Striker", "Attacking Midfield", "Left Winger", "Right Winger", "Winger"))

# Create total goals, assists, and minutes for all players. Remove players with less than 2000 minutes.
attackers <- attackers %>%
  rowwise() %>%
  mutate(total_goals = sum(c_across(starts_with("season_goals")), na.rm = TRUE),
    total_assists = sum(c_across(starts_with("season_assists")), na.rm = TRUE),
    total_minutes = sum(c_across(starts_with("season_minutes")), na.rm = TRUE)) %>% 
  filter(total_minutes >= 2000)

# Calculate per 90 minutes stats for all players.
attackers <- attackers %>%
  mutate(career_goals_per90 = round((total_goals / total_minutes) * 90, 2),
    career_assists_per90 = round((total_assists / total_minutes) * 90, 2))

Convert data so we get correct datatypes and values for age, position, contract length, player value. Change league to reflect the league’s world ranking coefficient. Ranking taken from here: https://www.transfermarkt.com/uefa/5jahreswertung/statistik/stat/plus/0?saison_id=2023 League names taken from here: https://www.kaggle.com/datasets/davidcariboo/player-scores/data?select=competitions.csv

# Convert date of birth into age
attackers <- attackers %>%
  mutate(age = round((as.numeric(difftime(as.Date("2024-06-30"), date_of_birth, units = "days")) / 365.25), 2))

# Change position into a factor. 
# Combine Left Winger/Right Winger, Second Striker/Attacking Midfielder
attackers <- attackers %>% mutate(position = case_when(
    position %in% c("Left Winger", "Right Winger") ~ "Winger",
    position %in% c("Second Striker", "Attacking Midfield") ~ "Attacking Midfielder",
    position %in% c("Centre-Forward") ~ "Centre Forward")) %>%
  mutate(position = as.factor(position))

# Convert contract expiry date into the number of years left.
attackers <- attackers %>% 
  mutate(contract_length = round((as.numeric(difftime(contract_expiration_date, as.Date("2024-06-30"), units = "days")) / 365.25), 2))

# Change player value units to be millions.
attackers <- attackers %>% mutate(value = value/1000000)

# Change leagues to reflect their world ranking coefficient.
# print(unique(attackers$league))
league_coefficients <- c("L1"=86.624, "RU1"=22.965, "ES1"=89.489, "NL1"=61.300, 
                         "FR1"=66.831, "TR1"=38.600, "PO1"=56.316, "IT1"=90.284, 
                         "GR1"=31.525, "GB1"=104.303, "DK1"=31.450, 
                         "SC1"=36.050, "UKR1"=28.000, "BE1"=48.800)

attackers <- attackers %>%
  mutate(league_coeff = league_coefficients[league])

Select relevant attributes for final dataframe.

# could also include: foot, peak_value, 
data <- attackers %>% select(
    player = name, 
    age, 
    position, 
    contract_length,
    league_coeff,
    season_goals = `season_goals_2023-2024`,
    season_assists = `season_assists_2023-2024`,
    season_minutes = `season_minutes_played_2023-2024`,
    career_goals_per90,
    career_assists_per90,
    value)

# print(data %>% arrange(desc(value)))

Data Description

Summarizing our data:

1496 observations, 11 variables.

Preliminary results

Our preliminary model attempts to predict a player’s value given their age, position, the league they play in, and their latest season goals and assists total.

model <- lm(value ~ age + position + league_coeff + season_goals + season_assists, data)
summary(model)
## 
## Call:
## lm(formula = value ~ age + position + league_coeff + season_goals + 
##     season_assists, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.359  -6.658  -1.182   4.611 128.392 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            12.64285    2.80799   4.502 7.24e-06 ***
## age                    -0.88120    0.08930  -9.868  < 2e-16 ***
## positionCentre Forward -3.93898    1.00842  -3.906 9.80e-05 ***
## positionWinger         -1.31364    0.91654  -1.433    0.152    
## league_coeff            0.20959    0.01301  16.115  < 2e-16 ***
## season_goals            1.30014    0.07876  16.507  < 2e-16 ***
## season_assists          1.04012    0.12805   8.122 9.49e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.28 on 1489 degrees of freedom
## Multiple R-squared:  0.4835, Adjusted R-squared:  0.4814 
## F-statistic: 232.3 on 6 and 1489 DF,  p-value: < 2.2e-16

conduct a full analysis of the linear regression assumptions noting any violations and what led to your conclusions.