Source: wikipedia.com
I loaded In My Datasets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/Danny/OneDrive/Documents/Data_110/Final Project 2")
Bundesliga <- read_csv("bundesliga_player.csv")
## New names:
## Rows: 515 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (9): name, full_name, nationality, place_of_birth, position, foot, club... dbl
## (6): ...1, age, height, price, max_price, shirt_nr date (2): contract_expires,
## joined_club
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Filtered Out Any Columns I Wasn’t Going To Use
Bundesliga <- Bundesliga |>
select(-full_name, -player_agent, -outfitter, -height, -contract_expires, -joined_club, -outfitter)
Linear Between Price And Age For All Players In The Bundesliga
age_lnrg <-lm(price ~ age, data = Bundesliga)
summary(age_lnrg)
##
## Call:
## lm(formula = price ~ age, data = Bundesliga)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.130 -6.597 -4.428 0.678 109.320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.8570 3.6295 4.645 4.35e-06 ***
## age -0.3251 0.1387 -2.345 0.0194 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.59 on 508 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.01071, Adjusted R-squared: 0.008759
## F-statistic: 5.498 on 1 and 508 DF, p-value: 0.01942
Created A Prediction Model To Show How Age Effects Value
newdata <- data.frame(age = seq(min(Bundesliga$age), max(Bundesliga$age), length.out = 23))
newdata$predicted_price <- predict(age_lnrg, newdata = newdata)
Graphed All Players In The Bundesliga And There Value And Age
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
highchart() %>%
hc_add_series(Bundesliga, type = "scatter", hcaes(x = age, y = price)) %>%
hc_add_series(newdata, type = "line", hcaes(x = age, y = predicted_price)) %>%
hc_xAxis(title = list(text = "Age")) %>%
hc_yAxis(title = list(text = "Price")) %>%
hc_tooltip(formatter = JS("function() {
return 'Age: ' + this.x + '<br>' +
'Price: ' + this.y.toFixed(2);
}"))
I Decided That 500+ Points Was Too Much So I Filtered By Age Group
(Young, Middle Aged, Old)
Bundesliga_U23 <- Bundesliga %>%
filter(age >= 17 & age <= 23)
Bundesliga_U30 <- Bundesliga %>%
filter(age >= 24 & age <= 30)
Bundesliga_U39 <- Bundesliga %>%
filter(age >= 31 & age <= 39)
The Bundesliga Dataset Had A Lot Of Catergories For Position. So I
Made 13 Position Groups Into 3.
Bundesliga_U23$position[Bundesliga_U23$position == "Attack - Centre-Forward"] <- "Forward"
Bundesliga_U23$position[Bundesliga_U23$position == "Attack - Left Winger"] <- "Forward"
Bundesliga_U23$position[Bundesliga_U23$position == "Attack - Right Winger"] <- "Forward"
Bundesliga_U23$position[Bundesliga_U23$position == "Attack - Second Striker"] <- "Forward"
Bundesliga_U23$position[Bundesliga_U23$position == "midfield - Attacking Midfield"] <- "Midfielder"
Bundesliga_U23$position[Bundesliga_U23$position == "midfield - Left Midfield"] <- "Midfielder"
Bundesliga_U23$position[Bundesliga_U23$position == "midfield - Right Midfield"] <- "Midfielder"
Bundesliga_U23$position[Bundesliga_U23$position == "midfield - Central Midfield"] <- "Midfielder"
Bundesliga_U23$position[Bundesliga_U23$position == "midfield - Defensive Midfield"] <- "Midfielder"
Bundesliga_U23$position[Bundesliga_U23$position == "Defender - Left-Back"] <- "Defender"
Bundesliga_U23$position[Bundesliga_U23$position == "Defender - Centre-Back"] <- "Defender"
Bundesliga_U23$position[Bundesliga_U23$position == "Defender - Right-Back"] <- "Defender"
I graphed the value for players under 23
highchart() %>%
hc_title(text = "Value For Bundesliga Players Under 23") %>%
hc_xAxis(title = list(text = "Age")) %>%
hc_yAxis(title = list(text = "Value In Millions")) %>%
hc_tooltip(pointFormat = "<b>Name:</b> {point.name}<br>
<b>Value:</b> {point.x}<br>
<b>Age:</b> {point.y}") %>%
hc_colors(colors = c("#1E90FF", "#3366FF", "#CD5C5C","#DC143C")) %>%
hc_add_series(data = Bundesliga_U23 %>% filter(position == "Goalkeeper"),
type = "scatter",
hcaes(x = price, y = age),
name = "Goalkeeper") %>%
hc_add_series(data = Bundesliga_U23 %>% filter(position == "Defender"),
type = "scatter",
hcaes(x = price, y = age),
name = "Defender") %>%
hc_add_series(data = Bundesliga_U23 %>% filter(position == "Midfielder"),
type = "scatter",
hcaes(x = price, y = age),
name = "Midfielder") %>%
hc_add_series(data = Bundesliga_U23 %>% filter(position == "Forward"),
type = "scatter",
hcaes(x = price, y = age),
name = "Forward")
Grouped the age and mean value together and rounded the number to
give me a cleaner answer
mean_price_u23 <- Bundesliga_U23 %>%
group_by(age) %>%
summarize(mean_price = mean(price, na.rm = TRUE))
newdata$predicted_price <- round(newdata$predicted_price, digits = 1)
mean_price_u23$mean_price <- round(mean_price_u23$mean_price, digits = 1)
newdata_u23 <- newdata %>%
filter(age >= 17 & age <= 23)
Graphed the Mean Value For Players In The Bundesliga Aged 17-23
mean_U23 <- mean_price_u23$mean_price
new_U23 <- newdata_u23$predicted_price
age_u23 <- mean_price_u23$age
highchart() %>%
hc_title(text = "Bundesliga Mean Value For Ages 17-23") %>%
hc_xAxis(title = list(text = "Age")) %>%
hc_yAxis(title = list(text = "Mean Value In Millions")) %>%
hc_add_series(data = mean_price_u23, type = "line", hcaes(x = age, y = mean_price), name = "Mean Value In Millions") %>%
hc_add_series(data = newdata_u23, type = "line", hcaes(x = age, y = predicted_price), name = "Predicted Value In Millions") %>%
hc_legend(layout = "horizontal", verticalAlign = "bottom") %>%
hc_colors(colors = c("#3366FF", "#DC143c"))
Copy and pasted the code to filter the position group and make it
easier for graphing purposes
Bundesliga_U30$position[Bundesliga_U30$position == "Attack - Centre-Forward"] <- "Forward"
Bundesliga_U30$position[Bundesliga_U30$position == "Attack - Left Winger"] <- "Forward"
Bundesliga_U30$position[Bundesliga_U30$position == "Attack - Right Winger"] <- "Forward"
Bundesliga_U30$position[Bundesliga_U30$position == "Attack - Second Striker"] <- "Forward"
Bundesliga_U30$position[Bundesliga_U30$position == "midfield - Attacking Midfield"] <- "Midfielder"
Bundesliga_U30$position[Bundesliga_U30$position == "midfield - Left Midfield"] <- "Midfielder"
Bundesliga_U30$position[Bundesliga_U30$position == "midfield - Right Midfield"] <- "Midfielder"
Bundesliga_U30$position[Bundesliga_U30$position == "midfield - Central Midfield"] <- "Midfielder"
Bundesliga_U30$position[Bundesliga_U30$position == "midfield - Defensive Midfield"] <- "Midfielder"
Bundesliga_U30$position[Bundesliga_U30$position == "Defender - Left-Back"] <- "Defender"
Bundesliga_U30$position[Bundesliga_U30$position == "Defender - Centre-Back"] <- "Defender"
Bundesliga_U30$position[Bundesliga_U30$position == "Defender - Right-Back"] <- "Defender"
I wanted to see the correlation between older players and how there
value is tied to there age (players aged 31-39)
price_lnrg <- lm(price ~ age, data = Bundesliga_U39)
summary(price_lnrg)
##
## Call:
## lm(formula = price ~ age, data = Bundesliga_U39)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.427 -1.983 -1.127 -0.127 40.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.9610 9.4467 2.325 0.0224 *
## age -0.5753 0.2862 -2.010 0.0475 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.197 on 86 degrees of freedom
## Multiple R-squared: 0.04488, Adjusted R-squared: 0.03378
## F-statistic: 4.041 on 1 and 86 DF, p-value: 0.04753
Graphed age And value but also added max value to see where they
once were at there peak value to today and see the difference
highchart() %>%
hc_chart(type = "scatter") %>%
hc_title(text = "Value vs Max Value for Players Aged 31-39 In Bundesliga") %>%
hc_xAxis(title = list(text = "Price In Millions")) %>%
hc_yAxis(title = list(text = "Age")) %>%
hc_tooltip(pointFormat = "<b>Name:</b> {point.name}<br>
<b>Price:</b> {point.x}<br>
<b>Age:</b> {point.y}") %>%
hc_add_series(name = "Price", color = "#fb9a99", data = Bundesliga_U39, type = "scatter", hcaes(x = price, y = age,)) %>%
hc_add_series(name = "Max Price", color = "#a6cee3", data = Bundesliga_U39, type = "scatter", hcaes(x = max_price, y = age))
Final Paragraph
This dataset did have some interest points. Lets start from the top
the prediction model I created by calucating the mean value for each age
was right in predicting the decline in value as you age. With that said
it wasn’t accurate in predicting the exact value. We best see this in
the first graph calcuating the mean value for players aged 17-23. 17
years olds are only 1.4 million and the prediction model says they are
worth 11.3 million. As for the second graph there I decided to see if
foot preference really does effect your value and it does. I thought it
was going to be left footers with a higher value since the common saying
is lefty are rare and they are this dataset proved it. There was only 63
out 248 that used there left foot but there valuation was 3 million
lower on average than right footers. The Third graph I wanted to see how
much age really does effect your value so I compared age and there
current value and there max value. I was not suprised by the results
only 1 player was worth more than 20 million currently compared to the
max value which had 19 players over that. This data was filtered to show
players from the ages of 31 and up currently playing so age really does
effect your value. I wish I could have worked on a correlation between a
player position and there value but since this is catergorical it would
have been more time consuming to find a correlation.
I did some research on this topic. (https://www.researchgate.net/publication/361721165_Players_Market_Value_at_Risk_A_Model_and_Some_Evidence_for_the_German_Bundesliga)
In this article they go over the 2015/16 Bundesliga season. It goes more
individual players market values to help teams better create tools for
risk management when assessing talent. It found that goals and assists
have a higher impact on your value than playing time does. They used
age, market value, matches played, goals and assists and what position
they played. This could create situations where times over value these
stats and choose the wrong player. For example some who played 15 games
but scored 11 goals and got 3 assists has a higher value than a player
who played 34 games who got the same goal and assist involvements. but
the player who played 15 games was in and out of the time because of
injuries. On paper he looks better but when you add in injury history it
makes a difference. This is where the authors tried to implement the
risk management which is just looking at other statistical factors than
just goals and assists to better evalate players that your spending
millions on.