# Chunk default for the knitted document: echo = TRUE shows each chunk's
# source code together with its output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the player data from an explicit path instead of calling setwd(),
# which would change the working directory for everything that runs afterwards.
# The first column of the CSV has no header, so readr names it `X1`.
newtable <- read_csv(path.expand(file.path("~/Desktop/DATA110", "data.csv")))
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## X1 = col_double(),
## ID = col_double(),
## Age = col_double(),
## Overall = col_double(),
## Potential = col_double(),
## Special = col_double(),
## `International Reputation` = col_double(),
## `Weak Foot` = col_double(),
## `Skill Moves` = col_double(),
## `Jersey Number` = col_double(),
## Crossing = col_double(),
## Finishing = col_double(),
## HeadingAccuracy = col_double(),
## ShortPassing = col_double(),
## Volleys = col_double(),
## Dribbling = col_double(),
## Curve = col_double(),
## FKAccuracy = col_double(),
## LongPassing = col_double(),
## BallControl = col_double()
## # ... with 24 more columns
## )
## ℹ Use `spec()` for the full column specifications.
# Swap every space in the column names for an underscore so the columns can
# be referenced without backticks (e.g. `Weak Foot` becomes Weak_Foot).
names(newtable) <- gsub(" ", "_", names(newtable), fixed = TRUE)
# Print the tibble to confirm the renamed columns.
newtable
## # A tibble: 18,207 x 89
## X1 ID Name Age Photo Nationality Flag Overall Potential Club
## <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 0 158023 L. M… 31 http… Argentina http… 94 94 FC B…
## 2 1 20801 Cris… 33 http… Portugal http… 94 94 Juve…
## 3 2 190871 Neym… 26 http… Brazil http… 92 93 Pari…
## 4 3 193080 De G… 27 http… Spain http… 91 93 Manc…
## 5 4 192985 K. D… 27 http… Belgium http… 91 92 Manc…
## 6 5 183277 E. H… 27 http… Belgium http… 91 91 Chel…
## 7 6 177003 L. M… 32 http… Croatia http… 91 91 Real…
## 8 7 176580 L. S… 31 http… Uruguay http… 91 91 FC B…
## 9 8 155862 Serg… 32 http… Spain http… 91 91 Real…
## 10 9 200389 J. O… 25 http… Slovenia http… 90 93 Atlé…
## # … with 18,197 more rows, and 79 more variables: Club_Logo <chr>, Value <chr>,
## # Wage <chr>, Special <dbl>, Preferred_Foot <chr>,
## # International_Reputation <dbl>, Weak_Foot <dbl>, Skill_Moves <dbl>,
## # Work_Rate <chr>, Body_Type <chr>, Real_Face <chr>, Position <chr>,
## # Jersey_Number <dbl>, Joined <chr>, Loaned_From <chr>,
## # Contract_Valid_Until <chr>, Height <chr>, Weight <chr>, LS <chr>, ST <chr>,
## # RS <chr>, LW <chr>, LF <chr>, CF <chr>, RF <chr>, RW <chr>, LAM <chr>,
## # CAM <chr>, RAM <chr>, LM <chr>, LCM <chr>, CM <chr>, RCM <chr>, RM <chr>,
## # LWB <chr>, LDM <chr>, CDM <chr>, RDM <chr>, RWB <chr>, LB <chr>, LCB <chr>,
## # CB <chr>, RCB <chr>, RB <chr>, Crossing <dbl>, Finishing <dbl>,
## # HeadingAccuracy <dbl>, ShortPassing <dbl>, Volleys <dbl>, Dribbling <dbl>,
## # Curve <dbl>, FKAccuracy <dbl>, LongPassing <dbl>, BallControl <dbl>,
## # Acceleration <dbl>, SprintSpeed <dbl>, Agility <dbl>, Reactions <dbl>,
## # Balance <dbl>, ShotPower <dbl>, Jumping <dbl>, Stamina <dbl>,
## # Strength <dbl>, LongShots <dbl>, Aggression <dbl>, Interceptions <dbl>,
## # Positioning <dbl>, Vision <dbl>, Penalties <dbl>, Composure <dbl>,
## # Marking <dbl>, StandingTackle <dbl>, SlidingTackle <dbl>, GKDiving <dbl>,
## # GKHandling <dbl>, GKKicking <dbl>, GKPositioning <dbl>, GKReflexes <dbl>,
## # Release_Clause <chr>
# Wage is read as text such as "€565K". Strip the euro sign and the "K"
# suffix, convert what is left to a number, and only then filter, so that
# `Wage > 0` is a numeric comparison rather than a comparison of strings.
# The "K" is dropped without rescaling, so Wage stays in thousands of euros.
euro <- "\u20AC"
newtable <- newtable %>%
  mutate(
    Wage = gsub(euro, "", Wage, fixed = TRUE),
    Wage = gsub("K", "", Wage, fixed = TRUE),
    Wage = as.double(Wage)
  ) %>%
  # Keep only rows with a positive wage; rows with a missing wage are dropped.
  filter(Wage > 0)
# Label each player with an age band. Intervals are closed on the left
# (right = FALSE), so 20 falls in "20 - 25", 26 in "26 - 30", and so on;
# everything below 20 is "10 - 19" and everything from 41 up is "41 - 50".
# The result is stored as a character vector; missing ages stay NA.
newtable$Age_Group <- as.character(
  cut(
    newtable$Age,
    breaks = c(-Inf, 20, 26, 31, 36, 41, Inf),
    labels = c("10 - 19", "20 - 25", "26 - 30", "31 - 35", "36 - 40", "41 - 50"),
    right = FALSE
  )
)
# Histogram of player ages, with bars coloured by age group.
# Ages are whole numbers, so one-year bins give each age its own bar. This
# replaces the arbitrary default of 30 bins, which triggers the stat_bin()
# message and can put several age groups into one bar.
p1 <- newtable %>%
  ggplot(mapping = aes(x = Age, fill = Age_Group)) +
  geom_histogram(binwidth = 1) +
  scale_fill_manual(
    values = c(
      "10 - 19" = "#74b2ad",
      "20 - 25" = "#aaa4d6",
      "26 - 30" = "#ef9151",
      "31 - 35" = "#ef91d6",
      "36 - 40" = "#efc115",
      "41 - 50" = "#d11913"
    )
  ) +
  ggtitle("Player Count by Age Group") +
  xlab("Player Age") +
  ylab("Count")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Recode the 1-5 reputation score as an ordered set of labels in one step.
# Doing it with factor(levels =, labels =) avoids assigning strings into a
# numeric column one value at a time, which silently converts the column to
# character part-way through. Values outside 1-5 and missing values become NA.
newtable$International_Reputation <- factor(
  newtable$International_Reputation,
  levels = 1:5,
  labels = c(
    "Below Avg Rank",
    "Avg Rank",
    "Above Avg Rank",
    "High Rank",
    "Highest Rank"
  )
)
# Keep only players that have an international reputation label.
IReputation <- newtable %>%
  filter(!is.na(International_Reputation))

# Scatterplot of wage against age, coloured by international reputation.
# The points are mapped to `color`, so the matching scale is the colour scale;
# the fill scale used before had no effect on this plot.
# Wage had its "K" suffix stripped during cleaning, so the axis is in
# thousands of euros.
p2 <- IReputation %>%
  ggplot(aes(x = Age, y = Wage, color = International_Reputation)) +
  geom_point() +
  scale_color_discrete() +
  ggtitle("Wage as a Function of Age") +
  xlab("Player Age") +
  ylab("Player Wage (thousands of euros)")
p2
Source: https://www.kaggle.com/karangadiya/fifa19. Data scraped from https://sofifa.com/ and inspired by this dataset: https://www.kaggle.com/thec03u5/fifa-18-demo-player-dataset. For this project, I worked with a football analytics dataset titled “FIFA 19 Complete Player Dataset”.
The dataset consists of 89 columns in total, comprising integer, string and URL variables. They include the latest-edition FIFA 2019 player attributes such as Wage, Age, Nationality, Potential, Club, Value and International Reputation, among others.
I decided to work with the Wage column, which is a chr variable formatted as €565K, for instance. In order to clean the data, I started by removing the euro symbol and the letter K to make it easier to work with numerical data and perform operations on that data. I looked up the Unicode code point for the euro sign, stored it in a variable, and replaced each instance of the symbol with an empty string using the gsub function. I repeated this process to get rid of the character K in the same column, so the remaining numbers are wages in thousands of euros. To finish, I filtered out the values that were less than or equal to 0 and stored the Wage column back into the dataset as a dbl using mutate.
In my first plot I was looking to get a sense of the player distribution with respect to age. Unsurprisingly perhaps, a large percentage of players are between the ages of 20 and 30, with the numbers dropping sharply around age 35 along with the earning potential. This age, however, is quite close to the ages of the five highest-earning players, who are 31 to 33 years old, which is an interesting contrast. But generally the numbers fall sharply for players around age 35 and beyond, both in terms of the number of players who are active and their earning potential.
For my second plot, I thought it was interesting to see how, at first glance, there appears to be a strong association between wage and international player ranking. Intuitively, it also makes sense that some of the lowest-earning players will have a lower ranking while the high earners will have a higher ranking. However, this trend isn’t as straightforward as it may appear at first. On close inspection, it appears that while those with an international ranking equal to 1 are earning less, it is not always the case that those ranked the highest are also the highest earners. This is especially true for wages between 150 and 300 thousand euros (150K - 300K), where there is no obvious trend and there are players ranked 2, 3, 4 and even 5 in that range. One hypothesis is that there are other variables involved, such as playing for a more well-known club, that are influencing the underlying trend. Overall, though, it seems that player earnings increase progressively as they move up from ranking 1 to 5.
I would have liked to show the player's name when a point is hovered over in the scatterplot above. I know we will be learning how to do that soon, so I hope to incorporate that into a future assignment. I also wish I had been able to work with some of the more interesting visualizations that we have seen in class, such as treemaps, heatmaps, streamgraphs or alluvials. Unfortunately, because I spent time looking up how to clean my data, I chose to go with a plot that I already had a bit more experience with in order to finish everything on time, but I look forward to working with a more advanced visualization for my next project!