e1071
tidyverse
This dataset includes all international football results from 1872 to 2019.
# Load the international match results.
# Column types are declared explicitly (matching the specification readr
# previously guessed) so parsing is reproducible and does not depend on
# type guessing from the first rows of the file.
Data <- read_csv(
  "results.csv",
  col_types = cols(
    date = col_date(format = ""),
    home_team = col_character(),
    away_team = col_character(),
    home_score = col_double(),
    away_score = col_double(),
    tournament = col_character(),
    city = col_character(),
    country = col_character(),
    neutral = col_logical()
  )
)
This dataset lists all the countries in the world. It is needed so that a region can be added to the results dataset.
# Load the country reference table (names, ISO codes, regions).
# Every column is read as character, which matches the specification readr
# previously guessed and keeps the code columns from being type-guessed.
Countries <- read_csv(
  "countries.csv",
  col_types = cols(.default = col_character())
)
Checking the data
# Per-column overview of the match results (ranges, quartiles, counts).
Data %>% summary()
## date home_team away_team
## Min. :1872-11-30 Length:40839 Length:40839
## 1st Qu.:1977-08-02 Class :character Class :character
## Median :1997-04-27 Mode :character Mode :character
## Mean :1990-06-19
## 3rd Qu.:2008-09-10
## Max. :2019-07-20
## home_score away_score tournament city
## Min. : 0.000 Min. : 0.000 Length:40839 Length:40839
## 1st Qu.: 1.000 1st Qu.: 0.000 Class :character Class :character
## Median : 1.000 Median : 1.000 Mode :character Mode :character
## Mean : 1.746 Mean : 1.188
## 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :31.000 Max. :21.000
## country neutral
## Length:40839 Mode :logical
## Class :character FALSE:30680
## Mode :character TRUE :10159
##
##
##
In this first summary, we can already start to see some important points, for example the teams with the most home and away games, or the city or country where most games have taken place.
After the first data exploration, creating a new variable for the more important tournaments will be very useful, as there are currently over 150 tournaments.
# Tournaments considered relevant for the later analysis.
# Each tournament is listed exactly once; the vector is only used for
# membership tests, so order does not matter.
rel_tour <- c(
  "Copa América",
  "Copa América qualification",
  "FIFA World Cup",
  "FIFA World Cup qualification",
  "UEFA Euro",
  "UEFA Euro qualification"
)
# Flag every match as belonging to a major tournament ("True") or not
# ("False"), stored as a factor.
Data <- mutate(
  Data,
  MajorTournament = as.factor(
    if_else(tournament %in% rel_tour, "True", "False")
  )
)
To reduce the amount of data, we create our own dataset with just the tournaments we selected earlier.
# Subset of matches played in one of the major tournaments.
MajorTournaments <- filter(Data, MajorTournament == "True")
Next we will create a new variable giving the region of the home and away teams.
# Adding the continent to the dataset
# Load the hand-maintained table of team names that could not be matched to a
# country on the first run (the code that finds them appears further down).
# read_csv2() is used because the file uses ',' as the decimal mark; column
# types are declared explicitly so parsing does not rely on type guessing.
errors <- read_csv2(
  "errors (2).csv",
  col_types = cols(
    home_team = col_character(),
    region = col_character()
  )
)
# Lookup table of country name -> region; the name column is called
# `home_team` so it can serve directly as the join key.
SelectedCountries <- select(Countries, home_team = name, region)
# Append the manually corrected names so teams spelled differently in the
# results data also receive a region.
ReadyData <- SelectedCountries %>%
  bind_rows(errors)
# Attach the home team's region to every major-tournament match.
AllCountries <- left_join(MajorTournaments, ReadyData, by = "home_team")
# How the unmatched names were found the first time: home teams whose region
# is still missing after the join.
# NOTE(review): this overwrites the `errors` table loaded from CSV earlier;
# re-running the `ReadyData` step afterwards would use this version instead.
errors <- distinct(filter(AllCountries, is.na(region)), home_team)
# The list was saved so the names could be fixed in Excel, which was the
# easiest and quickest way.
# write.csv(errors, file = "errors.csv")
# Re-check the joined dataset for home teams that still have no region
# (printed to the console).
distinct(filter(AllCountries, is.na(region)), home_team)
# Same lookup table, re-keyed for the away side: the key becomes `away_team`
# and the region column becomes `region2`.
AwayTeam <- rename(ReadyData, away_team = home_team, region2 = region)
# Matches with the region of both the home team (`region`) and the away team
# (`region2`).
# NOTE(review): missing regions were only checked for home teams; away teams
# without a match would get NA in `region2` - confirm.
HomeAndAway <- left_join(AllCountries, AwayTeam, by = "away_team")
We first create a variable to know who won, and then a variable that says H (home win), D (draw) or A (away win), so it will be easier to do the visualizations.
# Goal difference from the home side's point of view: positive means the home
# team won, zero means a draw, negative means the away team won.
HomeAndAway <- mutate(HomeAndAway, GoalsDifference = home_score - away_score)
# One dataset per outcome, each tagged with a `result` code:
# "H" = home win, "D" = draw, "A" = away win.
HomeWin <- HomeAndAway %>%
  filter(GoalsDifference > 0) %>%
  mutate(result = "H")
Draw <- HomeAndAway %>%
  filter(GoalsDifference == 0) %>%
  mutate(result = "D")
AwayWin <- HomeAndAway %>%
  filter(GoalsDifference < 0) %>%
  mutate(result = "A")
# Stack the three outcomes back into a single dataset.
Results <- bind_rows(HomeWin, AwayWin, Draw)
# Put the matches back into chronological order.
ResultsOrder <- order(Results$date)
WorldCup <- Results[ResultsOrder, ]
Quick comparison of games played at the World Cup between American and European teams
# `WorldCup` holds every major-tournament match (qualifiers and continental
# championships included), so each comparison is restricted to the
# "FIFA World Cup" tournament itself, as the analysis intends.
# NOTE(review): `home_team` may be nominal when `neutral` is TRUE; consider
# whether neutral-venue games should be excluded - confirm.

# Outcomes when a team from the Americas is the home side against a European
# team at the World Cup.
AmericaHome <- WorldCup %>%
  filter(
    tournament == "FIFA World Cup",
    region == "Americas",
    region2 == "Europe"
  )
ggplot(AmericaHome, aes(x = result)) +
  geom_bar(aes(fill = result))

# Outcomes when a European team is the home side against a team from the
# Americas at the World Cup.
EuropeHome <- WorldCup %>%
  filter(
    tournament == "FIFA World Cup",
    region == "Europe",
    region2 == "Americas"
  )
ggplot(EuropeHome, aes(x = result)) +
  geom_bar(aes(fill = result))