Analysis of the International football results

Comparison between American and European teams

Libraries

e1071

tidyverse

Loading the datasets

This dataset includes all the international football results from 1872 till 2019

# Load the match results; column types are guessed by readr
Data <- read_csv(file = "results.csv")
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   home_team = col_character(),
##   away_team = col_character(),
##   home_score = col_double(),
##   away_score = col_double(),
##   tournament = col_character(),
##   city = col_character(),
##   country = col_character(),
##   neutral = col_logical()
## )

This dataset includes all the countries in the world. It is needed to add each team's region to the results dataset.

# Load the country reference table; column types are guessed by readr
Countries <- read_csv(file = "countries.csv")
## Parsed with column specification:
## cols(
##   name = col_character(),
##   `alpha-2` = col_character(),
##   `alpha-3` = col_character(),
##   `country-code` = col_character(),
##   `iso_3166-2` = col_character(),
##   region = col_character(),
##   `sub-region` = col_character(),
##   `intermediate-region` = col_character(),
##   `region-code` = col_character(),
##   `sub-region-code` = col_character(),
##   `intermediate-region-code` = col_character()
## )

-First analysis

Checking the data

# Per-column overview of the match results
summary(object = Data)
##       date             home_team          away_team        
##  Min.   :1872-11-30   Length:40839       Length:40839      
##  1st Qu.:1977-08-02   Class :character   Class :character  
##  Median :1997-04-27   Mode  :character   Mode  :character  
##  Mean   :1990-06-19                                        
##  3rd Qu.:2008-09-10                                        
##  Max.   :2019-07-20                                        
##    home_score       away_score      tournament            city          
##  Min.   : 0.000   Min.   : 0.000   Length:40839       Length:40839      
##  1st Qu.: 1.000   1st Qu.: 0.000   Class :character   Class :character  
##  Median : 1.000   Median : 1.000   Mode  :character   Mode  :character  
##  Mean   : 1.746   Mean   : 1.188                                        
##  3rd Qu.: 2.000   3rd Qu.: 2.000                                        
##  Max.   :31.000   Max.   :21.000                                        
##    country           neutral       
##  Length:40839       Mode :logical  
##  Class :character   FALSE:30680    
##  Mode  :character   TRUE :10159    
##                                    
##                                    
## 

In this first summary, we can already start to see some important points, for example the teams leading the most home and away games or the city or country where most games have taken place.

Ideas from first analysis

After the first data exploration, creating a new variable that flags the more important tournaments will be very useful, as there are currently over 150 tournaments.

# Depending on what we want to analyse later, we pick the tournaments that are
# the most relevant.

# Relevant tournaments (each name listed once)
rel_tour <- c(
  "Copa América", "FIFA World Cup qualification", "FIFA World Cup",
  "UEFA Euro", "UEFA Euro qualification", "Copa América qualification"
)

# Flag every match that belongs to one of the relevant tournaments.
# The flag is stored as a "True"/"False" factor (not a logical) because later
# steps filter on the string "True".
Data <- Data %>%
  mutate(
    MajorTournament = factor(if_else(tournament %in% rel_tour, "True", "False"))
  )

To reduce the number of observations, we create our own dataset containing only the matches from the tournaments we selected earlier.

# Keep only the matches flagged as belonging to a major tournament
MajorTournaments <- filter(Data, MajorTournament == "True")

Next, we will create new variables holding the region of the home team and the region of the away team.

#Adding the continent to the dataset


# Load the hand-made corrections file (home_team, region). It was produced
# after a first run revealed teams without a region; the code that finds
# those teams appears further down.
errors <- read_csv2(file = "errors (2).csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
##   home_team = col_character(),
##   region = col_character()
## )
# Lookup table with just the country name and its region; the name column is
# called `home_team` so it can be joined directly onto the match data
SelectedCountries <- select(Countries, home_team = name, region)

# Append the countries with different names or errors encountered
ReadyData <- bind_rows(SelectedCountries, errors)

# Attach the home team's region to every major-tournament match
AllCountries <- left_join(MajorTournaments, ReadyData, by = "home_team")

# How the errors were found the first time: the distinct home teams that are
# still without a region after the join.
# Note: this overwrites the `errors` table loaded from the corrections file.
errors <- distinct(filter(AllCountries, is.na(region)), home_team)

# Saving the errors so they can be fixed in Excel, the easiest and quickest way.
#write.csv(errors, file = "errors.csv")

# Check the joined dataset for home teams that still have no region
distinct(filter(AllCountries, is.na(region)), home_team)

# Same lookup table, renamed so it describes the away team
# (only home-team names were checked for missing regions above)
AwayTeam <- rename(ReadyData, away_team = home_team, region2 = region)

# Dataset with a region for both the home team (`region`) and the away team
# (`region2`)
HomeAndAway <- left_join(AllCountries, AwayTeam, by = "away_team")

To know who won each match, we first compute the goal difference and then create a variable that says H (home win), D (draw) or A (away win), so it will be easier to do the visualizations.

# Goal difference from the home side's point of view:
# positive = home win, zero = draw, negative = away win
HomeAndAway <- mutate(HomeAndAway, GoalsDifference = home_score - away_score)

# One dataset per outcome, each labelled with a `result` code:
# "H" (home win), "D" (draw), "A" (away win)
HomeWin <- HomeAndAway %>%
  filter(GoalsDifference > 0) %>%
  mutate(result = "H")

Draw <- HomeAndAway %>%
  filter(GoalsDifference == 0) %>%
  mutate(result = "D")

AwayWin <- HomeAndAway %>%
  filter(GoalsDifference < 0) %>%
  mutate(result = "A")

# Stack the three outcomes back into a single dataset
Results <- bind_rows(HomeWin, AwayWin, Draw)

# Put the matches back in chronological order.
# Despite its name, `WorldCup` holds the matches of all the selected major
# tournaments, not only the FIFA World Cup.
ResultsOrder <- order(Results$date)
WorldCup <- Results[ResultsOrder, ]

Quick comparison of games played at the World Cup between American and European teams

# Outcome of World Cup matches where a team from the Americas is listed as the
# home side against a European team.
# `WorldCup` is built from every selected major tournament (qualifiers and
# continental championships included), so the tournament is filtered
# explicitly to match the stated "at the World Cup" scope.
AmericaHome <- WorldCup %>%
  filter(
    tournament == "FIFA World Cup",
    region == "Americas",
    region2 == "Europe"
  )

# Number of matches per result (H = home win, D = draw, A = away win)
ggplot(AmericaHome, aes(x = result)) +
  geom_bar(aes(fill = result))

# Outcome of World Cup matches where a European team is listed as the home
# side against a team from the Americas.
# `WorldCup` is built from every selected major tournament (qualifiers and
# continental championships included), so the tournament is filtered
# explicitly to match the stated "at the World Cup" scope.
EuropeHome <- WorldCup %>%
  filter(
    tournament == "FIFA World Cup",
    region == "Europe",
    region2 == "Americas"
  )

# Number of matches per result (H = home win, D = draw, A = away win)
ggplot(EuropeHome, aes(x = result)) +
  geom_bar(aes(fill = result))