Import Data

# Read the match data from the Excel workbook
games <- read_excel(path = "../00_data/MyData_charts.xlsx")
# Auto-print the tibble for a quick preview of its columns
games
## # A tibble: 988 × 15
##     year country city    stage home_team away_team home_score away_score outcome
##    <dbl> <chr>   <chr>   <chr> <chr>     <chr>          <dbl>      <dbl> <chr>  
##  1  1930 Uruguay Montev… Grou… France    Mexico             4          1 H      
##  2  1930 Uruguay Montev… Grou… Belgium   United S…          0          3 A      
##  3  1930 Uruguay Montev… Grou… Brazil    Yugoslav…          1          2 A      
##  4  1930 Uruguay Montev… Grou… Peru      Romania            1          3 A      
##  5  1930 Uruguay Montev… Grou… Argentina France             1          0 H      
##  6  1930 Uruguay Montev… Grou… Chile     Mexico             3          0 H      
##  7  1930 Uruguay Montev… Grou… Bolivia   Yugoslav…          0          4 A      
##  8  1930 Uruguay Montev… Grou… Paraguay  United S…          0          3 A      
##  9  1930 Uruguay Montev… Grou… Uruguay   Peru               1          0 H      
## 10  1930 Uruguay Montev… Grou… Argentina Mexico             6          3 H      
## # ℹ 978 more rows
## # ℹ 6 more variables: win_conditions <chr>, winning_team <chr>,
## #   losing_team <chr>, date <dttm>, month <chr>, dayofweek <chr>

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows (matches) for each value of `country`
games %>%
  ggplot(aes(x = country)) +
  geom_bar() +
  labs(x = "Country", y = "Count") +
  # Tilt the x labels so the country names do not overlap
  theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1))

# Histogram of `year` using 4-unit-wide bins
games %>%
  ggplot(aes(x = year)) +
  geom_histogram(binwidth = 4)
## Warning: Removed 88 rows containing non-finite values (`stat_bin()`).

# Frequency polygons of total goals per match (home + away),
# drawn as one coloured line per `country`
games %>%
  ggplot(aes(x = home_score + away_score, colour = country)) +
  geom_freqpoly() +
  labs(x = "Total score", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).

Typical values

# Histogram of `year`, restricted to matches played after 1950
games %>%
  # Keep only rows whose year is strictly greater than 1950
  filter(year > 1950) %>%
  # Plot the remaining years in 4-unit-wide bins
  ggplot(mapping = aes(x = year)) +
  geom_histogram(binwidth = 4)

# Histogram of `year` with ggplot2's default binning (no binwidth supplied)
ggplot(data = games, mapping = aes(x = year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 88 rows containing non-finite values (`stat_bin()`).

Unusual values

# Histogram of away-team goals with ggplot2's default binning
ggplot(data = games, mapping = aes(x = away_score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).

# Same away-score histogram, with the y axis zoomed to counts 0-50 so that
# rarely occurring scores become visible
ggplot(data = games, mapping = aes(x = away_score)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).

Missing Values

# Treat unusually high away scores (> 3) as missing rather than dropping the
# rows, then plot away score against away team.
games %>%

  # filter(year < 1960 | year > 2000) %>%
  # Write the NA-masked values back into `away_score`, the column that is
  # plotted below. Assigning them to `year` would leave the plot unchanged
  # and silently overwrite the year column.
  # NOTE: the number of rows reported as removed in the rendered warning
  # changes once the document is re-knitted.
  mutate(away_score = ifelse(away_score > 3, NA, away_score)) %>%

  # Plot
  ggplot(aes(x = away_team, y = away_score)) +
  geom_point() +
  labs(x = "Away team", y = "Away score") +
  # Small, vertical x labels because there are many team names
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 4))
## Warning: Removed 62 rows containing missing values (`geom_point()`).

Covariation

A categorical and continuous variable

# Box plots of total goals per match (home + away) for each day of the week
ggplot(
  data = games,
  mapping = aes(x = dayofweek, y = home_score + away_score)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1))
## Warning: Removed 62 rows containing non-finite values (`stat_boxplot()`).

Two categorical variables

# Heat map of how often each winning team / losing team pairing occurs
games %>%
  # One row per (winning_team, losing_team) pair, with its frequency in `n`
  count(winning_team, losing_team) %>%
  ggplot(aes(x = winning_team, y = losing_team, fill = n)) +
  geom_tile() +
  # Shrink both axes' labels; rotate the x labels to keep them legible
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 4),
    axis.text.y = element_text(hjust = 1, size = 4)
  )

Two continuous variables

# Scatter plot of away-team goals against year; alpha = 1 draws fully
# opaque points (no transparency)
ggplot(data = games, mapping = aes(x = year, y = away_score)) +
  geom_point(alpha = 1)
## Warning: Removed 88 rows containing missing values (`geom_point()`).

# 2-D binned counts of home-team goals against year
ggplot(data = games, mapping = aes(x = year, y = home_score)) +
  geom_bin2d()
## Warning: Removed 88 rows containing non-finite values (`stat_bin2d()`).

# Box plots of home-team goals over time for matches after 1950;
# `year` is cut into 0.1-wide intervals and each interval gets its own box
games %>%
  filter(year > 1950) %>%
  ggplot(
    mapping = aes(x = year, y = home_score, group = cut_width(year, 0.1))
  ) +
  geom_boxplot()

Patterns and models