Import Data

# Load the match data from the Excel workbook; read_excel() returns a tibble.
data <- read_excel(path = "../00_data/MyData.xlsx")

data
## # A tibble: 900 × 15
##     year country city    stage home_team away_team home_score away_score outcome
##    <dbl> <chr>   <chr>   <chr> <chr>     <chr>          <dbl>      <dbl> <chr>  
##  1  1930 Uruguay Montev… Grou… France    Mexico             4          1 H      
##  2  1930 Uruguay Montev… Grou… Belgium   United S…          0          3 A      
##  3  1930 Uruguay Montev… Grou… Brazil    Yugoslav…          1          2 A      
##  4  1930 Uruguay Montev… Grou… Peru      Romania            1          3 A      
##  5  1930 Uruguay Montev… Grou… Argentina France             1          0 H      
##  6  1930 Uruguay Montev… Grou… Chile     Mexico             3          0 H      
##  7  1930 Uruguay Montev… Grou… Bolivia   Yugoslav…          0          4 A      
##  8  1930 Uruguay Montev… Grou… Paraguay  United S…          0          3 A      
##  9  1930 Uruguay Montev… Grou… Uruguay   Peru               1          0 H      
## 10  1930 Uruguay Montev… Grou… Argentina Mexico             6          3 H      
## # ℹ 890 more rows
## # ℹ 6 more variables: win_conditions <chr>, winning_team <chr>,
## #   losing_team <chr>, date <dttm>, month <chr>, dayofweek <chr>


## Introduction

## Questions

## Variation

### Visualizing distributions


``` r
# Bar chart: number of rows (matches) recorded for each year.
data %>%
  ggplot(aes(x = year)) +
  geom_bar()

# Same information as the bar chart, as a table of row counts per year.
count(data, year)
## # A tibble: 21 × 2
##     year     n
##    <dbl> <int>
##  1  1930    18
##  2  1934    17
##  3  1938    18
##  4  1950    22
##  5  1954    26
##  6  1958    35
##  7  1962    32
##  8  1966    32
##  9  1970    32
## 10  1974    38
## # ℹ 11 more rows
# Distribution of goals scored by the home team.
# home_score holds whole-number goal counts, so use one bin per goal value
# instead of the default 30 bins (which yields fractional-width bins with gaps
# and triggers the "Pick better value with `binwidth`" message).
ggplot(data = data) +
  geom_histogram(mapping = aes(x = home_score), binwidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Matches per tournament year, one line per outcome.
# The years in the data are spaced four years apart starting at 1930, so use
# 4-year bins centred on 1930: each vertex then sits on a tournament year
# instead of the default 30 bins cutting arbitrarily across the gaps.
ggplot(data = data, mapping = aes(x = year, colour = outcome)) +
  geom_freqpoly(binwidth = 4, center = 1930)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Keep only matches played after 2000, then count them per year
# (binwidth = 1 gives one bar per calendar year).
data %>%
  filter(year > 2000) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = year), binwidth = 1)

Unusual values

# Look for unusually high home scores.
# One bin per goal count (binwidth = 1) keeps rare high scores visible as
# their own bars and avoids the default `bins = 30` message.
data %>%
  ggplot(aes(x = home_score)) +
  geom_histogram(binwidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values

# Treat home scores above 3 as missing, then plot what remains per home team.
# geom_point() drops the NA rows and reports them in a warning.
data %>%
  mutate(home_score = if_else(home_score > 3, NA_real_, home_score)) %>%
  ggplot(mapping = aes(x = home_team, y = home_score)) +
  geom_point() +
  # Rotate the team names so the x-axis labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 86 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Distribution of away-team goals within each match outcome.
ggplot(data, mapping = aes(x = outcome, y = away_score)) +
  geom_boxplot()

Two categorical variables

# Count matches for every outcome / home-team pair and show the counts as a
# heat map (tile fill = number of matches).
count(data, outcome, home_team) %>%
  ggplot(mapping = aes(x = outcome, y = home_team)) +
  geom_tile(mapping = aes(fill = n))

Two continuous variables

# Joint distribution of away and home goals, binned into hexagons.
ggplot(data, mapping = aes(x = away_score, y = home_score)) +
  geom_hex()

Patterns and models

library(modelr)

# Linear model of away goals on home goals, both on a log scale.
# The "+ 1" keeps the log defined for matches where a side scored zero.
mod <- lm(
  formula = log(away_score + 1) ~ log(home_score + 1),
  data = data
)

# Attach the model residuals as `resid`, then exponentiate them to undo the
# log transform applied to the response.
data4 <- mutate(add_residuals(data, mod), resid = exp(resid))

# Back-transformed residuals against home goals: what is left of the away
# score once the fitted relationship with home score is removed.
ggplot(data4, mapping = aes(x = home_score, y = resid)) +
  geom_point()