Import Data
# Load the match data from the Excel workbook and print it for a quick look.
data <- read_excel(path = "../00_data/MyData.xlsx")
data
## # A tibble: 900 × 15
## year country city stage home_team away_team home_score away_score outcome
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 1930 Uruguay Montev… Grou… France Mexico 4 1 H
## 2 1930 Uruguay Montev… Grou… Belgium United S… 0 3 A
## 3 1930 Uruguay Montev… Grou… Brazil Yugoslav… 1 2 A
## 4 1930 Uruguay Montev… Grou… Peru Romania 1 3 A
## 5 1930 Uruguay Montev… Grou… Argentina France 1 0 H
## 6 1930 Uruguay Montev… Grou… Chile Mexico 3 0 H
## 7 1930 Uruguay Montev… Grou… Bolivia Yugoslav… 0 4 A
## 8 1930 Uruguay Montev… Grou… Paraguay United S… 0 3 A
## 9 1930 Uruguay Montev… Grou… Uruguay Peru 1 0 H
## 10 1930 Uruguay Montev… Grou… Argentina Mexico 6 3 H
## # ℹ 890 more rows
## # ℹ 6 more variables: win_conditions <chr>, winning_team <chr>,
## # losing_team <chr>, date <dttm>, month <chr>, dayofweek <chr>
## Introduction
## Questions
## Variation
### Visualizing distributions
``` r
# Bar chart: number of rows (matches) recorded for each year.
data %>%
  ggplot(aes(x = year)) +
  geom_bar()

# Same information as the bar chart above, as a table of counts per year.
count(data, year)
## # A tibble: 21 × 2
## year n
## <dbl> <int>
## 1 1930 18
## 2 1934 17
## 3 1938 18
## 4 1950 22
## 5 1954 26
## 6 1958 35
## 7 1962 32
## 8 1966 32
## 9 1970 32
## 10 1974 38
## # ℹ 11 more rows
# Distribution of goals scored by the home side.
# The printed data shows home_score as whole-number goal counts, so use one
# bin per goal value instead of the default 30 bins, which triggers the
# `stat_bin()` message and puts bin edges at arbitrary fractional positions.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = home_score), binwidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number of matches per year, drawn as one line per outcome code.
# The years shown by count(year) are spaced in multiples of four, so a
# four-year bin centred on 1930 holds at most one of those years per bin.
# The default 30 bins do not line up with that spacing and trigger the
# `stat_bin()` message.
ggplot(data = data, mapping = aes(x = year, colour = outcome)) +
  geom_freqpoly(binwidth = 4, center = 1930)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Keep only matches played after 2000 and show how many fall in each year
# (one-year bins).
ggplot(filter(data, year > 2000), aes(x = year)) +
  geom_histogram(binwidth = 1)

Unusual values
# Look for unusually high home scores.
# One bin per goal value (binwidth = 1) gives each score its own bar, so rare
# high scores are easier to spot than with the default 30 bins, and the
# `stat_bin()` message goes away.
data %>%
  ggplot(aes(x = home_score)) +
  geom_histogram(binwidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
# Replace home scores above 3 with NA, then plot the remaining scores per home
# team. geom_point() drops the NA rows and reports how many it removed.
data %>%
  mutate(
    home_score = ifelse(test = home_score > 3, yes = NA, no = home_score)
  ) %>%
  ggplot(mapping = aes(x = home_team, y = home_score)) +
  geom_point() +
  # Rotate the team names so the x-axis labels do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 86 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
# Box plot of away-team goals for each outcome code.
ggplot(data, aes(x = outcome, y = away_score)) +
  geom_boxplot()

Two categorical variables
# Count matches for every outcome / home-team pair and show the counts as a
# heat map, with the fill colour encoding the count `n`.
data %>%
  count(outcome, home_team) %>%
  ggplot() +
  geom_tile(mapping = aes(x = outcome, y = home_team, fill = n))

Two continuous variables
# Hexagonal binning of away score (x) against home score (y).
ggplot(data, aes(x = away_score, y = home_score)) +
  geom_hex()

Patterns and models
library(modelr)

# Linear model of away goals on home goals, both on the log(x + 1) scale.
# The + 1 keeps log() finite when a score is 0.
mod <- lm(log(away_score + 1) ~ log(home_score + 1), data = data)

# Attach the model residuals (column `resid`) and exponentiate them.
data4 <- mutate(add_residuals(data, mod), resid = exp(resid))

# Plot the exponentiated residuals against the home score.
ggplot(data4, aes(x = home_score, y = resid)) +
  geom_point()
