Import Data

# Load the match-level dataset. The CSV's first column has no header, so
# readr renames it to `...1` on import.
data <- readr::read_csv(file = "../00_data/MyData.csv")
## New names:
## Rows: 380 Columns: 23
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee dbl (17): ...1, FTHG, FTAG,
## HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows in each FTR category.
data %>%
    ggplot(aes(x = FTR)) +
    geom_bar()

# Histogram of HC.
# NOTE(review): HC appears to be a whole-number count -- confirm. With
# integer values a bin width of 0.5 leaves every other bin empty, so use
# one bin per value instead.
ggplot(data = data) +
    geom_histogram(mapping = aes(x = HC), binwidth = 1)

# Overlaid frequency polygons of HC, one line per FTR category.
# The bin width is set explicitly: without it ggplot2 falls back to 30 bins
# and emits a "Pick better value with `binwidth`" message.
ggplot(data = data, mapping = aes(x = HC, color = FTR)) +
    geom_freqpoly(binwidth = 1)

Typical values

# Zoom in on the low end of the distribution: keep only rows with HC < 5
# and draw a histogram with one bin per unit of HC.
ggplot(data = filter(data, HC < 5), mapping = aes(x = HC)) +
    geom_histogram(binwidth = 1)

Unusual values

# Histogram of FTHG (one bin per unit) to reveal rare, extreme values.
ggplot(data = data, mapping = aes(x = FTHG)) +
    geom_histogram(binwidth = 1)

Missing values

# Treat HC values below 1 or above 10 as missing and keep every other HC
# value unchanged, then plot HC against AC.
# The fallback branch must be HC itself: using AC here would overwrite HC
# with AC and the scatterplot would show AC against AC.
data %>%
    mutate(HC = ifelse(HC < 1 | HC > 10, NA, HC)) %>%
    ggplot(aes(x = HC, y = AC)) +
    geom_point()
## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Boxplots of HC, one box per FTR category.
ggplot(data = data, mapping = aes(x = FTR, y = HC)) +
    geom_boxplot()

Two categorical variables

# Count rows for every FTR/HTR combination and show the counts as a heat
# map: one tile per combination, fill colour mapped to the count `n`.
count(data, FTR, HTR) %>%
    ggplot(mapping = aes(x = FTR, y = HTR, fill = n)) +
    geom_tile()

Two continuous variables

# Hexagonal binning of HC against AC; each hexagon's fill shows how many
# rows fall inside it.
ggplot(data = data, mapping = aes(x = HC, y = AC)) +
    geom_hex()

Patterns and models

library(modelr)

# Linear model of HC as a function of HS, fitted on the raw scale.
mod <- lm(HC ~ HS, data = data)

# Add a `resid` column holding the part of HC that HS does not explain.
# The residuals are kept as-is: exponentiating them is only a valid
# back-transformation when the response was modelled on a log scale, which
# is not the case for `HC ~ HS`.
data2 <- data %>%
    modelr::add_residuals(mod)

# Scatterplot of the model residuals against FTHG.
ggplot(data = data2, mapping = aes(x = FTHG, y = resid)) +
    geom_point()