Import Data
# Load the data set from the shared data folder. readr reports 380 rows and
# 23 columns; the first CSV column has no header, so it is renamed to `...1`.
data <- read_csv(file = "../00_data/MyData.csv")
## New names:
## Rows: 380 Columns: 23
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): Date, HomeTeam, AwayTeam, FTR, HTR, Referee dbl (17): ...1, FTHG, FTAG,
## HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows for each value of the categorical variable FTR.
data %>%
  ggplot(aes(x = FTR)) +
  geom_bar()

# Histogram of HC using bins that are 0.5 wide.
# NOTE(review): if HC only takes whole-number values, a 0.5 binwidth leaves
# every other bin empty -- confirm whether binwidth = 1 was intended.
data %>%
  ggplot(aes(x = HC)) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of HC, one coloured line per FTR value, so the
# distributions can be overlaid and compared.
# No binwidth is given, so stat_bin() falls back to its default of 30 bins
# and prints a message saying so.
data %>%
  ggplot(aes(x = HC, colour = FTR)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Zoom in on the low end of the distribution: keep only rows with HC below 5
# and draw a histogram with one bin per unit of HC.
ggplot(data = filter(data, HC < 5), mapping = aes(x = HC)) +
  geom_histogram(binwidth = 1)

Unusual values
# Histogram of FTHG with unit-wide bins; sparse bars far from the bulk of the
# distribution point to unusual values.
ggplot(data = data, mapping = aes(x = FTHG)) +
  geom_histogram(binwidth = 1)

Missing Values
# Replace HC values outside the range 1-10 with NA (instead of dropping the
# rows), then plot HC against AC. geom_point() warns about the rows it has to
# skip because HC is now missing.
#
# The fallback branch of ifelse() must be HC itself: using AC there would
# overwrite every in-range HC with the AC value and the scatterplot would
# show AC against AC.
data %>%
  mutate(HC = ifelse(HC < 1 | HC > 10, NA, HC)) %>%
  ggplot(aes(x = HC, y = AC)) +
  geom_point()
## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
# Boxplot of the continuous variable HC within each category of FTR.
ggplot(data = data, mapping = aes(x = FTR, y = HC)) +
  geom_boxplot()

Two categorical variables
# Count the rows for every FTR/HTR combination and show the counts as a
# heat map: one tile per combination, filled according to the count `n`.
ggplot(data = count(data, FTR, HTR), mapping = aes(x = FTR, y = HTR)) +
  geom_tile(mapping = aes(fill = n))

Two continuous variables
# Hexagonal binning of HC against AC: the fill of each hexagon shows how many
# rows fall inside it, which avoids overplotting of repeated value pairs.
ggplot(data = data, mapping = aes(x = HC, y = AC)) +
  geom_hex()

Patterns and models
library(modelr)
# Fit a linear model of HC on HS. Its residuals are the part of HC that the
# linear relationship with HS does not account for.
mod <- lm(HC ~ HS, data = data)

# Attach the residuals as a `resid` column.
# The model is fitted on the raw scale (no log transform of the response), so
# the residuals are kept as they are. Exponentiating them is only a valid
# back-transform when the response was modelled on a log scale.
data2 <- data %>%
  modelr::add_residuals(mod)

# Plot the residuals against FTHG to look for any pattern that remains once
# the effect of HS has been removed.
data2 %>%
  ggplot(aes(x = FTHG, y = resid)) +
  geom_point()
