Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of Batting rows for each teamID.
ggplot(data = Batting, mapping = aes(x = teamID)) +
  geom_bar()

# Histogram of H, using bins 100 units wide.
ggplot(data = Batting, aes(x = H)) +
  geom_histogram(binwidth = 100)

# Same histogram of H, restricted to rows where X2B is below 100.
ggplot(
  data = filter(Batting, X2B < 100),
  mapping = aes(x = H)
) +
  geom_histogram(binwidth = 100)

# Frequency polygon of H (binned counts drawn as a line).
# The original also mapped `color = H`, but stat_bin() collapses each group
# to per-bin counts, so a per-row continuous colour cannot survive the
# binning and is discarded (recent ggplot2 versions warn about the dropped
# aesthetic). To overlay several distributions, map colour to a categorical
# column instead.
Batting %>%
  ggplot(aes(x = H)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Histogram of X2B (bins 50 wide), keeping only rows with X2B above zero.
ggplot(
  data = filter(Batting, X2B > 0),
  mapping = aes(x = X2B)
) +
  geom_histogram(binwidth = 50)

# Histogram of HR with bins 10 units wide.
ggplot(data = Batting, mapping = aes(x = HR)) +
  geom_histogram(binwidth = 10)

Unusual values

# Histogram of RBI; no binwidth is given, so stat_bin() falls back to its
# default number of bins.
ggplot(data = Batting, mapping = aes(x = RBI)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).

# Same RBI histogram, zoomed on the y axis to counts between 50 and 100.
# coord_cartesian() changes only the visible window; binning still uses
# every row.
# NOTE(review): a window that starts at 50 hides bins holding fewer than 50
# rows -- confirm this is the intended range for spotting unusual values.
ggplot(data = Batting, mapping = aes(x = RBI)) +
  geom_histogram() +
  coord_cartesian(ylim = c(50, 100))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 756 rows containing non-finite values (stat_bin).

Missing Values

Covariation

A categorical and continuous variable

# Box plots of HR across AB.
# AB is continuous, so a bare geom_boxplot() collapses every row into a
# single box and warns "Continuous x aesthetic -- did you forget
# aes(group=...)?". cut_width() bins AB into 100-wide intervals so each
# interval gets its own box; adjust the width to taste.
Batting %>%
  ggplot(aes(x = AB, y = HR)) +
  geom_boxplot(aes(group = cut_width(AB, 100)))

Two categorical variables

# Heat map: tile fill shows how many rows share each (AB, X3B) pair.
ggplot(
  data = count(Batting, AB, X3B),
  mapping = aes(x = AB, y = X3B, fill = n)
) +
  geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package for hexagonal binning.
library(hexbin)

# Hexagonal 2-d bin counts of HR against AB.
ggplot(data = Batting, mapping = aes(x = AB, y = HR)) +
  geom_hex()

# Box plots of RBI across AB, for rows with HR below 10.
# The original layer was geom_boxplot(aes(HR, 20)): aes() matches its first
# two arguments to x and y, so that replaced x with HR and y with the
# constant 20 instead of plotting RBI. Grouping on cut_width(AB, 50) keeps
# the x/y mapping from ggplot() and draws one box per 50-wide AB interval.
Batting %>%
  filter(HR < 10) %>%
  ggplot(aes(x = AB, y = RBI)) +
  geom_boxplot(aes(group = cut_width(AB, 50)))

Patterns and models