Import Data

# Load the workbook into `data`, then print it for a quick look at the
# columns and their types.
data <- "../00_data/myData.xlsx" %>%
    read_excel()
data
## # A tibble: 236 × 20
##    TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES     W     L WINPERCENT   R64
##     <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
##  1      1 Abil…   0.7       45   0.7       52     3     1     2      0.333     2
##  2      2 Akron  -0.9      179  -1.1      187     4     0     4      0         4
##  3      3 Alab…  -2.1      211  -2.9      220    10     5     5      0.5       5
##  4      4 Alba…  -0.4      147  -0.3      138     3     0     3      0         3
##  5      6 Amer…  -0.5      160  -0.4      150     3     0     3      0         3
##  6      8 Ariz…  -1.7      206  -2.5      216    28    17    11      0.607    11
##  7      9 Ariz…  -2        209  -1.9      206     5     1     4      0.2       4
##  8     10 Arka…   4.3       11   3.5       16    18    11     7      0.611     7
##  9     11 Arka…   0         76   0         78     1     0     1      0         1
## 10     12 Aubu…   0.6       53   1.4       30    11     7     4      0.636     4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>

Variation

# Bar chart: one bar per distinct TEAM value, bar height = number of rows.
data %>%
    ggplot(aes(x = TEAM)) +
    geom_bar()

Visualizing distributions

# Histogram of GAMES using ggplot2's default binning (no binwidth given).
data %>%
    ggplot(aes(x = GAMES)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Tabulate how many rows share each distinct GAMES value.
count(data, GAMES)
## # A tibble: 38 × 2
##    GAMES     n
##    <dbl> <int>
##  1     1    53
##  2     2    32
##  3     3    29
##  4     4    16
##  5     5    16
##  6     6    15
##  7     7     8
##  8     8     2
##  9     9     8
## 10    10     8
## # ℹ 28 more rows

Typical values

# Keep only the rows with PAKE above 3, then draw a fine-grained
# (binwidth = 0.01) histogram of the remaining PAKE values.
ggplot(data = filter(data, PAKE > 3), mapping = aes(x = PAKE)) +
    geom_histogram(binwidth = 0.01)

# Histogram of PAKE over all rows, using bins 0.25 wide.
ggplot(data = data, mapping = aes(x = PAKE)) +
    geom_histogram(binwidth = 0.25)

Unusual values

# Histogram of GAMES (default binning) with the y axis zoomed to 0-50 so
# sparsely populated bins are easier to see. coord_cartesian() only changes
# the visible window; the bins are still computed from all rows.
ggplot(data = data, mapping = aes(x = GAMES)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

A categorical and continuous variable

data %>%
    # PASE is continuous, so geom_boxplot() needs an explicit grouping;
    # without one ggplot2 draws a single box and warns "Continuous x
    # aesthetic". Bin PASE into intervals of width 1 and draw one boxplot of
    # GAMES per interval.
    ggplot(aes(x = PASE, y = GAMES)) +
    geom_boxplot(aes(group = cut_width(PASE, 1)))

Two categorical variables

# Count the rows for every observed (PASE, PAKE) pair and show the counts as
# a heat map: one tile per pair, filled according to its count `n`.
ggplot(data = count(data, PASE, PAKE), mapping = aes(x = PASE, y = PAKE)) +
    geom_tile(mapping = aes(fill = n))

Two continuous variables

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal-bin plot of GAMES against CHAMP; each hexagon is filled by the
# number of rows that fall inside it.
ggplot(data = data, mapping = aes(x = CHAMP, y = GAMES)) +
    geom_hex()

# Boxplots of GAMES across CHAMP: CHAMP is cut into intervals 0.1 wide and
# each interval gets its own box.
ggplot(data = data) +
    geom_boxplot(
        mapping = aes(
            x = CHAMP,
            y = GAMES,
            group = cut_width(CHAMP, 0.1)
        )
    )

Patterns and models