Import Data

# Read the Excel workbook into a tibble. The message below shows that the
# first column has an empty header, so it is renamed to `...1` on import.
data <- read_excel(path = "../00_data/myData.xlsx")
## New names:
## • `` -> `...1`
# Preview the imported tibble.
data
## # A tibble: 4,810 × 24
##     ...1  rank position hand  player   years total…¹ status yr_st…² season   age
##    <dbl> <dbl> <chr>    <chr> <chr>    <chr>   <dbl> <chr>    <dbl> <chr>  <dbl>
##  1     1     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1978-…    18
##  2     2     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1978-…    18
##  3     3     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1978-…    18
##  4     4     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1979-…    19
##  5     5     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1980-…    20
##  6     6     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1981-…    21
##  7     7     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1982-…    22
##  8     8     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1983-…    23
##  9     9     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1984-…    24
## 10    10     1 C        Left  Wayne G… 1979…     894 Retir…    1979 1985-…    25
## # … with 4,800 more rows, 13 more variables: team <chr>, league <chr>,
## #   season_games <dbl>, goals <dbl>, assists <dbl>, points <dbl>,
## #   plus_minus <chr>, penalty_min <dbl>, goals_even <chr>,
## #   goals_power_play <chr>, goals_short_handed <chr>, goals_game_winner <chr>,
## #   headshot <chr>, and abbreviated variable names ¹​total_goals, ²​yr_start

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows observed at each value of `age`.
ggplot(data, aes(x = age)) +
    geom_bar()

Typical values

# Histogram of `age` restricted to rows with age below 41, in bins 0.5 wide.
ggplot(filter(data, age < 41), aes(x = age)) +
    geom_histogram(binwidth = 0.5)

Unusual values

# Histogram of `goals` using the default binning (30 bins).
ggplot(data, aes(x = goals)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same `goals` histogram, zoomed to counts between 0 and 400 so that
# sparsely populated bins become visible. coord_cartesian() only changes the
# visible range; it does not drop any data before binning.
ggplot(data, aes(x = goals)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 400))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

data %>%

    # Recode unusual ages (41 and over, i.e. everything outside the `age < 41`
    # range used under "Typical values") as NA instead of dropping the rows.
    # The result overwrites `age` so that the plot below actually uses it:
    # previously it was stored in a separate column `y` that was never
    # plotted, and the test `age < 41` would have blanked the typical values.
    mutate(age = ifelse(age >= 41, NA, age)) %>%

    # Rows whose age is now NA are removed by geom_point() with a warning.
    ggplot(aes(x = goals, y = age)) +
    geom_point()

Covariation

A categorical and continuous variable

# Horizontal boxplots: distribution of `goals` for each `position`.
ggplot(data, aes(x = goals, y = position)) +
    geom_boxplot()

Two categorical variables

# Count the rows for every position/hand combination, then draw one tile per
# combination, filled according to that count (`n`).
ggplot(count(data, position, hand), aes(x = position, y = hand, fill = n)) +
    geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package for its hexagonal binning.
library(hexbin)
# Hexagonal heatmap of `goals` against `assists`.
ggplot(data, aes(x = goals, y = assists)) +
    geom_hex()

Patterns and models

I could not get this section to work: R reported that there were NA values in my y variable, and filtering them out did not resolve the error.