Introduction

# Read myData.csv into `data`. The source is a comma-separated file (not an
# Excel workbook), which is why read_csv() is used. The absolute path below is
# specific to one machine.
data <- read_csv(
  file = "C:/Users/ejp14/OneDrive/Desktop/PSU_DAT3000_IntroToDA/01_module4/Data/myData.csv"
)
## Rows: 81525 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): name, team, position
## dbl (21): game_year, game_week, rush_att, rush_yds, rush_avg, rush_tds, rush...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
## # A tibble: 81,525 × 24
##    name            team  game_year game_week rush_att rush_yds rush_avg rush_tds
##    <chr>           <chr>     <dbl>     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 Duce Staley     PHI        2000         1       26      201      7.7        1
##  2 Lamar Smith     MIA        2000         1       27      145      5.4        1
##  3 Tiki Barber     NYG        2000         1       13      144     11.1        2
##  4 Stephen Davis   WAS        2000         1       23      133      5.8        1
##  5 Edgerrin James  IND        2000         1       28      124      4.4        1
##  6 Priest Holmes   BAL        2000         1       27      119      4.4        0
##  7 Curtis Martin   NYJ        2000         1       30      110      3.7        1
##  8 Robert Smith    MIN        2000         1       14      109      7.8        0
##  9 Tim Biakabutuka CAR        2000         1       15       88      5.9        0
## 10 Cade McNown     CHI        2000         1       10       87      8.7        1
## # ℹ 81,515 more rows
## # ℹ 16 more variables: rush_fumbles <dbl>, rec <dbl>, rec_yds <dbl>,
## #   rec_avg <dbl>, rec_tds <dbl>, rec_fumbles <dbl>, pass_att <dbl>,
## #   pass_yds <dbl>, pass_tds <dbl>, int <dbl>, sck <dbl>, pass_fumbles <dbl>,
## #   rate <dbl>, position <chr>, total_yards <dbl>, `total tds` <dbl>

Questions

Variation

# Bar chart of how many rows fall under each value of `position`.
ggplot(data, aes(x = position)) +
  geom_bar()

Visualizing distributions

# Distribution of a categorical variable: geom_bar() tallies the rows for
# each `position` and draws one bar per value.
data %>%
  ggplot() +
  geom_bar(mapping = aes(x = position))

# Histogram of total_yards with bins that are 0.5 wide.
ggplot(data = data, mapping = aes(x = total_yards)) +
  geom_histogram(binwidth = 0.5)

# Same 0.5-wide histogram, but only for rows whose total_yards is below 2000.
ggplot(
  data = filter(data, total_yards < 2000),
  mapping = aes(x = total_yards)
) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of total_yards, drawn as one coloured line per position.
# No binwidth is supplied, so stat_bin() uses its default of 30 bins and
# prints a message saying so.
data %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = total_yards, color = position))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Keep only the rows with total_yards above 1000, then draw a histogram of
# the remaining values using very narrow (0.01-wide) bins.
data %>%
  filter(total_yards > 1000) %>%
  ggplot(mapping = aes(x = total_yards)) +
  geom_histogram(binwidth = 0.01)

# Histogram of the `total tds` column (back-ticks are required because the
# column name contains a space), using 0.25-wide bins.
ggplot(data, aes(x = `total tds`)) +
  geom_histogram(binwidth = 0.25)

Unusual values

# Histogram of total_yards with the default binning (30 bins).
ggplot(data, aes(x = total_yards)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same default histogram, with the visible y-axis limited to 0-50 so that
# bins holding only a few rows can be seen.
data %>%
  ggplot() +
  geom_histogram(mapping = aes(x = total_yards)) +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values

# Build total_yards_rev: keep total_yards when it lies in the 3-20 range and
# replace it with NA otherwise. The result is plotted against game_year;
# geom_point() drops the NA rows and warns about how many it removed.
data %>%
  mutate(
    total_yards_rev = ifelse(
      total_yards >= 3 & total_yards <= 20,
      total_yards,
      NA
    )
  ) %>%
  ggplot(mapping = aes(x = game_year, y = total_yards_rev)) +
  geom_point()
## Warning: Removed 59809 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

Note: this section was not covered in the CA.

A categorical and continuous variable

# Box plot of total_yards (continuous) split by position (categorical).
ggplot(data, aes(x = position, y = total_yards)) +
  geom_boxplot()

Two categorical variables

# Count the rows for every (game_year, total_yards) combination and draw one
# tile per combination, filled according to that count `n`.
# NOTE(review): both columns are numeric (dbl) in the import summary even
# though this section is about categorical variables - confirm the intended
# pair of columns.
data %>%
  count(game_year, total_yards) %>%
  ggplot() +
  geom_tile(mapping = aes(x = game_year, y = total_yards, fill = n))

Two continuous variables

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.5.2
# Hexagonal 2-D binning of two continuous variables: each hexagon is filled
# by the number of rows that fall inside it.
# The y aesthetic was previously `position`, a character column; geom_hex()
# bins a continuous plane, so a numeric column (rush_yds) is used instead.
# NOTE(review): rush_yds was picked as the second continuous variable - swap
# in another numeric column if a different relationship is of interest.
data %>%
  ggplot(aes(x = total_yards, y = rush_yds)) +
  geom_hex()

Patterns and models

# Scatter plot with total_yards on the x-axis and game_week on the y-axis.
ggplot(data, aes(x = total_yards, y = game_week)) +
  geom_point()