Import data

# Load the data set from the sibling data directory into a tibble.
# The path used to contain a doubled separator ("00_data//Mydata.csv");
# it resolves to the same file but was a typo, so it is normalised here.
# The CSV's first column has no header, so readr names it `...1`.
Mydata <- read_csv("../00_data/Mydata.csv")
## New names:
## Rows: 65706 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): lake, species, comments, region dbl (4): ...1, year, grand_total, values
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Auto-print the tibble to inspect its dimensions, column types and first rows.
Mydata
## # A tibble: 65,706 × 8
##     ...1  year lake  species      grand_total comments region            values
##    <dbl> <dbl> <chr> <chr>              <dbl> <chr>    <chr>              <dbl>
##  1     1  1991 Erie  American Eel           1 <NA>     Michigan (MI)          0
##  2     2  1991 Erie  American Eel           1 <NA>     New York (NY)          0
##  3     3  1991 Erie  American Eel           1 <NA>     Ohio (OH)              0
##  4     4  1991 Erie  American Eel           1 <NA>     Pennsylvania (PA)      0
##  5     5  1991 Erie  American Eel           1 <NA>     U.S. Total             0
##  6     6  1991 Erie  American Eel           1 <NA>     Canada (ONT)           1
##  7     7  1992 Erie  American Eel           0 <NA>     Michigan (MI)          0
##  8     8  1992 Erie  American Eel           0 <NA>     New York (NY)          0
##  9     9  1992 Erie  American Eel           0 <NA>     Ohio (OH)              0
## 10    10  1992 Erie  American Eel           0 <NA>     Pennsylvania (PA)      0
## # ℹ 65,696 more rows

Introduction

Questions

Variation

# Bar chart: number of rows recorded for each lake.
Mydata %>%
  ggplot(aes(x = lake)) +
  geom_bar()

Visualizing distributions

# Histogram of `year` using the default binning (30 bins).
Mydata %>%
  ggplot(aes(x = year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of `year`, one coloured line per lake, so the
# per-lake distributions can be overlaid and compared.
Mydata %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = year, colour = lake))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Histogram of `year` with an explicit, narrow bin width.
Mydata %>%
  ggplot() +
  geom_histogram(mapping = aes(x = year), binwidth = 0.5)

Unusual values

# Same histogram, zoomed to counts of 0-200 so that rare years become visible.
# coord_cartesian() only zooms the view; it does not drop data from the bins.
ggplot(data = Mydata, mapping = aes(x = year)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 200))

Missing values

Mydata %>%

    # Recode years outside 1950-2000 as NA rather than dropping the rows,
    # so the remaining columns of those observations are kept.
    # (Alternative that removes the rows instead:
    #  filter(year >= 1950 & year <= 2000) %>%)
    #
    # The recoded values overwrite `year` in the piped copy only; `Mydata`
    # itself is not reassigned. Previously the result was stored in a new
    # column `y` that the plot never used, so the recoding had no effect.
    mutate(year = ifelse(year < 1950 | year > 2000, NA, year)) %>%

    # Plot: geom_point() removes rows with a missing x or y and warns about it
    ggplot(aes(x = year, y = grand_total)) +
    geom_point()
## Warning: Removed 31767 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Boxplot of `grand_total` (continuous) for each `lake` (categorical).
ggplot(data = Mydata, mapping = aes(x = lake, y = grand_total)) +
  geom_boxplot()
## Warning: Removed 31767 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Two categorical variables

# Heat map of how many rows exist for every species/lake combination:
# count() produces column `n`, which drives the tile fill.
ggplot(
  data = count(Mydata, species, lake),
  mapping = aes(x = species, y = lake, fill = n)
) +
  geom_tile()

Two continuous variables

# Scatterplot of grand_total against year; heavy transparency reduces
# overplotting where many points coincide.
Mydata %>%
  ggplot(aes(x = year, y = grand_total)) +
  geom_point(alpha = 1 / 100)
## Warning: Removed 31767 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Hexagonal 2-D binning of year vs. grand_total: fill shows the number of
# observations falling in each hexagon.
Mydata %>%
  ggplot(aes(x = year, y = grand_total)) +
  geom_hex()
## Warning: Removed 31767 rows containing non-finite outside the scale range
## (`stat_binhex()`).

Patterns and models

# Plain scatterplot of grand_total against year to look for a pattern
# between the two variables.
Mydata %>%
  ggplot(aes(x = year, y = grand_total)) +
  geom_point()
## Warning: Removed 31767 rows containing missing values or values outside the scale range
## (`geom_point()`).