## Import data

# Read the Excel workbook into `data`; the path is relative to the directory
# this document is run from.
data <- readxl::read_excel(path = "../00_data/myData.xlsx")

# Auto-print the tibble to preview its columns and first rows.
data
## # A tibble: 805 × 16
##    decimalLatitude decimalLongitude eventDate      scientificName taxonConceptID
##              <dbl>            <dbl> <chr>          <chr>          <chr>         
##  1           -37.6             146. NA             Myrmecobius f… https://biodi…
##  2           -35.1             150. 2014-06-05T02… Myrmecobius f… https://biodi…
##  3           -35               118. NA             Myrmecobius f… https://biodi…
##  4           -34.7             118. NA             Myrmecobius f… https://biodi…
##  5           -34.6             117. NA             Myrmecobius f… https://biodi…
##  6           -34.6             117. NA             Myrmecobius f… https://biodi…
##  7           -34.6             118. NA             Myrmecobius f… https://biodi…
##  8           -34.6             117. NA             Myrmecobius f… https://biodi…
##  9           -34.6             117. NA             Myrmecobius f… https://biodi…
## 10           -34.6             117. NA             Myrmecobius f… https://biodi…
## # ℹ 795 more rows
## # ℹ 11 more variables: recordID <chr>, dataResourceName <chr>, year <chr>,
## #   month <chr>, wday <chr>, hour <chr>, day <chr>, dryandra <lgl>, prcp <chr>,
## #   tmax <chr>, tmin <chr>

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of records for each distinct value of `year`.
ggplot(data = data, mapping = aes(x = year)) +
  geom_bar()

Typical values

# Histogram of decimalLatitude restricted to records with latitude above -31,
# using bins 0.25 wide (in the units of decimalLatitude).
ggplot(
  data = filter(data, decimalLatitude > -31),
  mapping = aes(x = decimalLatitude)
) +
  geom_histogram(binwidth = 0.25)

Unusual values

# Histogram of every latitude with the default binning. coord_cartesian()
# zooms the y-axis to 0-50 without discarding data, so bins with few records
# remain visible next to the tall ones.
ggplot(data = data, mapping = aes(x = decimalLatitude)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).

Missing values

# Replace latitudes below -30 or above 20 with NA (instead of dropping the
# rows), then plot latitude against longitude. ggplot2 omits the NA points
# and reports how many were removed.
data %>%
  mutate(
    decimalLatitude = dplyr::if_else(
      decimalLatitude < -30 | decimalLatitude > 20,
      NA_real_,
      decimalLatitude
    )
  ) %>%
  ggplot(mapping = aes(x = decimalLatitude, y = decimalLongitude)) +
  geom_point()
## Warning: Removed 782 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Horizontal boxplots: distribution of latitude (x) within each data
# resource (y).
ggplot(
  data = data,
  mapping = aes(x = decimalLatitude, y = dataResourceName)
) +
  geom_boxplot()
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Two categorical variables

# Count records for each (data resource, scientific name) pair and draw the
# counts as a heat map; the tile fill encodes the count `n`.
count(data, dataResourceName, scientificName) %>%
  ggplot(mapping = aes(x = dataResourceName, y = scientificName, fill = n)) +
  geom_tile()

Two continuous variables

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.4.3
# Hexagonal binning of latitude against longitude: each hexagon's fill shows
# how many records fall inside it.
ggplot(
  data = data,
  mapping = aes(x = decimalLatitude, y = decimalLongitude)
) +
  geom_hex()
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_binhex()`).

Patterns and models

library(modelr)
# Model longitude as a linear function of latitude, so the residuals are the
# part of longitude that latitude does not explain. The predictor has to
# differ from the response: when the same term appears on both sides of the
# formula, lm() drops it from the right-hand side and silently fits an
# intercept-only model. No log transform is used because the latitudes in
# this data are negative, so log() would be undefined for them.
mod <- lm(decimalLongitude ~ decimalLatitude, data = data)

# add_residuals() appends a `resid` column to the data; rows with a missing
# coordinate get NA. The model is fitted on the original scale, so the
# residuals need no exp() back-transform.
data2 <- data %>%
  modelr::add_residuals(mod)


# Scatter of the model residuals for each data resource.
ggplot(data = data2, mapping = aes(x = dataResourceName, y = resid)) +
  geom_point()
## Warning: Removed 83 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Boxplots summarising the distribution of the model residuals within each
# data resource.
ggplot(data = data2, mapping = aes(x = dataResourceName, y = resid)) +
  geom_boxplot()
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).