Import Data

# Show the working directory so the relative path used below can be checked.
getwd()
## [1] "C:/Users/james/OneDrive/Desktop/PSU_DAT3000_IntroToDA/04_module7"
# Read the species workbook into `data`; every later chunk starts from it.
data <- read_excel(path = "../00_data/NationoalParkSpecies1.xlsx")
# Print the tibble preview.
print(data)
## # A tibble: 1,709 × 28
##    ParkCode ParkName         CategoryName Order Family TaxonRecordStatus SciName
##    <chr>    <chr>            <chr>        <chr> <chr>  <chr>             <chr>  
##  1 ACAD     Acadia National… Mammal       Arti… Cervi… Active            Alces …
##  2 ACAD     Acadia National… Mammal       Arti… Cervi… Active            Odocoi…
##  3 ACAD     Acadia National… Mammal       Carn… Canid… Active            Canis …
##  4 ACAD     Acadia National… Mammal       Carn… Canid… Active            Canis …
##  5 ACAD     Acadia National… Mammal       Carn… Canid… Active            Vulpes…
##  6 ACAD     Acadia National… Mammal       Carn… Felid… Active            Lynx c…
##  7 ACAD     Acadia National… Mammal       Carn… Felid… Active            Lynx r…
##  8 ACAD     Acadia National… Mammal       Carn… Mephi… Active            Mephit…
##  9 ACAD     Acadia National… Mammal       Carn… Muste… Active            Lutra …
## 10 ACAD     Acadia National… Mammal       Carn… Muste… Active            Martes…
## # ℹ 1,699 more rows
## # ℹ 21 more variables: CommonNames <chr>, Synonyms <lgl>, ParkAccepted <lgl>,
## #   Sensitive <lgl>, RecordStatus <chr>, Occurrence <chr>,
## #   OccurrenceTags <chr>, Nativeness <chr>, NativenessTags <chr>,
## #   Abundance <chr>, NPSTags <chr>, ParkTags <chr>, References <dbl>,
## #   Observations <dbl>, Vouchers <dbl>, ExternalLinks <chr>, TEStatus <chr>,
## #   StateStatus <chr>, OzoneSensitiveStatus <chr>, GRank <chr>, SRank <chr>

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows for each CategoryName value.
ggplot(data, aes(x = CategoryName)) +
  geom_bar()

# Bar chart: number of rows sharing each References value.
ggplot(data = data, mapping = aes(x = References)) +
  geom_bar()

# Keep rows with more than two references, then draw a dot plot of their
# Abundance values.
ggplot(filter(data, References > 2), aes(x = Abundance)) +
  geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

# Bar chart of CommonNames; Order is mapped to the bar outline colour.
ggplot(data, aes(x = CommonNames, colour = Order)) +
  geom_bar()

Typical values

# Keep only rows with more than one observation, then count rows per Family.
data %>%
  filter(Observations > 1) %>%
  ggplot(mapping = aes(x = Family)) +
  geom_bar()

# Distribution of Old Faithful eruption durations.
# `eruptions` is continuous, so it is binned with a histogram; geom_bar()
# would draw one very thin bar per distinct value and hide the overall shape.
faithful %>%
    
    ggplot(aes(x = eruptions)) + 
    geom_histogram(binwidth = 0.25)

Unusual values

# Observations plotted against CategoryName, one point per row, so that
# unusually large values stand out.
ggplot(data, aes(x = CategoryName, y = Observations)) +
  geom_point()

# CommonNames plotted against Order, with the view zoomed to y positions
# 0 through 50. coord_cartesian() zooms without dropping any rows.
ggplot(data, aes(x = Order, y = CommonNames)) +
  geom_point() +
  coord_cartesian(ylim = c(0, 50))

Missing Values

# Add a column `y` that copies Abundance but is NA where the comparison below
# is TRUE, then plot Abundance against Observations.
data %>%
    
    #filter(y < 3 | y > 20) %>%
    
    # NOTE(review): Abundance is listed as <chr> in the tibble printed at the
    # top of this file, so comparing it with 3 and 20 is not a numeric range
    # check -- confirm which numeric column was intended here.
    # NOTE(review): the new column `y` is never mapped in the plot below (it
    # uses Abundance directly), so this step does not change the figure.
    mutate(y = ifelse(Abundance < 3 | Abundance > 20, NA, Abundance)) %>%
               
    # Plot
        ggplot(aes(x = Observations, y = Abundance)) +
        geom_point()

Covariation

A categorical and continuous variable

# Boxplots of References (x axis) for each Occurrence value (y axis).
ggplot(data, aes(x = References, y = Occurrence)) +
  geom_boxplot()

Two categorical variables

# Tally rows for every Occurrence/Nativeness combination and show the
# counts (`n`) as the fill of a tile grid.
ggplot(
  count(data, Occurrence, Nativeness),
  aes(x = Nativeness, y = Occurrence, fill = n)
) +
  geom_tile()

### Two continuous variables

# Scatter plot of the two numeric count columns.
# Occurrence is a categorical (<chr>) column, so it does not fit a comparison
# of two continuous variables; References and Observations are both numeric
# and are the pair modelled further down in this file.
data %>%
    
    # Drop rows missing either value so ggplot2 does not warn about them
    filter(!is.na(Observations), !is.na(References)) %>%
    
    ggplot(aes(x = Observations, y = References)) +
    geom_point()

## Patterns and models

# Keep rows where both counts are present and strictly positive, since the
# model below takes the log of each.
data_clean <- filter(
  data,
  !is.na(References),
  !is.na(Observations),
  References > 0,
  Observations > 0
)

# Log-log linear model of References on Observations.
mod <- lm(log(References) ~ log(Observations), data = data_clean)

# Attach the model residuals, then exponentiate them to undo the log scale.
data4 <- modelr::add_residuals(data_clean, mod)
data4 <- mutate(data4, resid = exp(resid))

# Back-transformed residuals plotted against References.
ggplot(data4, aes(x = References, y = resid)) +
  geom_point()

# Distribution of residuals across the range of Observations.
# Observations is continuous, so geom_boxplot() needs an explicit grouping;
# without one ggplot2 warns "Continuous x aesthetic" and draws a single box.
# cut_interval() splits the observed range into 10 equal-width bins.
data4 %>%
    ggplot(aes(Observations, resid)) + 
    geom_boxplot(aes(group = cut_interval(Observations, n = 10)))