Import Data

# Read the Excel workbook (path is relative to the working directory) into
# `MyData`, the data frame used by every plot and model in this analysis.
MyData <- read_excel(path = "MyData.xlsx")

Introduction

Questions

# 1) What type of variation occurs within my variables?

# 2) What type of covariation occurs between my variables? 

Variation

Visualizing distributions

# Bar chart: one bar per fiscal_year value, bar height = number of rows.
ggplot(data = MyData, mapping = aes(x = fiscal_year)) +
    geom_bar()

# Histogram of fiscal_year with bins half a unit wide.
ggplot(data = MyData, mapping = aes(x = fiscal_year)) +
    geom_histogram(binwidth = 0.5)

# Histogram of encounter_count for the rows where it is below 30,
# using bins of width 0.5.
ggplot(
    data = filter(MyData, encounter_count < 30),
    mapping = aes(x = encounter_count)
) +
    geom_histogram(binwidth = 0.5)

# Frequency polygon of row counts across fiscal_year, with one coloured line
# per area_of_responsibility.
#
# The previous mapping used `land_border_region` as the *name* of an
# aesthetic. That is a column name, not a ggplot2 aesthetic, so the groups
# were never distinguished by any visual property. Mapping the column to
# `colour` makes each group visible and adds a legend.
# NOTE(review): the column mapped here is the one the original code referenced
# (area_of_responsibility); swap in land_border_region if the coarser
# grouping was what was intended.
MyData %>%
    ggplot(aes(x = fiscal_year, colour = area_of_responsibility)) +
    geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Keep only the rows whose encounter_count is below 400, then draw a
# fine-grained histogram (bin width 0.25) of the remaining values.
ggplot(
    data = filter(MyData, encounter_count < 400),
    mapping = aes(x = encounter_count)
) +
    geom_histogram(binwidth = 0.25)

# Same view with wider bins (0.75). The rows are limited to
# encounter_count < 400 because the data points were hard to see
# without the filter.
ggplot(
    data = filter(MyData, encounter_count < 400),
    mapping = aes(x = encounter_count)
) +
    geom_histogram(binwidth = 0.75)

Unusual values

# Histogram of every encounter_count value, using the default binning.
ggplot(data = MyData, mapping = aes(x = encounter_count)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, zoomed in on the y-axis (0 to 15) so that sparsely populated
# bins become visible. coord_cartesian() zooms without discarding any rows.
ggplot(data = MyData, mapping = aes(x = encounter_count)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

MyData %>%

    # Replace encounter counts outside the 3-20 range with NA rather than
    # dropping the whole row, so the other columns of those rows are kept.
    # The earlier condition (`> 3 | > 20`) reduced to `> 3`; the lower bound
    # is now tested with `<` so that both thresholds take effect.
    # NOTE(review): confirm that 3 and 20 are the intended cut-offs.
    mutate(
        encounter_rev = ifelse(
            encounter_count < 3 | encounter_count > 20,
            NA,
            encounter_count
        )
    ) %>%

    # Plot the revised column (the earlier version plotted the untouched
    # encounter_count, so the recoding had no visible effect).
    # na.rm = TRUE drops the NA rows without a warning.
    ggplot(aes(x = fiscal_year, y = encounter_rev)) +
    geom_point(na.rm = TRUE)

Covariation

A categorical and continuous variable

# Horizontal boxplots: distribution of encounter_count (x) for each
# land_border_region (y).
ggplot(
    data = MyData,
    mapping = aes(x = encounter_count, y = land_border_region)
) +
    geom_boxplot()

Two categorical variables

# Count the rows for every land_border_region / demographic combination and
# show the counts (column `n`) as the fill colour of a tile grid.
MyData %>%
    count(land_border_region, demographic) %>%
    ggplot(mapping = aes(x = demographic, y = land_border_region)) +
    geom_tile(mapping = aes(fill = n))

Two continuous variables

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.4.3
# Hexagonal 2-D binning of encounter_count (x) against fiscal_year (y);
# geom_hex() relies on the hexbin package.
ggplot(
    data = MyData,
    mapping = aes(x = encounter_count, y = fiscal_year)
) +
    geom_hex()

# For rows with encounter_count above 30, cut encounter_count into intervals
# of width 0.1 and draw one boxplot of fiscal_year per interval.
# NOTE(review): a width of 0.1 yields a separate box for nearly every distinct
# value if encounter_count holds whole numbers - confirm the intended width.
MyData %>%
    filter(encounter_count > 30) %>%
    ggplot(
        mapping = aes(
            x = encounter_count,
            y = fiscal_year,
            group = cut_width(encounter_count, 0.1)
        )
    ) +
    geom_boxplot()

Patterns and models

library(modelr)
## Warning: package 'modelr' was built under R version 4.4.3
# Linear model: encounter_count regressed on the natural log of fiscal_year.
# The response is kept on its original (untransformed) scale.
Mod <- lm(formula = encounter_count ~ log(fiscal_year), data = MyData)
# Print the call and the fitted coefficients.
Mod
## 
## Call:
## lm(formula = encounter_count ~ log(fiscal_year), data = MyData)
## 
## Coefficients:
##      (Intercept)  log(fiscal_year)  
##          -419636             55151
# Attach the residuals of `Mod` to every row as a new `resid` column.
#
# `Mod` models encounter_count on its original scale (the response is not
# log-transformed), so the residuals are already in encounter-count units.
# Exponentiating them - the back-transformation that only applies to a
# log-scale response - overflows to Inf for large positive residuals and
# squashes negative ones towards 0, so the residuals are left untransformed.
MyData4 <- MyData %>%
    modelr::add_residuals(Mod)

MyData4
## # A tibble: 68,815 × 13
##    fiscal_year month_grouping month_abbv component            land_border_region
##          <dbl> <chr>          <chr>      <chr>                <chr>             
##  1        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  2        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  3        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  4        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  5        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  6        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  7        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  8        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
##  9        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
## 10        2020 FYTD           APR        Office of Field Ope… Northern Land Bor…
## # ℹ 68,805 more rows
## # ℹ 8 more variables: area_of_responsibility <chr>, aor_abbv <chr>,
## #   demographic <chr>, citizenship <chr>, title_of_authority <chr>,
## #   encounter_type <chr>, encounter_count <dbl>, resid <dbl>
# Scatter plot of the `resid` column against fiscal_year.
ggplot(data = MyData4, mapping = aes(x = fiscal_year, y = resid)) +
    geom_point()

# Boxplots of the `resid` column, one box per land_border_region.
ggplot(data = MyData4, mapping = aes(x = land_border_region, y = resid)) +
    geom_boxplot()
## Warning: Removed 2799 rows containing non-finite outside the scale range
## (`stat_boxplot()`).