Import Data
# Load the workbook that every later chunk reads from.
MyData <- read_excel(path = "MyData.xlsx")
Introduction
Questions
# 1) What type of variation occurs within my variables?
# 2) What type of covariation occurs between my variables?
Variation
Visualizing distributions
# Bar chart: one bar per fiscal_year value, bar height = number of rows.
ggplot(data = MyData, mapping = aes(x = fiscal_year)) +
  geom_bar()

# Histogram of fiscal_year using bins that are half a unit wide.
ggplot(data = MyData, mapping = aes(x = fiscal_year)) +
  geom_histogram(binwidth = 0.5)

# Histogram of encounter_count, restricted to rows with fewer than 30
# encounters, in half-unit bins.
ggplot(
  data = filter(MyData, encounter_count < 30),
  mapping = aes(x = encounter_count)
) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of fiscal_year, one coloured line per area of
# responsibility.
# The grouping variable has to be mapped to a real aesthetic (`colour`):
# `land_border_region` is a column name, not a ggplot2 aesthetic, so mapping
# to it produced no colour scale and no legend.
MyData %>%
  ggplot(aes(x = fiscal_year, colour = area_of_responsibility)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Typical values: keep only rows with fewer than 400 encounters and show
# their distribution in bins a quarter of a unit wide.
ggplot(
  data = filter(MyData, encounter_count < 400),
  mapping = aes(x = encounter_count)
) +
  geom_histogram(binwidth = 0.25)

# Same subset (fewer than 400 encounters) with wider, 0.75-unit bins.
# The subset is applied because the data points were hard to see on the
# unfiltered scale.
ggplot(
  data = filter(MyData, encounter_count < 400),
  mapping = aes(x = encounter_count)
) +
  geom_histogram(binwidth = 0.75)

Unusual values
# Histogram of every encounter_count value with ggplot2's default binning.
ggplot(data = MyData, mapping = aes(x = encounter_count)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same default-binned histogram, zoomed to bar heights between 0 and 15 so
# that sparsely populated bins become visible. coord_cartesian() only changes
# the viewport; no rows are removed before binning.
ggplot(data = MyData, mapping = aes(x = encounter_count)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 15))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
MyData %>%
  # Replace unusual counts with NA instead of dropping whole rows, so the
  # remaining columns of those rows stay available.
  # NOTE(review): the previous condition `> 3 | > 20` collapses to `> 3`;
  # the lower bound is assumed to have been meant as `< 3` -- confirm the
  # thresholds against the data.
  mutate(
    encounter_rev = ifelse(
      encounter_count < 3 | encounter_count > 20,
      NA,
      encounter_count
    )
  ) %>%
  # Plot the cleaned column (not the raw encounter_count); na.rm = TRUE drops
  # the NA rows without emitting a warning.
  ggplot(aes(x = fiscal_year, y = encounter_rev)) +
  geom_point(na.rm = TRUE)

Covariation
A categorical and continuous variable
# Horizontal boxplots: distribution of encounter_count within each
# land_border_region.
ggplot(
  data = MyData,
  mapping = aes(x = encounter_count, y = land_border_region)
) +
  geom_boxplot()

Two categorical variables
# Tally rows for every land_border_region / demographic pair, then draw the
# tallies (column `n` produced by count()) as tile fill.
ggplot(
  data = count(MyData, land_border_region, demographic),
  mapping = aes(x = demographic, y = land_border_region, fill = n)
) +
  geom_tile()

Two continuous variables
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.4.3
# Hexagonal 2-D binning of encounter_count against fiscal_year.
ggplot(
  data = MyData,
  mapping = aes(x = encounter_count, y = fiscal_year)
) +
  geom_hex()

# Boxplots of fiscal_year for rows with more than 30 encounters, with
# encounter_count cut into 0.1-wide intervals that define the groups.
ggplot(
  data = filter(MyData, encounter_count > 30),
  mapping = aes(x = encounter_count, y = fiscal_year)
) +
  geom_boxplot(mapping = aes(group = cut_width(encounter_count, width = 0.1)))

Patterns and models
library(modelr)
## Warning: package 'modelr' was built under R version 4.4.3
# Linear model: encounter_count (original scale) regressed on the natural
# log of fiscal_year. The fitted object is printed right after fitting.
Mod <- lm(
  formula = encounter_count ~ log(fiscal_year),
  data = MyData
)
Mod
##
## Call:
## lm(formula = encounter_count ~ log(fiscal_year), data = MyData)
##
## Coefficients:
## (Intercept) log(fiscal_year)
## -419636 55151
# Attach the residuals of `Mod` to every row as column `resid`.
# `Mod` is fitted to encounter_count on its original scale (the response is
# not log-transformed), so the residuals are already in encounter-count units
# and must not be back-transformed with exp(). Exponentiating residuals of
# this size overflows to Inf, which ggplot2 then drops as non-finite.
MyData4 <- MyData %>%
  modelr::add_residuals(Mod)
MyData4
## # A tibble: 68,815 × 13
## fiscal_year month_grouping month_abbv component land_border_region
## <dbl> <chr> <chr> <chr> <chr>
## 1 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 2 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 3 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 4 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 5 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 6 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 7 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 8 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 9 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## 10 2020 FYTD APR Office of Field Ope… Northern Land Bor…
## # ℹ 68,805 more rows
## # ℹ 8 more variables: area_of_responsibility <chr>, aor_abbv <chr>,
## # demographic <chr>, citizenship <chr>, title_of_authority <chr>,
## # encounter_type <chr>, encounter_count <dbl>, resid <dbl>
# Scatter plot of the model residuals against fiscal_year.
ggplot(data = MyData4, mapping = aes(x = fiscal_year, y = resid)) +
  geom_point()

# Boxplots of the model residuals within each land_border_region.
ggplot(data = MyData4, mapping = aes(x = land_border_region, y = resid)) +
  geom_boxplot()
## Warning: Removed 2799 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
