#1.1 Load Packages and Raw Data
library(tidyverse) # data wrangling and visualization (dplyr, ggplot2, tidyr, readr)
library(knitr) # kable() for table display
library(kableExtra) # kable_styling(), row_spec() for enhanced tables
library(broom) # tidy() and augment() for clean model output
library(corrplot) # corrplot() for correlation matrix visualization
library(car) # vif() for multicollinearity diagnostics
# Load the raw CSV downloaded from Health Data NY
# Source: https://health.data.ny.gov
# Dataset: Deer Tick Surveillance: Adults (Oct to Dec) Powassan Virus Only:
# Beginning 2009
# Last updated: May 2, 2025
tick_raw <- read_csv(
"Deer_Tick.csv",
show_col_types = FALSE
)
# Report starting sample size (all rows in the raw file, all years)
tibble(
Step = "Raw dataset (all years, all counties)",
N = nrow(tick_raw)
) %>%
kable(caption = "Table S1. Starting Sample Size") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| Step | N |
|---|---|
| Raw dataset (all years, all counties) | 666 |
# Step 1: Rename columns to clean, consistent names
tick_clean <- tick_raw %>%
rename(
year = Year,
county = County,
sites_visited = `Total Sites Visited`,
ticks_collected = `Total Ticks Collected`,
tick_density = `Tick Population Density`,
ticks_tested = `Total Ticks Tested`,
pools_tested = `Pools Tested`,
pools_positive = `Pools Positive`,
mir_raw = `Minimum Infection Rate`
)
# Step 2: Recode MIR from percentage string to numeric
# In the raw file, MIR is stored as a string like "1.1%".
# I stripped the % sign and converted to numeric so it can serve as a
# continuous outcome variable in linear regression.
# County-years with no pools tested have blank MIR and become NA.
tick_clean <- tick_clean %>%
mutate(
mir = as.numeric(gsub("%", "", mir_raw)) # e.g., "1.1%" -> 1.1
)
# Step 3: Filter to study period 2014-2024
# Restrict to the most recent 11 years of surveillance
# to focus on up-to-date tick ecology and Powassan virus dynamics.
n_raw <- nrow(tick_clean)
tick_study <- tick_clean %>%
filter(year >= 2014 & year <= 2024)
n_after_years <- nrow(tick_study)
n_dropped_years <- n_raw - n_after_years
# Step 4: Exclude rows with zero pools tested
# When pools_tested == 0 or is NA, no ticks were tested for Powassan virus.
# MIR cannot be calculated. This is structural missingness and not random
# as it reflects no surveillance activity, not a missing result.
# These rows are excluded from the primary analysis per the proposal.
n_before_pools <- nrow(tick_study)
tick_study <- tick_study %>%
filter(!is.na(pools_tested) & pools_tested > 0)
n_after_pools <- nrow(tick_study)
n_dropped_pools <- n_before_pools - n_after_pools
# Step 5: Exclude rows with missing tick_density (primary exposure)
# A small number of rows may lack density data and cannot contribute
# to the regression model.
n_before_density <- nrow(tick_study)
tick_study <- tick_study %>%
filter(!is.na(tick_density))
n_dropped_density <- n_before_density - nrow(tick_study)
# Report exclusion flowchart
tibble(
Step = c(
"Raw dataset (all years)",
"Exclude years outside 2014-2024",
"Exclude county-years with zero or missing pools tested",
"Exclude county-years with missing tick population density",
"Final analytical sample"
),
`Observations Dropped` = c(
"-",
paste0("-", n_dropped_years),
paste0("-", n_dropped_pools),
paste0("-", n_dropped_density),
"-"
),
`N Remaining` = c(
n_raw,
n_after_years,
n_after_pools,
nrow(tick_study),
nrow(tick_study)
)
) %>%
kable(caption = "Table 1. Analytical Sample Flowchart") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE) %>%
row_spec(5, bold = TRUE, background = "#d4edda")
| Step | Observations Dropped | N Remaining |
|---|---|---|
| Raw dataset (all years) |
|
666 |
| Exclude years outside 2014-2024 | -140 | 526 |
| Exclude county-years with zero or missing pools tested | -130 | 396 |
| Exclude county-years with missing tick population density | -0 | 396 |
| Final analytical sample |
|
396 |
Note: This dataset does not use a complex sampling design. It is an annual ecological surveillance dataset collected to the county-year level by the NYSDOH. No survey weights are needed.
# Create geographic region variable
# Counties are grouped into three ecologically regions based on
# known tick habitat, deer population density, and Powassan/Lyme degree of being an endemic.
# Reference level = Northern / Western NY (lowest expected Powassan burden).
southeast_counties <- c(
"Suffolk", "Nassau", "Westchester", "Rockland", "Orange", "Putnam",
"Dutchess", "Columbia", "Ulster", "Sullivan", "Greene", "Albany",
"Rensselaer", "Saratoga", "Schenectady", "Montgomery"
)
central_counties <- c(
"Otsego", "Delaware", "Schoharie", "Chenango", "Broome", "Tioga",
"Tompkins", "Cortland", "Onondaga", "Madison", "Oneida", "Herkimer",
"Schuyler", "Chemung", "Steuben", "Yates", "Ontario", "Seneca",
"Cayuga", "Wayne"
)
tick_study <- tick_study %>%
mutate(
# Geographic region (3-level categorical covariate)
region = case_when(
county %in% southeast_counties ~ "Long Island / Hudson Valley / SE",
county %in% central_counties ~ "Central NY",
TRUE ~ "Northern / Western NY"
),
# Set reference to Northern / Western NY
region = factor(region,
levels = c("Northern / Western NY",
"Central NY",
"Long Island / Hudson Valley / SE")),
# Year as numeric (continuous, temporal)
year = as.numeric(year),
# Pools positive: standardize as numeric
pools_positive = as.numeric(pools_positive)
)
# Final complete-case dataset used in all downstream analyses
# Drop remaining rows where MIR is NA (pools tested but result not available).
tick_cc <- tick_study %>%
filter(!is.na(mir))
cat("Complete-case analytical dataset:", nrow(tick_cc), "county-year observations\n")
## Complete-case analytical dataset: 396 county-year observations
# Outcome: MIR (continuous)
# MIR = (positive pools / total ticks tested) x 100
# Expected to be right-skewed: many county-years have MIR = 0,
# with a long tail of higher values in endemic counties.
tick_cc %>%
summarise(
N = sum(!is.na(mir)),
`N Missing` = sum(is.na(mir)),
`% Missing` = round(mean(is.na(mir)) * 100, 1),
Mean = round(mean(mir, na.rm = TRUE), 3),
SD = round(sd(mir, na.rm = TRUE), 3),
Median = round(median(mir, na.rm = TRUE), 3),
Min = round(min(mir, na.rm = TRUE), 3),
Max = round(max(mir, na.rm = TRUE), 3)
) %>%
kable(caption = "Table 2a. Outcome Variable: Minimum Infection Rate (MIR, pct)") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| N | N Missing | % Missing | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|---|
| 396 | 0 | 0 | 0.413 | 1.299 | 0 | 0 | 14 |
# Primary exposure: tick population density (continuous)
# Average adult deer ticks per 1,000 meters sampled per county-year.
# Higher density = more ticks encountered = potentially higher transmission risk.
tick_cc %>%
summarise(
N = sum(!is.na(tick_density)),
`N Missing` = sum(is.na(tick_density)),
`% Missing` = round(mean(is.na(tick_density)) * 100, 1),
Mean = round(mean(tick_density, na.rm = TRUE), 2),
SD = round(sd(tick_density, na.rm = TRUE), 2),
Median = round(median(tick_density, na.rm = TRUE), 2),
Min = round(min(tick_density, na.rm = TRUE), 2),
Max = round(max(tick_density, na.rm = TRUE), 2)
) %>%
kable(caption = "Table 2b. Primary Exposure: Tick Population Density (ticks per 1,000 m)") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| N | N Missing | % Missing | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|---|
| 396 | 0 | 0 | 56.99 | 41.35 | 44.45 | 6.6 | 256.9 |
# Missing data summary for all key variables
tibble(
Variable = c(
"MIR - outcome",
"Tick Density - primary exposure",
"Sites Visited",
"Ticks Tested",
"Year",
"Region"
),
`N Missing` = c(
sum(is.na(tick_cc$mir)),
sum(is.na(tick_cc$tick_density)),
sum(is.na(tick_cc$sites_visited)),
sum(is.na(tick_cc$ticks_tested)),
sum(is.na(tick_cc$year)),
sum(is.na(tick_cc$region))
),
`% Missing` = round(c(
mean(is.na(tick_cc$mir)),
mean(is.na(tick_cc$tick_density)),
mean(is.na(tick_cc$sites_visited)),
mean(is.na(tick_cc$ticks_tested)),
mean(is.na(tick_cc$year)),
mean(is.na(tick_cc$region))
) * 100, 1),
`Mechanism / Handling` = c(
"Excluded above - complete case analysis",
"Excluded above - no remaining missing",
"Rare; complete case analysis",
"Rare; complete case analysis",
"None",
"None - all counties assigned to a region"
)
) %>%
kable(caption = "Table 2c. Missing Data Summary for Key Variables") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| Variable | N Missing | % Missing | Mechanism / Handling |
|---|---|---|---|
| MIR - outcome | 0 | 0 | Excluded above - complete case analysis |
| Tick Density - primary exposure | 0 | 0 | Excluded above - no remaining missing |
| Sites Visited | 0 | 0 | Rare; complete case analysis |
| Ticks Tested | 0 | 0 | Rare; complete case analysis |
| Year | 0 | 0 | None |
| Region | 0 | 0 | None - all counties assigned to a region |
# Table 1: Summary statistics for continuous variables.
# Mean (SD), Median, Min, Max - using summarise() and kable().
# Outcome highlighted in light blue, primary exposure highlighted in light orange.
tick_cc %>%
summarise(
across(
c(mir, tick_density, year, sites_visited, ticks_tested, pools_tested),
list(
Mean = ~ round(mean(., na.rm = TRUE), 2),
SD = ~ round(sd(., na.rm = TRUE), 2),
Median = ~ round(median(., na.rm = TRUE), 2),
Min = ~ round(min(., na.rm = TRUE), 2),
Max = ~ round(max(., na.rm = TRUE), 2)
),
.names = "{.col}__{.fn}"
)
) %>%
pivot_longer(everything(),
names_to = c("Variable", "Stat"),
names_sep = "__") %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(Variable = case_when(
Variable == "mir" ~ "MIR (%)",
Variable == "tick_density" ~ "Tick Population Density (ticks/1,000 m)",
Variable == "year" ~ "Year",
Variable == "sites_visited" ~ "Total Sites Visited",
Variable == "ticks_tested" ~ "Ticks Tested",
Variable == "pools_tested" ~ "Pools Tested",
TRUE ~ Variable
)) %>%
kable(
caption = paste0("Table 3. Descriptive Statistics - Analytic Sample (n = ",
nrow(tick_cc), ")"),
col.names = c("Variable", "Mean", "SD", "Median", "Min", "Max")
) %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#d0e8f5") %>% # outcome row
row_spec(2, bold = TRUE, background = "#fde8c8") # exposure row
| Variable | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|
| MIR (%) | 0.41 | 1.30 | 0.00 | 0.0 | 14.0 |
| Tick Population Density (ticks/1,000 m) | 56.99 | 41.35 | 44.45 | 6.6 | 256.9 |
| Year | 2019.32 | 3.06 | 2019.50 | 2014.0 | 2024.0 |
| Total Sites Visited | 3.10 | 2.69 | 2.50 | 1.0 | 28.0 |
| Ticks Tested | 119.31 | 175.65 | 59.50 | 1.0 | 2110.0 |
| Pools Tested | 18.08 | 27.49 | 10.00 | 1.0 | 248.0 |
# Table 1b: Geographic region distribution (categorical covariate)
tick_cc %>%
count(region) %>%
mutate(`%` = round(n / sum(n) * 100, 1)) %>%
kable(
caption = "Table 3b. Geographic Region Distribution",
col.names = c("Region", "N", "%")
) %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| Region | N | % |
|---|---|---|
| Northern / Western NY | 138 | 34.8 |
| Central NY | 104 | 26.3 |
| Long Island / Hudson Valley / SE | 154 | 38.9 |
# Histogram of MIR
# Examines the shape of MIR before regression.
# If heavily right-skewed, a log transformation may be needed to satisfy
# the normality of residuals assumption for linear regression.
ggplot(tick_cc, aes(x = mir)) +
geom_histogram(binwidth = 0.1, fill = "steelblue",
color = "white", alpha = 0.85) +
labs(
title = "Figure 1. Distribution of Minimum Infection Rate (MIR)",
subtitle = paste0("NYS Adult Deer Tick Surveillance, 2014-2024 (n = ",
nrow(tick_cc), ")"),
x = "Minimum Infection Rate (MIR, %)",
y = "Count (County-Year Observations)"
) +
theme_minimal(base_size = 13)
Figure 1. Distribution of MIR for Powassan Virus in Adult Deer Ticks, NYS Counties 2014-2024.
# Q-Q plots to assess normality
# Using qqnorm() and qqline().
# Points should fall on the red line if data are normally distributed.
# Deviations in the tails would indicate skewness.
par(mfrow = c(1, 2))
qqnorm(tick_cc$mir, main = "Q-Q Plot: MIR (raw)")
qqline(tick_cc$mir, col = "red")
qqnorm(log(tick_cc$mir + 0.01), main = "Q-Q Plot: log(MIR + 0.01)")
qqline(log(tick_cc$mir + 0.01), col = "red")
Figure 1b. Q-Q Plots of Raw and Log-Transformed MIR.
par(mfrow = c(1, 1))
Interpretation (Figure 1): MIR is right-skewed with a large concentration of values near zero; most county-years had very low or zero Powassan infection rates, with a long right tail in endemic counties. This is consistent with how rare is Powassan virus presence in tick pools. The Q-Q plot confirms drift from normality in the right tail. The log-transformed version (log(MIR + 0.01)) is more symmetric. This suggests that a transformation may better satisfy the normality of residuals assumption for linear regression. Both raw and log-transformed MIR will be considered in the multivariable model.
# Scatterplot: primary exposure vs. outcome
# geom_smooth(method = "lm") adds the OLS fitted line with 95% CI band.
# The orange LOESS smoother checks whether the relationship is truly linear.
ggplot(tick_cc, aes(x = tick_density, y = mir)) +
geom_point(alpha = 0.35, color = "steelblue", size = 1.8) +
geom_smooth(method = "lm", se = TRUE,
color = "red", linewidth = 1.2, fill = "pink") +
geom_smooth(method = "loess", se = FALSE,
color = "orange", linewidth = 1, linetype = "dashed") +
labs(
title = "Figure 2. Tick Population Density vs. Minimum Infection Rate (MIR)",
subtitle = "Red = linear fit (95% CI) | Orange dashed = LOESS smoother",
x = "Tick Population Density (ticks per 1,000 m sampled)",
y = "Minimum Infection Rate (MIR, %)"
) +
theme_minimal(base_size = 13)
Figure 2. Tick Population Density vs. MIR with Fitted Regression Line, NYS Counties 2014-2024.
# Pearson correlation: tick density and MIR
# Unadjusted measure of linear association.
# Using cor.test() and tidy().
cor_density_mir <- cor.test(tick_cc$tick_density, tick_cc$mir)
tidy(cor_density_mir) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
kable(
digits = 3,
col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
caption = "Table 4. Pearson Correlation: Tick Density and MIR"
) %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.062 | 1.229 | 0.22 | -0.037 | 0.159 |
Interpretation (Figure 2): The scatterplot shows a positive relationship between tick population density and MIR. Counties with higher tick abundance tend to have slightly higher Powassan infection rates. The LOESS smoother tracks linearly through the data, which supports the use of linear regression. The Pearson correlation (r = 0.062, p = 0.22) shows a weak positive association in the expected direction. However, it does not reach statistical significance(p > 0.05), indicating that the non-adjusted bivariate relationship between tick density and MIR is modest. This underscores the importance of the multivariable model, where adjustment for region and other covariates may add clarification on the relationship.
# -- Boxplots by region --------------------------------------------------------
# Assesses whether MIR differs meaningfully across the three ecological regions,
# which justifies region as a covariate in the multivariable model.
# geom_boxplot() and geom_jitter() were used here.
ggplot(tick_cc, aes(x = region, y = mir, fill = region)) +
geom_boxplot(alpha = 0.7, outlier.color = "gray40", outlier.size = 1.5) +
geom_jitter(width = 0.15, alpha = 0.2, size = 1.2, color = "gray30") +
scale_fill_manual(values = c("steelblue", "coral", "forestgreen")) +
labs(
title = "Figure 3. MIR by Geographic Region",
subtitle = "NYS Adult Deer Tick Surveillance, 2014-2024",
x = "Geographic Region",
y = "Minimum Infection Rate (MIR, %)"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
Figure 3. Boxplots of MIR by Geographic Region, NYS Counties 2014-2024.
# One-way ANOVA: does mean MIR differ by region?
# H0: mean MIR is equal across all three regions.
# H1: at least one region's mean MIR differs.
# Using aov() and TukeyHSD().
anova_region <- aov(mir ~ region, data = tick_cc)
tidy(anova_region) %>%
kable(
digits = 3,
caption = "Table 5. One-Way ANOVA: MIR by Geographic Region",
col.names = c("Term", "df", "Sum Sq", "Mean Sq", "F statistic", "p-value")
) %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| Term | df | Sum Sq | Mean Sq | F statistic | p-value |
|---|---|---|---|---|---|
| region | 2 | 72.014 | 36.007 | 23.792 | 0 |
| Residuals | 393 | 594.764 | 1.513 | NA | NA |
# Tukey HSD: which pairs of regions differ significantly?
TukeyHSD(anova_region) %>%
tidy() %>%
kable(
digits = 3,
caption = "Table 5b. Tukey HSD Post-Hoc Comparisons by Region",
col.names = c("Term", "Comparison", "Null Value", "Estimate",
"Conf. Low", "Conf. High", "Adj. p-value")
) %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| Term | Comparison | Null Value | Estimate | Conf. Low | Conf. High | Adj. p-value |
|---|---|---|---|---|---|---|
| region | Central NY-Northern / Western NY | 0 | 0.024 | -0.352 | 0.400 | 0.988 |
| region | Long Island / Hudson Valley / SE-Northern / Western NY | 0 | 0.885 | 0.546 | 1.224 | 0.000 |
| region | Long Island / Hudson Valley / SE-Central NY | 0 | 0.861 | 0.494 | 1.228 | 0.000 |
Interpretation (Figure 3): MIR varies across the three geographic regions. The Long Island / Hudson Valley / Southeast region shows the highest median MIR and widest spread. This is consistent with its known status as a highly endemic zone for tick-borne disease in New York State (Aliota et al., 2014; Prusinski et al., 2019). The ANOVA F-test assesses whether these differences are statistically significant at alpha = 0.05.Tukey HSD post-hoc tests identify which specific region pairs drive the overall result.
# Correlation matrix using corrplot()
# Examines pairwise linear relationships among all continuous predictors
# to detect potential multicollinearity before building the regression model.
# Strongly correlated predictors (|r| > 0.8) can impact standard errors.
cor_matrix <- tick_cc %>%
select(
`MIR (%)` = mir,
`Tick Density` = tick_density,
`Sites Visited` = sites_visited,
`Ticks Tested` = ticks_tested,
`Pools Tested` = pools_tested,
`Year` = year
) %>%
cor(use = "complete.obs")
# Display correlation matrix as table
cor_matrix %>%
kable(digits = 3, caption = "Correlation Matrix - Continuous Variables") %>%
kable_styling(latex_options = c("striped", "hold_position"), full_width = FALSE)
| MIR (%) | Tick Density | Sites Visited | Ticks Tested | Pools Tested | Year | |
|---|---|---|---|---|---|---|
| MIR (%) | 1.000 | 0.062 | -0.004 | 0.088 | 0.115 | 0.021 |
| Tick Density | 0.062 | 1.000 | -0.198 | 0.225 | 0.207 | 0.064 |
| Sites Visited | -0.004 | -0.198 | 1.000 | 0.611 | 0.704 | -0.045 |
| Ticks Tested | 0.088 | 0.225 | 0.611 | 1.000 | 0.881 | 0.186 |
| Pools Tested | 0.115 | 0.207 | 0.704 | 0.881 | 1.000 | 0.113 |
| Year | 0.021 | 0.064 | -0.045 | 0.186 | 0.113 | 1.000 |
# Visualize with corrplot
corrplot(cor_matrix,
method = "circle",
type = "lower",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.7,
col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(200),
title = "Figure 4. Correlation Matrix: Key Continuous Variables",
mar = c(0, 0, 2, 0))
Figure 4. Correlation Matrix Among Key Continuous Variables.
Interpretation (Figure 4): The correlation matrix reveals the strength of pairwise linear relationships among all continuous variables. Ticks tested and pools tested are expected to be strongly correlated. If |r| > 0.8, one of these may need to be dropped from the multivariable model to avoid multicollinearity, which can be formally confirmed using vif() in the full model. The correlation between tick density and MIR further confirms the positive direction seen in Figure 2. Any significant correlation of year with MIR would indicate a temporal trend in Powassan infection rates over the 2014-2024 study period.