Load Libraries

library(dplyr)
library(tidyverse)
library(janitor)
library(htmltools)

packageVersion("htmltools")
## [1] '0.5.8.1'

Step 1: Load Input Data

Load all four relational CSV files and apply clean_names() to standardize column names to snake_case.

# DATA table: GEOGRAPHYID, GEOLOCID, INDICATORID, DATAYEAR, DATA_VALUE
data_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Data.csv") %>%
  clean_names()

# GEOGRAPHIES table: GEOGRAPHYID, GEOLOCID, GEOGRAPHY_DISPLAY_LABEL
geos_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Geographies.csv") %>%
  clean_names()

# GEOGRAPHYTYPES table: GEOGRAPHYID, ABBREV, GEOGRAPHY_NAME
geotypes_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Geography_Types.csv") %>%
  clean_names()

# INDICATORS table: INDICATORID, SOURCE_NAME, SOURCE_ABBREV, DISPLAY_LABEL, HAS_CPRO_VIZ
indicators_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Indicators.csv") %>%
  clean_names()

Step 1B: Make column names consistent

geotypes_raw <- geotypes_raw %>%
  rename(geographyid = id)

Step 2: Enforce Column Types

Ensure key ID columns are numeric across all tables to support reliable joins.

# DATA
data_raw <- data_raw %>%
  mutate(
    geographyid  = as.numeric(geographyid),
    geolocid     = as.numeric(geolocid),
    indicatorid  = as.numeric(indicatorid),
    datayear     = as.numeric(datayear),
    data_value   = as.numeric(data_value)
  )

# GEOGRAPHIES
geos_raw <- geos_raw %>%
  mutate(
    geographyid = as.numeric(geographyid),
    geolocid    = as.numeric(geolocid)
  )

# GEOGRAPHYTYPES
geotypes_raw <- geotypes_raw %>%
  mutate(geographyid = as.numeric(geographyid))

# INDICATORS -- keep has_cpro_viz as character for filtering later
indicators_raw <- indicators_raw %>%
  mutate(indicatorid = as.numeric(indicatorid))

head(data_raw)
## # A tibble: 6 × 5
##   datayear geographyid geolocid indicatorid data_value
##      <dbl>       <dbl>    <dbl>       <dbl>      <dbl>
## 1     2010           2    18129     1012409          0
## 2     2010           2    18147     1012409          0
## 3     2010           2    18163     1012409          0
## 4     2010           2    21101     1012409          0
## 5     2010           2    18129     1012418          0
## 6     2010           2    18147     1012418          0
head(geos_raw)
## # A tibble: 6 × 3
##   geographyid geolocid geography_display_label
##         <dbl>    <dbl> <chr>                  
## 1           2    18027 Daviess                
## 2           2    18037 Dubois                 
## 3           2    18051 Gibson                 
## 4           2    18083 Knox                   
## 5           2    18101 Martin                 
## 6           2    18123 Perry
head(geotypes_raw)
## # A tibble: 6 × 3
##   geographyid abbrev geography_name           
##         <dbl> <chr>  <chr>                    
## 1         651 AAA    Area Agency on Aging     
## 2         796 PM20   Public Use Microdata Area
## 3           2 CNTY   Counties                 
## 4           3 TWSP   Townships                
## 5          55 SCRP   School Corporations      
## 6         282 TR10   Census Tracts 2010
head(indicators_raw)
## # A tibble: 6 × 5
##   source_name  source_abbrev indicatorid display_label              has_cpro_viz
##   <chr>        <chr>               <dbl> <chr>                      <chr>       
## 1 Connect2Help C2H               1003180 Total 211 Calls for the Y… No          
## 2 Connect2Help C2H               1003186 Total 211 Calls for the Y… Yes         
## 3 Connect2Help C2H               1003339 Total 211 Calls for the Y… No          
## 4 Connect2Help C2H               1003235 Total 211 Calls for the Y… Yes         
## 5 Connect2Help C2H               1003185 Total 211 Calls for the Y… No          
## 6 Connect2Help C2H               1003177 Total 211 Calls for the Y… No

Step 3: Identify Most Recent Year per Group

For each combination of indicatorid, geographyid, and geolocid, find the maximum datayear. These become the “anchor” records representing the most current observation.

max_year_by_group <- data_raw %>%
  group_by(indicatorid, geographyid, geolocid) %>%
  summarise(datayear = max(datayear), .groups = "drop")

# Duplicate key columns for later join tracking
max_year_by_group <- max_year_by_group %>%
  mutate(
    maxdata_indicatorid  = indicatorid,
    maxdata_year         = datayear,
    maxdata_geographyid  = geographyid,
    maxdata_geolocid     = geolocid
  )

head(max_year_by_group)
## # A tibble: 6 × 8
##   indicatorid geographyid geolocid datayear maxdata_indicatorid maxdata_year
##         <dbl>       <dbl>    <dbl>    <dbl>               <dbl>        <dbl>
## 1     1000049           2    18027     2024             1000049         2024
## 2     1000049           2    18037     2024             1000049         2024
## 3     1000049           2    18051     2024             1000049         2024
## 4     1000049           2    18083     2024             1000049         2024
## 5     1000049           2    18101     2024             1000049         2024
## 6     1000049           2    18123     2024             1000049         2024
## # ℹ 2 more variables: maxdata_geographyid <dbl>, maxdata_geolocid <dbl>

Step 4: Full Join to Tag Max-Year Rows

Perform a full join between data_raw and max_year_by_group. Rows where the maxdata_* columns are populated are the most-recent-year observations; rows with NA in those columns are all prior years.

data_joined <- full_join(
  data_raw,
  max_year_by_group,
  by = c("indicatorid", "geographyid", "geolocid", "datayear")
)

head(data_joined)
## # A tibble: 6 × 9
##   datayear geographyid geolocid indicatorid data_value maxdata_indicatorid
##      <dbl>       <dbl>    <dbl>       <dbl>      <dbl>               <dbl>
## 1     2010           2    18129     1012409          0                  NA
## 2     2010           2    18147     1012409          0                  NA
## 3     2010           2    18163     1012409          0                  NA
## 4     2010           2    21101     1012409          0                  NA
## 5     2010           2    18129     1012418          0                  NA
## 6     2010           2    18147     1012418          0                  NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_geographyid <dbl>,
## #   maxdata_geolocid <dbl>

Step 5: Separate Max-Year and Historical Records

Split data_joined into two datasets:

  • max_year – only the most-recent-year rows (used as the “current” value in outlier scoring).
  • historical – all prior-year rows (used to compute the historical mean and SD).
# Rows where maxdata columns are NOT missing -> these are the max-year records
max_year <- data_joined %>%
  filter(complete.cases(.)) %>%
  select(indicatorid, datayear, geographyid, geolocid, data_value) %>%
  mutate(across(everything(), as.numeric))

head(max_year)
## # A tibble: 6 × 5
##   indicatorid datayear geographyid geolocid data_value
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1012496     2023           2    21101      166. 
## 2     1012496     2023           2    18129       51.4
## 3     1012496     2023           2    18163      246. 
## 4     1013140     2023           2    18147       14  
## 5     1013140     2023           2    18129        9  
## 6     1013138     2023           2    21101       43
# Rows where maxdata columns ARE missing -> historical (non-max) records
historical <- data_joined %>%
  filter(rowSums(is.na(.)) > 0) %>%
  select(indicatorid, datayear, geographyid, geolocid, data_value)

head(historical)
## # A tibble: 6 × 5
##   indicatorid datayear geographyid geolocid data_value
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1012409     2010           2    18129          0
## 2     1012409     2010           2    18147          0
## 3     1012409     2010           2    18163          0
## 4     1012409     2010           2    21101          0
## 5     1012418     2010           2    18129          0
## 6     1012418     2010           2    18147          0

Step 6: Filter to Numeric Data Values

Ensure data_value is truly numeric before computing statistics (removes any coerced NA from non-numeric source values).

historical_numeric <- historical %>%
  mutate(across(everything(), as.numeric)) %>%
  filter(is.numeric(data_value))

head(historical_numeric)
## # A tibble: 6 × 5
##   indicatorid datayear geographyid geolocid data_value
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1012409     2010           2    18129          0
## 2     1012409     2010           2    18147          0
## 3     1012409     2010           2    18163          0
## 4     1012409     2010           2    21101          0
## 5     1012418     2010           2    18129          0
## 6     1012418     2010           2    18147          0

Step 7: Compute Historical Count, Mean, and SD

For each group (indicatorid, geographyid, geolocid), calculate:

  • n – the number of historical data points
  • mean_value – group mean of data_value
  • sd_value – group standard deviation of data_value
# Count of historical observations per group
count_df <- historical_numeric %>%
  group_by(indicatorid, geographyid, geolocid) %>%
  count() %>%
  mutate(across(everything(), as.numeric))

head(count_df)
## # A tibble: 6 × 4
## # Groups:   indicatorid, geographyid, geolocid [6]
##   indicatorid geographyid geolocid     n
##         <dbl>       <dbl>    <dbl> <dbl>
## 1     1000049           2    18027    14
## 2     1000049           2    18037    14
## 3     1000049           2    18051    14
## 4     1000049           2    18083    14
## 5     1000049           2    18101    14
## 6     1000049           2    18123    14
# Mean and SD per group
mean_sd_df <- historical_numeric %>%
  group_by(indicatorid, geographyid, geolocid) %>%
  summarise(
    mean_value = mean(data_value, na.rm = TRUE),
    sd_value   = sd(data_value,   na.rm = TRUE),
    .groups    = "drop"
  )

head(mean_sd_df)
## # A tibble: 6 × 5
##   indicatorid geographyid geolocid mean_value sd_value
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl>
## 1     1000049           2    18027     32518.     792.
## 2     1000049           2    18037     42510.     599.
## 3     1000049           2    18051     33446.     261.
## 4     1000049           2    18083     37530.     872.
## 5     1000049           2    18101     10171.     171.
## 6     1000049           2    18123     19272.     129.
# Combine count + mean + SD into one stats table
stats_df <- left_join(mean_sd_df, count_df,
                      by = c("indicatorid", "geographyid", "geolocid"))

head(stats_df)
## # A tibble: 6 × 6
##   indicatorid geographyid geolocid mean_value sd_value     n
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl> <dbl>
## 1     1000049           2    18027     32518.     792.    14
## 2     1000049           2    18037     42510.     599.    14
## 3     1000049           2    18051     33446.     261.    14
## 4     1000049           2    18083     37530.     872.    14
## 5     1000049           2    18101     10171.     171.    14
## 6     1000049           2    18123     19272.     129.    14

Step 8: Compute Bias (Z-Score)

Join the stats table with the max-year values and compute the Bias score:

\[\text{Bias} = \frac{\text{data\_value}_{max} - \bar{x}}{\sigma}\]

This is a standard z-score measuring how far the most recent value sits from the historical mean in units of standard deviation.

bias_df <- left_join(stats_df, max_year,
                     by = c("indicatorid", "geographyid", "geolocid")) %>%
  mutate(bias = (data_value - mean_value) / sd_value)

head(bias_df)
## # A tibble: 6 × 9
##   indicatorid geographyid geolocid mean_value sd_value     n datayear data_value
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl> <dbl>    <dbl>      <dbl>
## 1     1000049           2    18027     32518.     792.    14     2024      33658
## 2     1000049           2    18037     42510.     599.    14     2024      43614
## 3     1000049           2    18051     33446.     261.    14     2024      33000
## 4     1000049           2    18083     37530.     872.    14     2024      36007
## 5     1000049           2    18101     10171.     171.    14     2024       9830
## 6     1000049           2    18123     19272.     129.    14     2024      19270
## # ℹ 1 more variable: bias <dbl>

Step 9: Merge Indicator Metadata (DISPLAY_LABEL, HAS_CPRO_VIZ)

Join with indicators_raw to bring in the human-readable indicator name (display_label) and the CPRO visualization flag (has_cpro_viz).

bias_df <- merge(indicators_raw, bias_df, by = "indicatorid")

head(bias_df)
##   indicatorid                               source_name source_abbrev
## 1     1000049 American Community Survey 5-year Averages          ACS5
## 2     1000049 American Community Survey 5-year Averages          ACS5
## 3     1000049 American Community Survey 5-year Averages          ACS5
## 4     1000049 American Community Survey 5-year Averages          ACS5
## 5     1000049 American Community Survey 5-year Averages          ACS5
## 6     1000049 American Community Survey 5-year Averages          ACS5
##             display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People          Yes          55     1405  17762.929
## 2 Total Population People          Yes          55     7445   8953.357
## 3 Total Population People          Yes         342    21780 268524.455
## 4 Total Population People          Yes         651     1813 135289.615
## 5 Total Population People          Yes          55     6445  12551.214
## 6 Total Population People          Yes          55     2765  12460.143
##    sd_value  n datayear data_value       bias
## 1  184.3181 14     2024      18242  2.5991562
## 2  152.8522 14     2024       8624 -2.1547430
## 3 1106.7739 11     2024     270494  1.7795373
## 4  315.8279 13     2023     135154 -0.4293965
## 5  265.5213 14     2024      12154 -1.4959789
## 6  266.2523 14     2024      12939  1.7985090

Step 10: Filter to Statistically Significant Outliers

Keep only records where |bias| >= 2.0 and bias is not NA. This is the standard two-sigma threshold for flagging potential outliers.

bias_df <- bias_df %>%
  filter(!is.na(bias), bias <= -2.0 | bias >= 2.0)

head(bias_df)
##   indicatorid                               source_name source_abbrev
## 1     1000049 American Community Survey 5-year Averages          ACS5
## 2     1000049 American Community Survey 5-year Averages          ACS5
## 3     1000049 American Community Survey 5-year Averages          ACS5
## 4     1000049 American Community Survey 5-year Averages          ACS5
## 5     1000049 American Community Survey 5-year Averages          ACS5
## 6     1000049 American Community Survey 5-year Averages          ACS5
##             display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People          Yes          55     1405  17762.929
## 2 Total Population People          Yes          55     7445   8953.357
## 3 Total Population People          Yes          55     8130  61496.429
## 4 Total Population People          Yes          55     2120  20041.500
## 5 Total Population People          Yes          55      251  45848.571
## 6 Total Population People          Yes           2    18173  61496.429
##    sd_value  n datayear data_value      bias
## 1  184.3181 14     2024      18242  2.599156
## 2  152.8522 14     2024       8624 -2.154743
## 3 1853.2521 14     2024      65261  2.031333
## 4  433.2373 14     2024      21221  2.722526
## 5  632.1534 14     2024      44280 -2.481314
## 6 1853.2521 14     2024      65261  2.031333

Step 11: Join Geography Display Labels

Attach the full geography name (geography_display_label) from the GEOGRAPHIES table.

geo_label_df <- left_join(bias_df, geos_raw,
                           by = c("geographyid", "geolocid"))

head(geo_label_df)
##   indicatorid                               source_name source_abbrev
## 1     1000049 American Community Survey 5-year Averages          ACS5
## 2     1000049 American Community Survey 5-year Averages          ACS5
## 3     1000049 American Community Survey 5-year Averages          ACS5
## 4     1000049 American Community Survey 5-year Averages          ACS5
## 5     1000049 American Community Survey 5-year Averages          ACS5
## 6     1000049 American Community Survey 5-year Averages          ACS5
##             display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People          Yes          55     1405  17762.929
## 2 Total Population People          Yes          55     7445   8953.357
## 3 Total Population People          Yes          55     8130  61496.429
## 4 Total Population People          Yes          55     2120  20041.500
## 5 Total Population People          Yes          55      251  45848.571
## 6 Total Population People          Yes           2    18173  61496.429
##    sd_value  n datayear data_value      bias
## 1  184.3181 14     2024      18242  2.599156
## 2  152.8522 14     2024       8624 -2.154743
## 3 1853.2521 14     2024      65261  2.031333
## 4  433.2373 14     2024      21221  2.722526
## 5  632.1534 14     2024      44280 -2.481314
## 6 1853.2521 14     2024      65261  2.031333
##               geography_display_label
## 1    Washington Community School Corp
## 2    South Spencer County School Corp
## 3          Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5                 Henderson County SD
## 6                             Warrick

Step 12: Filter to Target Geography Types

Retain only the geography types of interest (county = geographyid == 2, school corporation = geographyid == 55). Update this vector as needed for other analyses.

geo_label_df <- geo_label_df %>%
  filter(geographyid %in% c(2, 55))

head(geo_label_df)
##   indicatorid                               source_name source_abbrev
## 1     1000049 American Community Survey 5-year Averages          ACS5
## 2     1000049 American Community Survey 5-year Averages          ACS5
## 3     1000049 American Community Survey 5-year Averages          ACS5
## 4     1000049 American Community Survey 5-year Averages          ACS5
## 5     1000049 American Community Survey 5-year Averages          ACS5
## 6     1000049 American Community Survey 5-year Averages          ACS5
##             display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People          Yes          55     1405  17762.929
## 2 Total Population People          Yes          55     7445   8953.357
## 3 Total Population People          Yes          55     8130  61496.429
## 4 Total Population People          Yes          55     2120  20041.500
## 5 Total Population People          Yes          55      251  45848.571
## 6 Total Population People          Yes           2    18173  61496.429
##    sd_value  n datayear data_value      bias
## 1  184.3181 14     2024      18242  2.599156
## 2  152.8522 14     2024       8624 -2.154743
## 3 1853.2521 14     2024      65261  2.031333
## 4  433.2373 14     2024      21221  2.722526
## 5  632.1534 14     2024      44280 -2.481314
## 6 1853.2521 14     2024      65261  2.031333
##               geography_display_label
## 1    Washington Community School Corp
## 2    South Spencer County School Corp
## 3          Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5                 Henderson County SD
## 6                             Warrick

Step 13: Require Minimum Historical Observations (n >= 3)

Exclude groups with fewer than 3 historical data points, as the standard deviation is unreliable for very small samples.

geo_label_df <- geo_label_df %>%
  filter(!is.na(n), n >= 3)

head(geo_label_df)
##   indicatorid                               source_name source_abbrev
## 1     1000049 American Community Survey 5-year Averages          ACS5
## 2     1000049 American Community Survey 5-year Averages          ACS5
## 3     1000049 American Community Survey 5-year Averages          ACS5
## 4     1000049 American Community Survey 5-year Averages          ACS5
## 5     1000049 American Community Survey 5-year Averages          ACS5
## 6     1000049 American Community Survey 5-year Averages          ACS5
##             display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People          Yes          55     1405  17762.929
## 2 Total Population People          Yes          55     7445   8953.357
## 3 Total Population People          Yes          55     8130  61496.429
## 4 Total Population People          Yes          55     2120  20041.500
## 5 Total Population People          Yes          55      251  45848.571
## 6 Total Population People          Yes           2    18173  61496.429
##    sd_value  n datayear data_value      bias
## 1  184.3181 14     2024      18242  2.599156
## 2  152.8522 14     2024       8624 -2.154743
## 3 1853.2521 14     2024      65261  2.031333
## 4  433.2373 14     2024      21221  2.722526
## 5  632.1534 14     2024      44280 -2.481314
## 6 1853.2521 14     2024      65261  2.031333
##               geography_display_label
## 1    Washington Community School Corp
## 2    South Spencer County School Corp
## 3          Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5                 Henderson County SD
## 6                             Warrick

Step 14: Join Geography Type Metadata (ABBREV)

Join the GEOGRAPHYTYPES table to retrieve the geography-type abbreviation (abbrev). Drop the full geography_name column after joining since abbrev alone is sufficient for grouping.

geo_label_df <- merge(
  geo_label_df,
  geotypes_raw,
  by    = "geographyid",
  all.x = TRUE
) %>%
  select(-geography_name)

head(geo_label_df)
##   geographyid indicatorid                               source_name
## 1           2     1000275 American Community Survey 5-year Averages
## 2           2     1000275 American Community Survey 5-year Averages
## 3           2     1000174 American Community Survey 5-year Averages
## 4           2     1000208 American Community Survey 5-year Averages
## 5           2     1001412 American Community Survey 5-year Averages
## 6           2     1001221          Indiana Business Research Center
##   source_abbrev
## 1          ACS5
## 2          ACS5
## 3          ACS5
## 4          ACS5
## 5          ACS5
## 6          IBRC
##                                                       display_label
## 1 Population 25-64 without a High School Diploma in the Labor Force
## 2 Population 25-64 without a High School Diploma in the Labor Force
## 3                               Male Population Age 60 to 64 People
## 4     Unemployed Hispanic Population as % of Hispanic Civ Lab Force
## 5                 Minority Population - Bachelor's Degree or Higher
## 6                Population Projection - Male Population 0-19 Years
##   has_cpro_viz geolocid   mean_value   sd_value  n datayear  data_value
## 1           No    18101  391.5000000  62.464451 14     2024  210.000000
## 2           No    18027 2314.5714286 186.391942 14     2024 2698.000000
## 3          Yes    18051 1031.7857143 106.453441 14     2024 1254.000000
## 4          Yes    18173    0.9133065   1.991990 14     2024    7.540603
## 5           No    18101   16.3571429   9.434855 14     2024   43.000000
## 6           No    18027 5561.8333333  50.834699  6     2050 5728.000000
##        bias geography_display_label abbrev
## 1 -2.905653                  Martin   CNTY
## 2  2.057109                 Daviess   CNTY
## 3  2.087432                  Gibson   CNTY
## 4  3.326973                 Warrick   CNTY
## 5  2.823876                  Martin   CNTY
## 6  3.268765                 Daviess   CNTY

Step 15: Assign Trend Direction

Create a trend variable classifying each outlier as High (bias > 2) or Low (bias < -2).

geo_label_df <- geo_label_df %>%
  mutate(trend = case_when(
    bias >  2 ~ "High",
    bias < -2 ~ "Low",
    TRUE      ~ "Normal"
  ))

head(geo_label_df)
##   geographyid indicatorid                               source_name
## 1           2     1000275 American Community Survey 5-year Averages
## 2           2     1000275 American Community Survey 5-year Averages
## 3           2     1000174 American Community Survey 5-year Averages
## 4           2     1000208 American Community Survey 5-year Averages
## 5           2     1001412 American Community Survey 5-year Averages
## 6           2     1001221          Indiana Business Research Center
##   source_abbrev
## 1          ACS5
## 2          ACS5
## 3          ACS5
## 4          ACS5
## 5          ACS5
## 6          IBRC
##                                                       display_label
## 1 Population 25-64 without a High School Diploma in the Labor Force
## 2 Population 25-64 without a High School Diploma in the Labor Force
## 3                               Male Population Age 60 to 64 People
## 4     Unemployed Hispanic Population as % of Hispanic Civ Lab Force
## 5                 Minority Population - Bachelor's Degree or Higher
## 6                Population Projection - Male Population 0-19 Years
##   has_cpro_viz geolocid   mean_value   sd_value  n datayear  data_value
## 1           No    18101  391.5000000  62.464451 14     2024  210.000000
## 2           No    18027 2314.5714286 186.391942 14     2024 2698.000000
## 3          Yes    18051 1031.7857143 106.453441 14     2024 1254.000000
## 4          Yes    18173    0.9133065   1.991990 14     2024    7.540603
## 5           No    18101   16.3571429   9.434855 14     2024   43.000000
## 6           No    18027 5561.8333333  50.834699  6     2050 5728.000000
##        bias geography_display_label abbrev trend
## 1 -2.905653                  Martin   CNTY   Low
## 2  2.057109                 Daviess   CNTY  High
## 3  2.087432                  Gibson   CNTY  High
## 4  3.326973                 Warrick   CNTY  High
## 5  2.823876                  Martin   CNTY  High
## 6  3.268765                 Daviess   CNTY  High

Step 16: Apply Stricter Outlier Threshold (|Bias| > 3)

Use a tighter threshold of 3 standard deviations to surface the most extreme outliers. Build a summary count of outlier observations per indicator-geography group.

outlier_threshold <- 3

geography_counts <- geo_label_df %>%
  group_by(geographyid, display_label, indicatorid) %>%
  filter(bias > outlier_threshold) %>%
  summarise(outlier_count = n(), .groups = "drop")

# Re-join full detail columns
geography_counts <- merge(
  geography_counts,
  geo_label_df %>%
    select(geographyid, geolocid, indicatorid, display_label,
           mean_value, sd_value, n, data_value,
           geography_display_label, bias, trend) %>%
    distinct(),
  by     = c("geographyid", "indicatorid"),
  all.x  = TRUE,
  suffixes = c("", "_label")
)

head(geography_counts)
##   geographyid indicatorid
## 1           2     1000105
## 2           2     1000106
## 3           2     1000106
## 4           2     1000106
## 5           2     1000106
## 6           2     1000119
##                                                        display_label
## 1 Population Age 65 and Over Living in Poverty as % of Pop 65 and Up
## 2                Population Age 65 and Over Living in Poverty People
## 3                Population Age 65 and Over Living in Poverty People
## 4                Population Age 65 and Over Living in Poverty People
## 5                Population Age 65 and Over Living in Poverty People
## 6                                         Hispanic Population People
##   outlier_count geolocid
## 1             1    18101
## 2             1    18163
## 3             1    18101
## 4             1    18147
## 5             1    18083
## 6             1    18027
##                                                  display_label_label
## 1 Population Age 65 and Over Living in Poverty as % of Pop 65 and Up
## 2                Population Age 65 and Over Living in Poverty People
## 3                Population Age 65 and Over Living in Poverty People
## 4                Population Age 65 and Over Living in Poverty People
## 5                Population Age 65 and Over Living in Poverty People
## 6                                         Hispanic Population People
##    mean_value    sd_value  n data_value geography_display_label     bias trend
## 1    8.824683   0.6187757 14   11.49706                  Martin 4.318821  High
## 2 2269.357143 367.5012521 14 3080.00000                Vndrbrgh 2.205823  High
## 3  156.285714  18.3153092 14  235.00000                  Martin 4.297732  High
## 4  308.285714  46.7734780 14  435.00000                 Spencer 2.709105  High
## 5  523.714286  80.8925485 14  719.00000                    Knox 2.414137  High
## 6 1573.071429 263.7293089 14 2193.00000                 Daviess 2.350624  High

Step 17: Final Outlier Filter and Cleanup

Keep only rows where |bias| > 3 (excluding infinite values), then remove indicators known to produce systematic data artifacts.

filtered_geography <- geography_counts %>%
  filter((bias > 3 | bias < -3) & is.finite(bias)) %>%
  arrange(desc(bias))

print("Geography records with |Bias| > 3 (excluding infinite values):")
## [1] "Geography records with |Bias| > 3 (excluding infinite values):"
head(filtered_geography)
##   geographyid indicatorid
## 1           2     1000284
## 2          55     1005053
## 3          55     1000925
## 4          55     1005059
## 5          55     1000539
## 6          55     1000518
##                                                                                                                display_label
## 1                                                                    Median Household Income - Hispanic Householders Dollars
## 2                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 3                                                                       % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5                                                                  Asian Population 25+ With High School Diploma Only People
## 6                                                                 Asian Population 25+ With Bachelor Degree or Higher People
##   outlier_count geolocid
## 1             1    18129
## 2             2     2110
## 3             3     7445
## 4             1     2110
## 5             3     4325
## 6             3     2110
##                                                                                                          display_label_label
## 1                                                                    Median Household Income - Hispanic Householders Dollars
## 2                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 3                                                                       % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5                                                                  Asian Population 25+ With High School Diploma Only People
## 6                                                                 Asian Population 25+ With Bachelor Degree or Higher People
##     mean_value     sd_value  n   data_value             geography_display_label
## 1 3.753575e+04 1.509747e+03  4 1.788440e+05                               Posey
## 2 1.875000e+00 1.125992e+00  8 9.500000e+01 Southwest Dubois County School Corp
## 3 8.456176e-02 3.556189e-03  5 3.734827e-01    South Spencer County School Corp
## 4 2.060024e-02 1.261647e-02  8 1.016913e+00 Southwest Dubois County School Corp
## 5 2.142857e+00 1.657484e+00 14 1.190000e+02              South Knox School Corp
## 6 1.214286e+00 1.251373e+00 14 5.900000e+01 Southwest Dubois County School Corp
##       bias trend
## 1 93.59733  High
## 2 82.70488  High
## 3 81.24455  High
## 4 78.96921  High
## 5 70.50273  High
## 6 46.17785  High
# Indicator IDs flagged for removal due to known data quality issues
ids_to_remove <- unique(c(
  1003860, 1003401, 1000931, 1000204, 1003672,
  1003996, 1003407, 1012339, 1000260, 1003949,
  1004430, 1003952, 1003860, 1011638, 1000561,
  1000931, 1002004, 100436,  1003954, 1011628,
  1003947, 1004436
))

filtered_geography <- filtered_geography %>%
  filter(!indicatorid %in% ids_to_remove)

# Remove a known problematic geography location
filtered_geography <- filtered_geography %>%
  filter(geolocid != 365)

head(filtered_geography)
##   geographyid indicatorid
## 1           2     1000284
## 2          55     1005053
## 3          55     1000925
## 4          55     1005059
## 5          55     1000539
## 6          55     1000518
##                                                                                                                display_label
## 1                                                                    Median Household Income - Hispanic Householders Dollars
## 2                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 3                                                                       % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5                                                                  Asian Population 25+ With High School Diploma Only People
## 6                                                                 Asian Population 25+ With Bachelor Degree or Higher People
##   outlier_count geolocid
## 1             1    18129
## 2             2     2110
## 3             3     7445
## 4             1     2110
## 5             3     4325
## 6             3     2110
##                                                                                                          display_label_label
## 1                                                                    Median Household Income - Hispanic Householders Dollars
## 2                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 3                                                                       % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5                                                                  Asian Population 25+ With High School Diploma Only People
## 6                                                                 Asian Population 25+ With Bachelor Degree or Higher People
##     mean_value     sd_value  n   data_value             geography_display_label
## 1 3.753575e+04 1.509747e+03  4 1.788440e+05                               Posey
## 2 1.875000e+00 1.125992e+00  8 9.500000e+01 Southwest Dubois County School Corp
## 3 8.456176e-02 3.556189e-03  5 3.734827e-01    South Spencer County School Corp
## 4 2.060024e-02 1.261647e-02  8 1.016913e+00 Southwest Dubois County School Corp
## 5 2.142857e+00 1.657484e+00 14 1.190000e+02              South Knox School Corp
## 6 1.214286e+00 1.251373e+00 14 5.900000e+01 Southwest Dubois County School Corp
##       bias trend
## 1 93.59733  High
## 2 82.70488  High
## 3 81.24455  High
## 4 78.96921  High
## 5 70.50273  High
## 6 46.17785  High

Step 18: Split by Geography Type

Separate the filtered outlier set into counties and school corporations for independent analysis.

# Counties (geographyid == 2)
geography_counts_county <- filtered_geography %>%
  filter(geographyid == 2)

# School Corporations (geographyid == 55)
geography_counts_sc <- filtered_geography %>%
  filter(geographyid == 55)

head(geography_counts_county)
##   geographyid indicatorid
## 1           2     1000284
## 2           2     1004212
## 3           2     1004295
## 4           2     1004633
## 5           2     1003708
## 6           2     1000901
##                                                                        display_label
## 1                            Median Household Income - Hispanic Householders Dollars
## 2             RSEI Aggregated Risk Score for Chemical Air Releases - NonCancer Score
## 3                                                    RSEI NonCancer Score Per Capita
## 4                                    African American Population Age 20 to 24 People
## 5 African American Home Improvement Loan Apps as % of All Home Improvement Loan Apps
## 6                      Enrollment - African American Students - Total Public Schools
##   outlier_count geolocid
## 1             1    18129
## 2             2    18101
## 3             2    18101
## 4             1    18101
## 5             1    18083
## 6             5    18027
##                                                                  display_label_label
## 1                            Median Household Income - Hispanic Householders Dollars
## 2             RSEI Aggregated Risk Score for Chemical Air Releases - NonCancer Score
## 3                                                    RSEI NonCancer Score Per Capita
## 4                                    African American Population Age 20 to 24 People
## 5 African American Home Improvement Loan Apps as % of All Home Improvement Loan Apps
## 6                      Enrollment - African American Students - Total Public Schools
##     mean_value     sd_value  n   data_value geography_display_label     bias
## 1 3.753575e+04 1.509747e+03  4 1.788440e+05                   Posey 93.59733
## 2 1.156423e+04 1.334012e+04  3 5.943543e+05                  Martin 43.68702
## 3 1.150613e+00 1.328687e+00  3 5.875783e+01                  Martin 43.35649
## 4 2.857143e-01 4.688072e-01 14 1.700000e+01                  Martin 35.65279
## 5 7.725116e-01 9.092810e-02  3 2.678571e+00                    Knox 20.96227
## 6 1.114000e+02 4.560702e+00  5 1.850000e+02                 Daviess 16.13787
##   trend
## 1  High
## 2  High
## 3  High
## 4  High
## 5  High
## 6  High
head(geography_counts_sc)
##   geographyid indicatorid
## 1          55     1005053
## 2          55     1000925
## 3          55     1005059
## 4          55     1000539
## 5          55     1000518
## 6          55     1004758
##                                                                                                                display_label
## 1                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 2                                                                       % Enrollment - Asian Students - Total Public Schools
## 3 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 4                                                                  Asian Population 25+ With High School Diploma Only People
## 5                                                                 Asian Population 25+ With Bachelor Degree or Higher People
## 6                        People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
##   outlier_count geolocid
## 1             2     2110
## 2             3     7445
## 3             1     2110
## 4             3     4325
## 5             3     2110
## 6             1     4325
##                                                                                                          display_label_label
## 1                                    People over the age of 5 years speaking primarily other Indo-European languages at home
## 2                                                                       % Enrollment - Asian Students - Total Public Schools
## 3 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 4                                                                  Asian Population 25+ With High School Diploma Only People
## 5                                                                 Asian Population 25+ With Bachelor Degree or Higher People
## 6                        People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
##   mean_value    sd_value  n  data_value             geography_display_label
## 1 1.87500000 1.125991626  8  95.0000000 Southwest Dubois County School Corp
## 2 0.08456176 0.003556189  5   0.3734827    South Spencer County School Corp
## 3 0.02060024 0.012616469  8   1.0169129 Southwest Dubois County School Corp
## 4 2.14285714 1.657483860 14 119.0000000              South Knox School Corp
## 5 1.21428571 1.251372872 14  59.0000000 Southwest Dubois County School Corp
## 6 0.12464582 0.107894348  8   3.4399852              South Knox School Corp
##       bias trend
## 1 82.70488  High
## 2 81.24455  High
## 3 78.96921  High
## 4 70.50273  High
## 5 46.17785  High
## 6 30.72765  High

Step 19: Goal 2 – Percent of Geographies Flagged per Indicator

Compute what share of counties (out of 12) and school corporations (out of 25) are flagged as outliers for each indicator.

# Counties: denominator = 12 counties in the study area
goal_2_county <- geography_counts_county %>%
  group_by(indicatorid, display_label, outlier_count, trend) %>%
  summarise(.groups = "drop") %>%
  mutate(percent_of_counties = (outlier_count / 12) * 100)

# School Corporations: denominator = 25 school corporations
goal_2_sc <- geography_counts_sc %>%
  group_by(indicatorid, display_label, outlier_count, trend) %>%
  summarise(.groups = "drop") %>%
  mutate(percent_of_school_corps = (outlier_count / 25) * 100)

head(goal_2_county)
## # A tibble: 6 × 5
##   indicatorid display_label              outlier_count trend percent_of_counties
##         <dbl> <chr>                              <int> <chr>               <dbl>
## 1     1000105 Population Age 65 and Ove…             1 High                 8.33
## 2     1000106 Population Age 65 and Ove…             1 High                 8.33
## 3     1000119 Hispanic Population People             1 High                 8.33
## 4     1000120 Caucasian Population Peop…             3 High                25   
## 5     1000120 Caucasian Population Peop…             3 Low                 25   
## 6     1000127 Median Age - Hispanic Yea…             1 High                 8.33
head(goal_2_sc)
## # A tibble: 6 × 5
##   indicatorid display_label           outlier_count trend percent_of_school_co…¹
##         <dbl> <chr>                           <int> <chr>                  <dbl>
## 1     1000084 Hispanic Population 25…             1 High                       4
## 2     1000085 Hispanic Population 25…             1 High                       4
## 3     1000101 Population Age 18 to 6…             1 High                       4
## 4     1000103 Population Age 5 and U…             1 High                       4
## 5     1000105 Population Age 65 and …             1 High                       4
## 6     1000106 Population Age 65 and …             1 High                       4
## # ℹ abbreviated name: ¹​percent_of_school_corps

Step 20: Total Outliers by Geography (High Trend)

For each geography type, rank individual geographies by how many high-outlier indicators they appear in. This surfaces locations that are broadly trending upward across multiple indicators.

# Counties -- High outliers per geography
geo_2_high <- filtered_geography %>%
  filter(geographyid == 2, trend == "High") %>%
  group_by(geography_display_label) %>%
  summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
  arrange(desc(total_outliers))

print("Counties with the most High outlier indicators:")
## [1] "Counties with the most High outlier indicators:"
head(geo_2_high)
## # A tibble: 6 × 2
##   geography_display_label total_outliers
##   <chr>                            <int>
## 1 Gibson                             166
## 2 Daviess                            160
## 3 Vndrbrgh                           154
## 4 Dubois                             150
## 5 Spencer                            142
## 6 Warrick                            141
# School Corporations -- High outliers per geography
geo_55_high <- filtered_geography %>%
  filter(geographyid == 55, trend == "High") %>%
  group_by(geography_display_label) %>%
  summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
  arrange(desc(total_outliers))

print("School Corporations with the most High outlier indicators:")
## [1] "School Corporations with the most High outlier indicators:"
head(geo_55_high)
## # A tibble: 6 × 2
##   geography_display_label             total_outliers
##   <chr>                                        <int>
## 1 Greater Jasper Consolidated Schools            298
## 2 Washington Community School Corp               202
## 3 North Gibson School Corp                       157
## 4 Evansville-Vanderburgh School Corp             156
## 5 Barr-Reeve Community School Corp               155
## 6 North Knox School Corp                         154

Step 21: Total Outliers by Geography (Low Trend)

Same as above but for geographies trending significantly below the historical mean.

# Counties -- Low outliers per geography
geo_2_low <- filtered_geography %>%
  filter(geographyid == 2, trend == "Low") %>%
  group_by(geography_display_label) %>%
  summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
  arrange(desc(total_outliers))

print("Counties with the most Low outlier indicators:")
## [1] "Counties with the most Low outlier indicators:"
head(geo_2_low)
## # A tibble: 6 × 2
##   geography_display_label total_outliers
##   <chr>                            <int>
## 1 <NA>                                69
## 2 Martin                               8
## 3 Daviess                              7
## 4 Dubois                               6
## 5 Knox                                 5
## 6 Spencer                              5
# School Corporations -- Low outliers per geography
geo_55_low <- filtered_geography %>%
  filter(geographyid == 55, trend == "Low") %>%
  group_by(geography_display_label) %>%
  summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
  arrange(desc(total_outliers))

print("School Corporations with the most Low outlier indicators:")
## [1] "School Corporations with the most Low outlier indicators:"
head(geo_55_low)
## # A tibble: 6 × 2
##   geography_display_label             total_outliers
##   <chr>                                        <int>
## 1 Shoals Community School Corp                    26
## 2 Evansville-Vanderburgh School Corp              19
## 3 North Spencer County School Corp                12
## 4 Southwest Dubois County School Corp             12
## 5 Vincennes Community School Corp                 12
## 6 Mount Vernon MSD                                10

Step 22: CPRO Viz Filter

Subset the final outlier table to only indicators where has_cpro_viz == "Yes". Because has_cpro_viz lives in indicators_raw and is not automatically carried through all prior joins, we re-join it here before filtering.

cpro_outliers <- filtered_geography %>%
  left_join(
    indicators_raw %>% select(indicatorid, has_cpro_viz),
    by = "indicatorid"
  ) %>%
  filter(tolower(has_cpro_viz) == "yes")

print(paste("Total CPRO-eligible outlier records:", nrow(cpro_outliers)))
## [1] "Total CPRO-eligible outlier records: 954"
head(cpro_outliers)
##   geographyid indicatorid
## 1           2     1000284
## 2           2     1004295
## 3           2     1004633
## 4          55     1004758
## 5          55     1004751
## 6          55     1004670
##                                                                                         display_label
## 1                                             Median Household Income - Hispanic Householders Dollars
## 2                                                                     RSEI NonCancer Score Per Capita
## 3                                                     African American Population Age 20 to 24 People
## 4 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## 5                                    People over the age of 5 years speaking primarily Korean at home
## 6                                                             Hispanic Population Age 25 to 34 People
##   outlier_count geolocid
## 1             1    18129
## 2             2    18101
## 3             1    18101
## 4             1     4325
## 5             1     4325
## 6             4     5520
##                                                                                   display_label_label
## 1                                             Median Household Income - Hispanic Householders Dollars
## 2                                                                     RSEI NonCancer Score Per Capita
## 3                                                     African American Population Age 20 to 24 People
## 4 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## 5                                    People over the age of 5 years speaking primarily Korean at home
## 6                                                             Hispanic Population Age 25 to 34 People
##     mean_value     sd_value  n   data_value      geography_display_label
## 1 3.753575e+04 1509.7466399  4 1.788440e+05                        Posey
## 2 1.150613e+00    1.3286872  3 5.875783e+01                       Martin
## 3 2.857143e-01    0.4688072 14 1.700000e+01                       Martin
## 4 1.246458e-01    0.1078943  8 3.439985e+00       South Knox School Corp
## 5 6.875000e+00    5.9145704  8 1.860000e+02       South Knox School Corp
## 6 4.285714e-01    0.7559289 14 1.900000e+01 Shoals Community School Corp
##       bias trend has_cpro_viz
## 1 93.59733  High          Yes
## 2 43.35649  High          Yes
## 3 35.65279  High          Yes
## 4 30.72765  High          Yes
## 5 30.28538  High          Yes
## 6 24.56769  High          Yes

Step 23: Summary Table – Source Breakdown

Use the source_abbrev field from INDICATORS to summarize how many outliers originate from each data source. This helps prioritize follow-up data quality review by source.

source_summary <- filtered_geography %>%
  left_join(indicators_raw %>% select(indicatorid, source_name, source_abbrev),
            by = "indicatorid") %>%
  group_by(source_abbrev, source_name, trend) %>%
  summarise(total_outlier_records = n(), .groups = "drop") %>%
  arrange(desc(total_outlier_records))

print("Outlier records by data source and trend direction:")
## [1] "Outlier records by data source and trend direction:"
head(source_summary, 20)
## # A tibble: 20 × 4
##    source_abbrev source_name                         trend total_outlier_records
##    <chr>         <chr>                               <chr>                 <int>
##  1 ACS5          American Community Survey 5-year A… High                   1424
##  2 IDOE          Indiana Department of Education     High                    110
##  3 ACS5          American Community Survey 5-year A… Low                      87
##  4 INKYDE        Indiana Department of Education/Ke… High                     63
##  5 IDOH          Indiana Department of Health        High                     36
##  6 INSC          Indiana Supreme Court               High                     35
##  7 IDOE          Indiana Department of Education     Low                      32
##  8 HMDA          Federal Financial Institutions Exa… High                     29
##  9 INKYDE        Indiana Department of Education/Ke… Low                      24
## 10 CDCPLC        CDC PLACES                          High                     15
## 11 ACAG          Atmospheric Composition Analysis G… High                     12
## 12 BLSQCE        Bureau of Labor Statistics - Quart… High                     10
## 13 CMS           Center for Medicaid and Medicare S… High                      8
## 14 RSEI          Environmental Protection Agency Ri… High                      8
## 15 LEHD          US Census Bureau - Longitudinal Em… High                      5
## 16 IDOH          Indiana Department of Health        Low                       4
## 17 CMS           Center for Medicaid and Medicare S… Low                       3
## 18 CDC           Centers for Disease Control and Pr… High                      2
## 19 INSC          Indiana Supreme Court               Low                       2
## 20 KDOE          Kentucky Department of Education    High                      2

End of SAVI Outlier Analysis Pipeline