library(dplyr)
library(tidyverse)
library(janitor)
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
Load all four relational CSV files and apply
clean_names() to standardize column names to
snake_case.
# DATA table: GEOGRAPHYID, GEOLOCID, INDICATORID, DATAYEAR, DATA_VALUE
data_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Data.csv") %>%
clean_names()
# GEOGRAPHIES table: GEOGRAPHYID, GEOLOCID, GEOGRAPHY_DISPLAY_LABEL
geos_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Geographies.csv") %>%
clean_names()
# GEOGRAPHYTYPES table: GEOGRAPHYID, ABBREV, GEOGRAPHY_NAME
geotypes_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Geography_Types.csv") %>%
clean_names()
# INDICATORS table: INDICATORID, SOURCE_NAME, SOURCE_ABBREV, DISPLAY_LABEL, HAS_CPRO_VIZ
indicators_raw <- read_csv("~/Documents/IU Indy - Spring '26/SAVI Outlier Data Analysis/Input Data/qry_Outlier_Analysis_Indicators.csv") %>%
clean_names()
geotypes_raw <- geotypes_raw %>%
rename(geographyid = id)
Ensure key ID columns are numeric across all tables to support reliable joins.
# DATA
data_raw <- data_raw %>%
mutate(
geographyid = as.numeric(geographyid),
geolocid = as.numeric(geolocid),
indicatorid = as.numeric(indicatorid),
datayear = as.numeric(datayear),
data_value = as.numeric(data_value)
)
# GEOGRAPHIES
geos_raw <- geos_raw %>%
mutate(
geographyid = as.numeric(geographyid),
geolocid = as.numeric(geolocid)
)
# GEOGRAPHYTYPES
geotypes_raw <- geotypes_raw %>%
mutate(geographyid = as.numeric(geographyid))
# INDICATORS -- keep has_cpro_viz as character for filtering later
indicators_raw <- indicators_raw %>%
mutate(indicatorid = as.numeric(indicatorid))
head(data_raw)
## # A tibble: 6 × 5
## datayear geographyid geolocid indicatorid data_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010 2 18129 1012409 0
## 2 2010 2 18147 1012409 0
## 3 2010 2 18163 1012409 0
## 4 2010 2 21101 1012409 0
## 5 2010 2 18129 1012418 0
## 6 2010 2 18147 1012418 0
head(geos_raw)
## # A tibble: 6 × 3
## geographyid geolocid geography_display_label
## <dbl> <dbl> <chr>
## 1 2 18027 Daviess
## 2 2 18037 Dubois
## 3 2 18051 Gibson
## 4 2 18083 Knox
## 5 2 18101 Martin
## 6 2 18123 Perry
head(geotypes_raw)
## # A tibble: 6 × 3
## geographyid abbrev geography_name
## <dbl> <chr> <chr>
## 1 651 AAA Area Agency on Aging
## 2 796 PM20 Public Use Microdata Area
## 3 2 CNTY Counties
## 4 3 TWSP Townships
## 5 55 SCRP School Corporations
## 6 282 TR10 Census Tracts 2010
head(indicators_raw)
## # A tibble: 6 × 5
## source_name source_abbrev indicatorid display_label has_cpro_viz
## <chr> <chr> <dbl> <chr> <chr>
## 1 Connect2Help C2H 1003180 Total 211 Calls for the Y… No
## 2 Connect2Help C2H 1003186 Total 211 Calls for the Y… Yes
## 3 Connect2Help C2H 1003339 Total 211 Calls for the Y… No
## 4 Connect2Help C2H 1003235 Total 211 Calls for the Y… Yes
## 5 Connect2Help C2H 1003185 Total 211 Calls for the Y… No
## 6 Connect2Help C2H 1003177 Total 211 Calls for the Y… No
For each combination of indicatorid,
geographyid, and geolocid, find the maximum
datayear. These become the “anchor” records representing
the most current observation.
max_year_by_group <- data_raw %>%
group_by(indicatorid, geographyid, geolocid) %>%
summarise(datayear = max(datayear), .groups = "drop")
# Duplicate key columns for later join tracking
max_year_by_group <- max_year_by_group %>%
mutate(
maxdata_indicatorid = indicatorid,
maxdata_year = datayear,
maxdata_geographyid = geographyid,
maxdata_geolocid = geolocid
)
head(max_year_by_group)
## # A tibble: 6 × 8
## indicatorid geographyid geolocid datayear maxdata_indicatorid maxdata_year
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18027 2024 1000049 2024
## 2 1000049 2 18037 2024 1000049 2024
## 3 1000049 2 18051 2024 1000049 2024
## 4 1000049 2 18083 2024 1000049 2024
## 5 1000049 2 18101 2024 1000049 2024
## 6 1000049 2 18123 2024 1000049 2024
## # ℹ 2 more variables: maxdata_geographyid <dbl>, maxdata_geolocid <dbl>
Perform a full join between data_raw and
max_year_by_group. Rows where the maxdata_*
columns are populated are the most-recent-year observations; rows with
NA in those columns are all prior years.
data_joined <- full_join(
data_raw,
max_year_by_group,
by = c("indicatorid", "geographyid", "geolocid", "datayear")
)
head(data_joined)
## # A tibble: 6 × 9
## datayear geographyid geolocid indicatorid data_value maxdata_indicatorid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010 2 18129 1012409 0 NA
## 2 2010 2 18147 1012409 0 NA
## 3 2010 2 18163 1012409 0 NA
## 4 2010 2 21101 1012409 0 NA
## 5 2010 2 18129 1012418 0 NA
## 6 2010 2 18147 1012418 0 NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_geographyid <dbl>,
## # maxdata_geolocid <dbl>
Split data_joined into two datasets:
max_year – only the most-recent-year
rows (used as the “current” value in outlier scoring).historical – all prior-year rows (used
to compute the historical mean and SD).# Rows where maxdata columns are NOT missing -> these are the max-year records
max_year <- data_joined %>%
filter(complete.cases(.)) %>%
select(indicatorid, datayear, geographyid, geolocid, data_value) %>%
mutate(across(everything(), as.numeric))
head(max_year)
## # A tibble: 6 × 5
## indicatorid datayear geographyid geolocid data_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1012496 2023 2 21101 166.
## 2 1012496 2023 2 18129 51.4
## 3 1012496 2023 2 18163 246.
## 4 1013140 2023 2 18147 14
## 5 1013140 2023 2 18129 9
## 6 1013138 2023 2 21101 43
# Rows where maxdata columns ARE missing -> historical (non-max) records
historical <- data_joined %>%
filter(rowSums(is.na(.)) > 0) %>%
select(indicatorid, datayear, geographyid, geolocid, data_value)
head(historical)
## # A tibble: 6 × 5
## indicatorid datayear geographyid geolocid data_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1012409 2010 2 18129 0
## 2 1012409 2010 2 18147 0
## 3 1012409 2010 2 18163 0
## 4 1012409 2010 2 21101 0
## 5 1012418 2010 2 18129 0
## 6 1012418 2010 2 18147 0
Ensure data_value is truly numeric before computing
statistics (removes any coerced NA from non-numeric source values).
historical_numeric <- historical %>%
mutate(across(everything(), as.numeric)) %>%
filter(is.numeric(data_value))
head(historical_numeric)
## # A tibble: 6 × 5
## indicatorid datayear geographyid geolocid data_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1012409 2010 2 18129 0
## 2 1012409 2010 2 18147 0
## 3 1012409 2010 2 18163 0
## 4 1012409 2010 2 21101 0
## 5 1012418 2010 2 18129 0
## 6 1012418 2010 2 18147 0
For each group (indicatorid, geographyid,
geolocid), calculate:
n – the number of historical data pointsmean_value – group mean of data_valuesd_value – group standard deviation of
data_value# Count of historical observations per group
count_df <- historical_numeric %>%
group_by(indicatorid, geographyid, geolocid) %>%
count() %>%
mutate(across(everything(), as.numeric))
head(count_df)
## # A tibble: 6 × 4
## # Groups: indicatorid, geographyid, geolocid [6]
## indicatorid geographyid geolocid n
## <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18027 14
## 2 1000049 2 18037 14
## 3 1000049 2 18051 14
## 4 1000049 2 18083 14
## 5 1000049 2 18101 14
## 6 1000049 2 18123 14
# Mean and SD per group
mean_sd_df <- historical_numeric %>%
group_by(indicatorid, geographyid, geolocid) %>%
summarise(
mean_value = mean(data_value, na.rm = TRUE),
sd_value = sd(data_value, na.rm = TRUE),
.groups = "drop"
)
head(mean_sd_df)
## # A tibble: 6 × 5
## indicatorid geographyid geolocid mean_value sd_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18027 32518. 792.
## 2 1000049 2 18037 42510. 599.
## 3 1000049 2 18051 33446. 261.
## 4 1000049 2 18083 37530. 872.
## 5 1000049 2 18101 10171. 171.
## 6 1000049 2 18123 19272. 129.
# Combine count + mean + SD into one stats table
stats_df <- left_join(mean_sd_df, count_df,
by = c("indicatorid", "geographyid", "geolocid"))
head(stats_df)
## # A tibble: 6 × 6
## indicatorid geographyid geolocid mean_value sd_value n
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18027 32518. 792. 14
## 2 1000049 2 18037 42510. 599. 14
## 3 1000049 2 18051 33446. 261. 14
## 4 1000049 2 18083 37530. 872. 14
## 5 1000049 2 18101 10171. 171. 14
## 6 1000049 2 18123 19272. 129. 14
Join the stats table with the max-year values and compute the Bias score:
\[\text{Bias} = \frac{\text{data\_value}_{max} - \bar{x}}{\sigma}\]
This is a standard z-score measuring how far the most recent value sits from the historical mean in units of standard deviation.
bias_df <- left_join(stats_df, max_year,
by = c("indicatorid", "geographyid", "geolocid")) %>%
mutate(bias = (data_value - mean_value) / sd_value)
head(bias_df)
## # A tibble: 6 × 9
## indicatorid geographyid geolocid mean_value sd_value n datayear data_value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18027 32518. 792. 14 2024 33658
## 2 1000049 2 18037 42510. 599. 14 2024 43614
## 3 1000049 2 18051 33446. 261. 14 2024 33000
## 4 1000049 2 18083 37530. 872. 14 2024 36007
## 5 1000049 2 18101 10171. 171. 14 2024 9830
## 6 1000049 2 18123 19272. 129. 14 2024 19270
## # ℹ 1 more variable: bias <dbl>
Join with indicators_raw to bring in the human-readable
indicator name (display_label) and the CPRO visualization
flag (has_cpro_viz).
bias_df <- merge(indicators_raw, bias_df, by = "indicatorid")
head(bias_df)
## indicatorid source_name source_abbrev
## 1 1000049 American Community Survey 5-year Averages ACS5
## 2 1000049 American Community Survey 5-year Averages ACS5
## 3 1000049 American Community Survey 5-year Averages ACS5
## 4 1000049 American Community Survey 5-year Averages ACS5
## 5 1000049 American Community Survey 5-year Averages ACS5
## 6 1000049 American Community Survey 5-year Averages ACS5
## display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People Yes 55 1405 17762.929
## 2 Total Population People Yes 55 7445 8953.357
## 3 Total Population People Yes 342 21780 268524.455
## 4 Total Population People Yes 651 1813 135289.615
## 5 Total Population People Yes 55 6445 12551.214
## 6 Total Population People Yes 55 2765 12460.143
## sd_value n datayear data_value bias
## 1 184.3181 14 2024 18242 2.5991562
## 2 152.8522 14 2024 8624 -2.1547430
## 3 1106.7739 11 2024 270494 1.7795373
## 4 315.8279 13 2023 135154 -0.4293965
## 5 265.5213 14 2024 12154 -1.4959789
## 6 266.2523 14 2024 12939 1.7985090
Keep only records where |bias| >= 2.0 and
bias is not NA. This is the standard two-sigma
threshold for flagging potential outliers.
bias_df <- bias_df %>%
filter(!is.na(bias), bias <= -2.0 | bias >= 2.0)
head(bias_df)
## indicatorid source_name source_abbrev
## 1 1000049 American Community Survey 5-year Averages ACS5
## 2 1000049 American Community Survey 5-year Averages ACS5
## 3 1000049 American Community Survey 5-year Averages ACS5
## 4 1000049 American Community Survey 5-year Averages ACS5
## 5 1000049 American Community Survey 5-year Averages ACS5
## 6 1000049 American Community Survey 5-year Averages ACS5
## display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People Yes 55 1405 17762.929
## 2 Total Population People Yes 55 7445 8953.357
## 3 Total Population People Yes 55 8130 61496.429
## 4 Total Population People Yes 55 2120 20041.500
## 5 Total Population People Yes 55 251 45848.571
## 6 Total Population People Yes 2 18173 61496.429
## sd_value n datayear data_value bias
## 1 184.3181 14 2024 18242 2.599156
## 2 152.8522 14 2024 8624 -2.154743
## 3 1853.2521 14 2024 65261 2.031333
## 4 433.2373 14 2024 21221 2.722526
## 5 632.1534 14 2024 44280 -2.481314
## 6 1853.2521 14 2024 65261 2.031333
Attach the full geography name (geography_display_label)
from the GEOGRAPHIES table.
geo_label_df <- left_join(bias_df, geos_raw,
by = c("geographyid", "geolocid"))
head(geo_label_df)
## indicatorid source_name source_abbrev
## 1 1000049 American Community Survey 5-year Averages ACS5
## 2 1000049 American Community Survey 5-year Averages ACS5
## 3 1000049 American Community Survey 5-year Averages ACS5
## 4 1000049 American Community Survey 5-year Averages ACS5
## 5 1000049 American Community Survey 5-year Averages ACS5
## 6 1000049 American Community Survey 5-year Averages ACS5
## display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People Yes 55 1405 17762.929
## 2 Total Population People Yes 55 7445 8953.357
## 3 Total Population People Yes 55 8130 61496.429
## 4 Total Population People Yes 55 2120 20041.500
## 5 Total Population People Yes 55 251 45848.571
## 6 Total Population People Yes 2 18173 61496.429
## sd_value n datayear data_value bias
## 1 184.3181 14 2024 18242 2.599156
## 2 152.8522 14 2024 8624 -2.154743
## 3 1853.2521 14 2024 65261 2.031333
## 4 433.2373 14 2024 21221 2.722526
## 5 632.1534 14 2024 44280 -2.481314
## 6 1853.2521 14 2024 65261 2.031333
## geography_display_label
## 1 Washington Community School Corp
## 2 South Spencer County School Corp
## 3 Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5 Henderson County SD
## 6 Warrick
Retain only the geography types of interest (county =
geographyid == 2, school corporation =
geographyid == 55). Update this vector as needed for other
analyses.
geo_label_df <- geo_label_df %>%
filter(geographyid %in% c(2, 55))
head(geo_label_df)
## indicatorid source_name source_abbrev
## 1 1000049 American Community Survey 5-year Averages ACS5
## 2 1000049 American Community Survey 5-year Averages ACS5
## 3 1000049 American Community Survey 5-year Averages ACS5
## 4 1000049 American Community Survey 5-year Averages ACS5
## 5 1000049 American Community Survey 5-year Averages ACS5
## 6 1000049 American Community Survey 5-year Averages ACS5
## display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People Yes 55 1405 17762.929
## 2 Total Population People Yes 55 7445 8953.357
## 3 Total Population People Yes 55 8130 61496.429
## 4 Total Population People Yes 55 2120 20041.500
## 5 Total Population People Yes 55 251 45848.571
## 6 Total Population People Yes 2 18173 61496.429
## sd_value n datayear data_value bias
## 1 184.3181 14 2024 18242 2.599156
## 2 152.8522 14 2024 8624 -2.154743
## 3 1853.2521 14 2024 65261 2.031333
## 4 433.2373 14 2024 21221 2.722526
## 5 632.1534 14 2024 44280 -2.481314
## 6 1853.2521 14 2024 65261 2.031333
## geography_display_label
## 1 Washington Community School Corp
## 2 South Spencer County School Corp
## 3 Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5 Henderson County SD
## 6 Warrick
Exclude groups with fewer than 3 historical data points, as the standard deviation is unreliable for very small samples.
geo_label_df <- geo_label_df %>%
filter(!is.na(n), n >= 3)
head(geo_label_df)
## indicatorid source_name source_abbrev
## 1 1000049 American Community Survey 5-year Averages ACS5
## 2 1000049 American Community Survey 5-year Averages ACS5
## 3 1000049 American Community Survey 5-year Averages ACS5
## 4 1000049 American Community Survey 5-year Averages ACS5
## 5 1000049 American Community Survey 5-year Averages ACS5
## 6 1000049 American Community Survey 5-year Averages ACS5
## display_label has_cpro_viz geographyid geolocid mean_value
## 1 Total Population People Yes 55 1405 17762.929
## 2 Total Population People Yes 55 7445 8953.357
## 3 Total Population People Yes 55 8130 61496.429
## 4 Total Population People Yes 55 2120 20041.500
## 5 Total Population People Yes 55 251 45848.571
## 6 Total Population People Yes 2 18173 61496.429
## sd_value n datayear data_value bias
## 1 184.3181 14 2024 18242 2.599156
## 2 152.8522 14 2024 8624 -2.154743
## 3 1853.2521 14 2024 65261 2.031333
## 4 433.2373 14 2024 21221 2.722526
## 5 632.1534 14 2024 44280 -2.481314
## 6 1853.2521 14 2024 65261 2.031333
## geography_display_label
## 1 Washington Community School Corp
## 2 South Spencer County School Corp
## 3 Warrick County School Corp
## 4 Greater Jasper Consolidated Schools
## 5 Henderson County SD
## 6 Warrick
Join the GEOGRAPHYTYPES table to retrieve the
geography-type abbreviation (abbrev). Drop the full
geography_name column after joining since
abbrev alone is sufficient for grouping.
geo_label_df <- merge(
geo_label_df,
geotypes_raw,
by = "geographyid",
all.x = TRUE
) %>%
select(-geography_name)
head(geo_label_df)
## geographyid indicatorid source_name
## 1 2 1000275 American Community Survey 5-year Averages
## 2 2 1000275 American Community Survey 5-year Averages
## 3 2 1000174 American Community Survey 5-year Averages
## 4 2 1000208 American Community Survey 5-year Averages
## 5 2 1001412 American Community Survey 5-year Averages
## 6 2 1001221 Indiana Business Research Center
## source_abbrev
## 1 ACS5
## 2 ACS5
## 3 ACS5
## 4 ACS5
## 5 ACS5
## 6 IBRC
## display_label
## 1 Population 25-64 without a High School Diploma in the Labor Force
## 2 Population 25-64 without a High School Diploma in the Labor Force
## 3 Male Population Age 60 to 64 People
## 4 Unemployed Hispanic Population as % of Hispanic Civ Lab Force
## 5 Minority Population - Bachelor's Degree or Higher
## 6 Population Projection - Male Population 0-19 Years
## has_cpro_viz geolocid mean_value sd_value n datayear data_value
## 1 No 18101 391.5000000 62.464451 14 2024 210.000000
## 2 No 18027 2314.5714286 186.391942 14 2024 2698.000000
## 3 Yes 18051 1031.7857143 106.453441 14 2024 1254.000000
## 4 Yes 18173 0.9133065 1.991990 14 2024 7.540603
## 5 No 18101 16.3571429 9.434855 14 2024 43.000000
## 6 No 18027 5561.8333333 50.834699 6 2050 5728.000000
## bias geography_display_label abbrev
## 1 -2.905653 Martin CNTY
## 2 2.057109 Daviess CNTY
## 3 2.087432 Gibson CNTY
## 4 3.326973 Warrick CNTY
## 5 2.823876 Martin CNTY
## 6 3.268765 Daviess CNTY
Create a trend variable classifying each outlier as
High (bias > 2) or Low (bias <
-2).
geo_label_df <- geo_label_df %>%
mutate(trend = case_when(
bias > 2 ~ "High",
bias < -2 ~ "Low",
TRUE ~ "Normal"
))
head(geo_label_df)
## geographyid indicatorid source_name
## 1 2 1000275 American Community Survey 5-year Averages
## 2 2 1000275 American Community Survey 5-year Averages
## 3 2 1000174 American Community Survey 5-year Averages
## 4 2 1000208 American Community Survey 5-year Averages
## 5 2 1001412 American Community Survey 5-year Averages
## 6 2 1001221 Indiana Business Research Center
## source_abbrev
## 1 ACS5
## 2 ACS5
## 3 ACS5
## 4 ACS5
## 5 ACS5
## 6 IBRC
## display_label
## 1 Population 25-64 without a High School Diploma in the Labor Force
## 2 Population 25-64 without a High School Diploma in the Labor Force
## 3 Male Population Age 60 to 64 People
## 4 Unemployed Hispanic Population as % of Hispanic Civ Lab Force
## 5 Minority Population - Bachelor's Degree or Higher
## 6 Population Projection - Male Population 0-19 Years
## has_cpro_viz geolocid mean_value sd_value n datayear data_value
## 1 No 18101 391.5000000 62.464451 14 2024 210.000000
## 2 No 18027 2314.5714286 186.391942 14 2024 2698.000000
## 3 Yes 18051 1031.7857143 106.453441 14 2024 1254.000000
## 4 Yes 18173 0.9133065 1.991990 14 2024 7.540603
## 5 No 18101 16.3571429 9.434855 14 2024 43.000000
## 6 No 18027 5561.8333333 50.834699 6 2050 5728.000000
## bias geography_display_label abbrev trend
## 1 -2.905653 Martin CNTY Low
## 2 2.057109 Daviess CNTY High
## 3 2.087432 Gibson CNTY High
## 4 3.326973 Warrick CNTY High
## 5 2.823876 Martin CNTY High
## 6 3.268765 Daviess CNTY High
Use a tighter threshold of 3 standard deviations to surface the most extreme outliers. Build a summary count of outlier observations per indicator-geography group.
outlier_threshold <- 3
geography_counts <- geo_label_df %>%
group_by(geographyid, display_label, indicatorid) %>%
filter(bias > outlier_threshold) %>%
summarise(outlier_count = n(), .groups = "drop")
# Re-join full detail columns
geography_counts <- merge(
geography_counts,
geo_label_df %>%
select(geographyid, geolocid, indicatorid, display_label,
mean_value, sd_value, n, data_value,
geography_display_label, bias, trend) %>%
distinct(),
by = c("geographyid", "indicatorid"),
all.x = TRUE,
suffixes = c("", "_label")
)
head(geography_counts)
## geographyid indicatorid
## 1 2 1000105
## 2 2 1000106
## 3 2 1000106
## 4 2 1000106
## 5 2 1000106
## 6 2 1000119
## display_label
## 1 Population Age 65 and Over Living in Poverty as % of Pop 65 and Up
## 2 Population Age 65 and Over Living in Poverty People
## 3 Population Age 65 and Over Living in Poverty People
## 4 Population Age 65 and Over Living in Poverty People
## 5 Population Age 65 and Over Living in Poverty People
## 6 Hispanic Population People
## outlier_count geolocid
## 1 1 18101
## 2 1 18163
## 3 1 18101
## 4 1 18147
## 5 1 18083
## 6 1 18027
## display_label_label
## 1 Population Age 65 and Over Living in Poverty as % of Pop 65 and Up
## 2 Population Age 65 and Over Living in Poverty People
## 3 Population Age 65 and Over Living in Poverty People
## 4 Population Age 65 and Over Living in Poverty People
## 5 Population Age 65 and Over Living in Poverty People
## 6 Hispanic Population People
## mean_value sd_value n data_value geography_display_label bias trend
## 1 8.824683 0.6187757 14 11.49706 Martin 4.318821 High
## 2 2269.357143 367.5012521 14 3080.00000 Vndrbrgh 2.205823 High
## 3 156.285714 18.3153092 14 235.00000 Martin 4.297732 High
## 4 308.285714 46.7734780 14 435.00000 Spencer 2.709105 High
## 5 523.714286 80.8925485 14 719.00000 Knox 2.414137 High
## 6 1573.071429 263.7293089 14 2193.00000 Daviess 2.350624 High
Keep only rows where |bias| > 3 (excluding infinite
values), then remove indicators known to produce systematic data
artifacts.
filtered_geography <- geography_counts %>%
filter((bias > 3 | bias < -3) & is.finite(bias)) %>%
arrange(desc(bias))
print("Geography records with |Bias| > 3 (excluding infinite values):")
## [1] "Geography records with |Bias| > 3 (excluding infinite values):"
head(filtered_geography)
## geographyid indicatorid
## 1 2 1000284
## 2 55 1005053
## 3 55 1000925
## 4 55 1005059
## 5 55 1000539
## 6 55 1000518
## display_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 People over the age of 5 years speaking primarily other Indo-European languages at home
## 3 % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5 Asian Population 25+ With High School Diploma Only People
## 6 Asian Population 25+ With Bachelor Degree or Higher People
## outlier_count geolocid
## 1 1 18129
## 2 2 2110
## 3 3 7445
## 4 1 2110
## 5 3 4325
## 6 3 2110
## display_label_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 People over the age of 5 years speaking primarily other Indo-European languages at home
## 3 % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5 Asian Population 25+ With High School Diploma Only People
## 6 Asian Population 25+ With Bachelor Degree or Higher People
## mean_value sd_value n data_value geography_display_label
## 1 3.753575e+04 1.509747e+03 4 1.788440e+05 Posey
## 2 1.875000e+00 1.125992e+00 8 9.500000e+01 Southwest Dubois County School Corp
## 3 8.456176e-02 3.556189e-03 5 3.734827e-01 South Spencer County School Corp
## 4 2.060024e-02 1.261647e-02 8 1.016913e+00 Southwest Dubois County School Corp
## 5 2.142857e+00 1.657484e+00 14 1.190000e+02 South Knox School Corp
## 6 1.214286e+00 1.251373e+00 14 5.900000e+01 Southwest Dubois County School Corp
## bias trend
## 1 93.59733 High
## 2 82.70488 High
## 3 81.24455 High
## 4 78.96921 High
## 5 70.50273 High
## 6 46.17785 High
# Indicator IDs flagged for removal due to known data quality issues
ids_to_remove <- unique(c(
1003860, 1003401, 1000931, 1000204, 1003672,
1003996, 1003407, 1012339, 1000260, 1003949,
1004430, 1003952, 1003860, 1011638, 1000561,
1000931, 1002004, 100436, 1003954, 1011628,
1003947, 1004436
))
filtered_geography <- filtered_geography %>%
filter(!indicatorid %in% ids_to_remove)
# Remove a known problematic geography location
filtered_geography <- filtered_geography %>%
filter(geolocid != 365)
head(filtered_geography)
## geographyid indicatorid
## 1 2 1000284
## 2 55 1005053
## 3 55 1000925
## 4 55 1005059
## 5 55 1000539
## 6 55 1000518
## display_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 People over the age of 5 years speaking primarily other Indo-European languages at home
## 3 % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5 Asian Population 25+ With High School Diploma Only People
## 6 Asian Population 25+ With Bachelor Degree or Higher People
## outlier_count geolocid
## 1 1 18129
## 2 2 2110
## 3 3 7445
## 4 1 2110
## 5 3 4325
## 6 3 2110
## display_label_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 People over the age of 5 years speaking primarily other Indo-European languages at home
## 3 % Enrollment - Asian Students - Total Public Schools
## 4 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 5 Asian Population 25+ With High School Diploma Only People
## 6 Asian Population 25+ With Bachelor Degree or Higher People
## mean_value sd_value n data_value geography_display_label
## 1 3.753575e+04 1.509747e+03 4 1.788440e+05 Posey
## 2 1.875000e+00 1.125992e+00 8 9.500000e+01 Southwest Dubois County School Corp
## 3 8.456176e-02 3.556189e-03 5 3.734827e-01 South Spencer County School Corp
## 4 2.060024e-02 1.261647e-02 8 1.016913e+00 Southwest Dubois County School Corp
## 5 2.142857e+00 1.657484e+00 14 1.190000e+02 South Knox School Corp
## 6 1.214286e+00 1.251373e+00 14 5.900000e+01 Southwest Dubois County School Corp
## bias trend
## 1 93.59733 High
## 2 82.70488 High
## 3 81.24455 High
## 4 78.96921 High
## 5 70.50273 High
## 6 46.17785 High
Separate the filtered outlier set into counties and school corporations for independent analysis.
# Counties (geographyid == 2)
geography_counts_county <- filtered_geography %>%
filter(geographyid == 2)
# School Corporations (geographyid == 55)
geography_counts_sc <- filtered_geography %>%
filter(geographyid == 55)
head(geography_counts_county)
## geographyid indicatorid
## 1 2 1000284
## 2 2 1004212
## 3 2 1004295
## 4 2 1004633
## 5 2 1003708
## 6 2 1000901
## display_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 RSEI Aggregated Risk Score for Chemical Air Releases - NonCancer Score
## 3 RSEI NonCancer Score Per Capita
## 4 African American Population Age 20 to 24 People
## 5 African American Home Improvement Loan Apps as % of All Home Improvement Loan Apps
## 6 Enrollment - African American Students - Total Public Schools
## outlier_count geolocid
## 1 1 18129
## 2 2 18101
## 3 2 18101
## 4 1 18101
## 5 1 18083
## 6 5 18027
## display_label_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 RSEI Aggregated Risk Score for Chemical Air Releases - NonCancer Score
## 3 RSEI NonCancer Score Per Capita
## 4 African American Population Age 20 to 24 People
## 5 African American Home Improvement Loan Apps as % of All Home Improvement Loan Apps
## 6 Enrollment - African American Students - Total Public Schools
## mean_value sd_value n data_value geography_display_label bias
## 1 3.753575e+04 1.509747e+03 4 1.788440e+05 Posey 93.59733
## 2 1.156423e+04 1.334012e+04 3 5.943543e+05 Martin 43.68702
## 3 1.150613e+00 1.328687e+00 3 5.875783e+01 Martin 43.35649
## 4 2.857143e-01 4.688072e-01 14 1.700000e+01 Martin 35.65279
## 5 7.725116e-01 9.092810e-02 3 2.678571e+00 Knox 20.96227
## 6 1.114000e+02 4.560702e+00 5 1.850000e+02 Daviess 16.13787
## trend
## 1 High
## 2 High
## 3 High
## 4 High
## 5 High
## 6 High
head(geography_counts_sc)
## geographyid indicatorid
## 1 55 1005053
## 2 55 1000925
## 3 55 1005059
## 4 55 1000539
## 5 55 1000518
## 6 55 1004758
## display_label
## 1 People over the age of 5 years speaking primarily other Indo-European languages at home
## 2 % Enrollment - Asian Students - Total Public Schools
## 3 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 4 Asian Population 25+ With High School Diploma Only People
## 5 Asian Population 25+ With Bachelor Degree or Higher People
## 6 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## outlier_count geolocid
## 1 2 2110
## 2 3 7445
## 3 1 2110
## 4 3 4325
## 5 3 2110
## 6 1 4325
## display_label_label
## 1 People over the age of 5 years speaking primarily other Indo-European languages at home
## 2 % Enrollment - Asian Students - Total Public Schools
## 3 People over the age of 5 years speaking primarily other Indo-European languages at home as a % of people over the age of 5
## 4 Asian Population 25+ With High School Diploma Only People
## 5 Asian Population 25+ With Bachelor Degree or Higher People
## 6 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## mean_value sd_value n data_value geography_display_label
## 1 1.87500000 1.125991626 8 95.0000000 Southwest Dubois County School Corp
## 2 0.08456176 0.003556189 5 0.3734827 South Spencer County School Corp
## 3 0.02060024 0.012616469 8 1.0169129 Southwest Dubois County School Corp
## 4 2.14285714 1.657483860 14 119.0000000 South Knox School Corp
## 5 1.21428571 1.251372872 14 59.0000000 Southwest Dubois County School Corp
## 6 0.12464582 0.107894348 8 3.4399852 South Knox School Corp
## bias trend
## 1 82.70488 High
## 2 81.24455 High
## 3 78.96921 High
## 4 70.50273 High
## 5 46.17785 High
## 6 30.72765 High
Compute what share of counties (out of 12) and school corporations (out of 25) are flagged as outliers for each indicator.
# Counties: denominator = 12 counties in the study area
goal_2_county <- geography_counts_county %>%
group_by(indicatorid, display_label, outlier_count, trend) %>%
summarise(.groups = "drop") %>%
mutate(percent_of_counties = (outlier_count / 12) * 100)
# School Corporations: denominator = 25 school corporations
goal_2_sc <- geography_counts_sc %>%
group_by(indicatorid, display_label, outlier_count, trend) %>%
summarise(.groups = "drop") %>%
mutate(percent_of_school_corps = (outlier_count / 25) * 100)
head(goal_2_county)
## # A tibble: 6 × 5
## indicatorid display_label outlier_count trend percent_of_counties
## <dbl> <chr> <int> <chr> <dbl>
## 1 1000105 Population Age 65 and Ove… 1 High 8.33
## 2 1000106 Population Age 65 and Ove… 1 High 8.33
## 3 1000119 Hispanic Population People 1 High 8.33
## 4 1000120 Caucasian Population Peop… 3 High 25
## 5 1000120 Caucasian Population Peop… 3 Low 25
## 6 1000127 Median Age - Hispanic Yea… 1 High 8.33
head(goal_2_sc)
## # A tibble: 6 × 5
## indicatorid display_label outlier_count trend percent_of_school_co…¹
## <dbl> <chr> <int> <chr> <dbl>
## 1 1000084 Hispanic Population 25… 1 High 4
## 2 1000085 Hispanic Population 25… 1 High 4
## 3 1000101 Population Age 18 to 6… 1 High 4
## 4 1000103 Population Age 5 and U… 1 High 4
## 5 1000105 Population Age 65 and … 1 High 4
## 6 1000106 Population Age 65 and … 1 High 4
## # ℹ abbreviated name: ¹percent_of_school_corps
For each geography type, rank individual geographies by how many high-outlier indicators they appear in. This surfaces locations that are broadly trending upward across multiple indicators.
# Counties -- High outliers per geography
geo_2_high <- filtered_geography %>%
filter(geographyid == 2, trend == "High") %>%
group_by(geography_display_label) %>%
summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
arrange(desc(total_outliers))
print("Counties with the most High outlier indicators:")
## [1] "Counties with the most High outlier indicators:"
head(geo_2_high)
## # A tibble: 6 × 2
## geography_display_label total_outliers
## <chr> <int>
## 1 Gibson 166
## 2 Daviess 160
## 3 Vndrbrgh 154
## 4 Dubois 150
## 5 Spencer 142
## 6 Warrick 141
# School Corporations -- High outliers per geography
geo_55_high <- filtered_geography %>%
filter(geographyid == 55, trend == "High") %>%
group_by(geography_display_label) %>%
summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
arrange(desc(total_outliers))
print("School Corporations with the most High outlier indicators:")
## [1] "School Corporations with the most High outlier indicators:"
head(geo_55_high)
## # A tibble: 6 × 2
## geography_display_label total_outliers
## <chr> <int>
## 1 Greater Jasper Consolidated Schools 298
## 2 Washington Community School Corp 202
## 3 North Gibson School Corp 157
## 4 Evansville-Vanderburgh School Corp 156
## 5 Barr-Reeve Community School Corp 155
## 6 North Knox School Corp 154
Same as above but for geographies trending significantly below the historical mean.
# Counties -- Low outliers per geography
geo_2_low <- filtered_geography %>%
filter(geographyid == 2, trend == "Low") %>%
group_by(geography_display_label) %>%
summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
arrange(desc(total_outliers))
print("Counties with the most Low outlier indicators:")
## [1] "Counties with the most Low outlier indicators:"
head(geo_2_low)
## # A tibble: 6 × 2
## geography_display_label total_outliers
## <chr> <int>
## 1 <NA> 69
## 2 Martin 8
## 3 Daviess 7
## 4 Dubois 6
## 5 Knox 5
## 6 Spencer 5
# School Corporations -- Low outliers per geography
geo_55_low <- filtered_geography %>%
filter(geographyid == 55, trend == "Low") %>%
group_by(geography_display_label) %>%
summarise(total_outliers = sum(outlier_count), .groups = "drop") %>%
arrange(desc(total_outliers))
print("School Corporations with the most Low outlier indicators:")
## [1] "School Corporations with the most Low outlier indicators:"
head(geo_55_low)
## # A tibble: 6 × 2
## geography_display_label total_outliers
## <chr> <int>
## 1 Shoals Community School Corp 26
## 2 Evansville-Vanderburgh School Corp 19
## 3 North Spencer County School Corp 12
## 4 Southwest Dubois County School Corp 12
## 5 Vincennes Community School Corp 12
## 6 Mount Vernon MSD 10
Subset the final outlier table to only indicators where
has_cpro_viz == "Yes". Because has_cpro_viz
lives in indicators_raw and is not automatically carried
through all prior joins, we re-join it here before filtering.
cpro_outliers <- filtered_geography %>%
left_join(
indicators_raw %>% select(indicatorid, has_cpro_viz),
by = "indicatorid"
) %>%
filter(tolower(has_cpro_viz) == "yes")
print(paste("Total CPRO-eligible outlier records:", nrow(cpro_outliers)))
## [1] "Total CPRO-eligible outlier records: 954"
head(cpro_outliers)
## geographyid indicatorid
## 1 2 1000284
## 2 2 1004295
## 3 2 1004633
## 4 55 1004758
## 5 55 1004751
## 6 55 1004670
## display_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 RSEI NonCancer Score Per Capita
## 3 African American Population Age 20 to 24 People
## 4 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## 5 People over the age of 5 years speaking primarily Korean at home
## 6 Hispanic Population Age 25 to 34 People
## outlier_count geolocid
## 1 1 18129
## 2 2 18101
## 3 1 18101
## 4 1 4325
## 5 1 4325
## 6 4 5520
## display_label_label
## 1 Median Household Income - Hispanic Householders Dollars
## 2 RSEI NonCancer Score Per Capita
## 3 African American Population Age 20 to 24 People
## 4 People over the age of 5 years speaking primarily Korean at home as a % of people over the age of 5
## 5 People over the age of 5 years speaking primarily Korean at home
## 6 Hispanic Population Age 25 to 34 People
## mean_value sd_value n data_value geography_display_label
## 1 3.753575e+04 1509.7466399 4 1.788440e+05 Posey
## 2 1.150613e+00 1.3286872 3 5.875783e+01 Martin
## 3 2.857143e-01 0.4688072 14 1.700000e+01 Martin
## 4 1.246458e-01 0.1078943 8 3.439985e+00 South Knox School Corp
## 5 6.875000e+00 5.9145704 8 1.860000e+02 South Knox School Corp
## 6 4.285714e-01 0.7559289 14 1.900000e+01 Shoals Community School Corp
## bias trend has_cpro_viz
## 1 93.59733 High Yes
## 2 43.35649 High Yes
## 3 35.65279 High Yes
## 4 30.72765 High Yes
## 5 30.28538 High Yes
## 6 24.56769 High Yes
Use the source_abbrev field from INDICATORS
to summarize how many outliers originate from each data source. This
helps prioritize follow-up data quality review by source.
source_summary <- filtered_geography %>%
left_join(indicators_raw %>% select(indicatorid, source_name, source_abbrev),
by = "indicatorid") %>%
group_by(source_abbrev, source_name, trend) %>%
summarise(total_outlier_records = n(), .groups = "drop") %>%
arrange(desc(total_outlier_records))
print("Outlier records by data source and trend direction:")
## [1] "Outlier records by data source and trend direction:"
head(source_summary, 20)
## # A tibble: 20 × 4
## source_abbrev source_name trend total_outlier_records
## <chr> <chr> <chr> <int>
## 1 ACS5 American Community Survey 5-year A… High 1424
## 2 IDOE Indiana Department of Education High 110
## 3 ACS5 American Community Survey 5-year A… Low 87
## 4 INKYDE Indiana Department of Education/Ke… High 63
## 5 IDOH Indiana Department of Health High 36
## 6 INSC Indiana Supreme Court High 35
## 7 IDOE Indiana Department of Education Low 32
## 8 HMDA Federal Financial Institutions Exa… High 29
## 9 INKYDE Indiana Department of Education/Ke… Low 24
## 10 CDCPLC CDC PLACES High 15
## 11 ACAG Atmospheric Composition Analysis G… High 12
## 12 BLSQCE Bureau of Labor Statistics - Quart… High 10
## 13 CMS Center for Medicaid and Medicare S… High 8
## 14 RSEI Environmental Protection Agency Ri… High 8
## 15 LEHD US Census Bureau - Longitudinal Em… High 5
## 16 IDOH Indiana Department of Health Low 4
## 17 CMS Center for Medicaid and Medicare S… Low 3
## 18 CDC Centers for Disease Control and Pr… High 2
## 19 INSC Indiana Supreme Court Low 2
## 20 KDOE Kentucky Department of Education High 2
End of SAVI Outlier Analysis Pipeline