library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   4.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
library(readxl)
# Load files from Documents
Outlier_Analysis <-  read_csv("/Users/woodsprocise/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 743370 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): INDICATORID, DATAYEAR, GEOGRAPHYID, GEOLOCID, DATA_VALUE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Outlier_Analysis_Geographies <-read_csv("~/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Geographies.csv")
## Rows: 4711 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GEOGRAPHY_DISPLAY_LABEL
## dbl (2): GEOGRAPHYID, GEOLOCID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Outliers_CPRO_Viz <-read_excel("/Users/woodsprocise/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Indicators.xlsx")
Geography_Type_Name <-read_csv("~/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Geography_Types.csv")
## Rows: 26 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ABBREV, GEOGRAPHY_NAME
## dbl (1): ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Filter Max_year for each group

max_year_by_group <- Outlier_Analysis %>%
  group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
  summarise(DATAYEAR = max(DATAYEAR))
## `summarise()` has grouped output by 'INDICATORID', 'GEOGRAPHYID'. You can
## override using the `.groups` argument.
head(max_year_by_group)
## # A tibble: 6 × 4
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID DATAYEAR
##         <dbl>       <dbl>    <dbl>    <dbl>
## 1     1000049           2    18001     2020
## 2     1000049           2    18003     2020
## 3     1000049           2    18005     2020
## 4     1000049           2    18007     2020
## 5     1000049           2    18009     2020
## 6     1000049           2    18011     2022
# Duplicate relevant columns for later use
max_year_by_group$maxdata_INDICATORID <- max_year_by_group$INDICATORID

max_year_by_group$maxdata_year <- max_year_by_group$DATAYEAR

max_year_by_group$maxdata_GEOGRAPHYID <- max_year_by_group$GEOGRAPHYID

max_year_by_group$maxdata_GEOLOCID <- max_year_by_group$GEOLOCID

head(max_year_by_group)
## # A tibble: 6 × 8
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID DATAYEAR maxdata_INDICATORID maxdata_year
##         <dbl>       <dbl>    <dbl>    <dbl>               <dbl>        <dbl>
## 1     1000049           2    18001     2020             1000049         2020
## 2     1000049           2    18003     2020             1000049         2020
## 3     1000049           2    18005     2020             1000049         2020
## 4     1000049           2    18007     2020             1000049         2020
## 5     1000049           2    18009     2020             1000049         2020
## 6     1000049           2    18011     2022             1000049         2022
## # ℹ 2 more variables: maxdata_GEOGRAPHYID <dbl>, maxdata_GEOLOCID <dbl>
# Perform an outer join between max_year_by_group and Outlier_Analysis
Outlier_Analysis_Data <- full_join(Outlier_Analysis, max_year_by_group, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID', 'DATAYEAR'))

head(Outlier_Analysis_Data)
## # A tibble: 6 × 9
##   INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE maxdata_INDICATORID
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>               <dbl>
## 1     1000122     2000           2    18095      31776                  NA
## 2     1000122     2000           2    18081      31333                  NA
## 3     1000122     2006           2    18095      29994                  NA
## 4     1000122     2006           2    18081      33691                  NA
## 5     1000122     2007           2    18081      34509                  NA
## 6     1000122     2007           2    18095      29947                  NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_GEOGRAPHYID <dbl>,
## #   maxdata_GEOLOCID <dbl>
# Select rows where DATAYEAR is not null (years other than Max_year)
Max_Year <- Outlier_Analysis_Data[complete.cases(Outlier_Analysis_Data),]

Max_Year <- subset(Max_Year, select=-c(maxdata_INDICATORID, maxdata_year, maxdata_GEOLOCID, maxdata_GEOGRAPHYID))
head(Max_Year)
## # A tibble: 6 × 5
##   INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1000122     2010           2    18047       6148
## 2     1000122     2010           2    18131       3389
## 3     1000122     2010           2    18147       5064
## 4     1000122     2010           2    18151       7985
## 5     1000122     2010           2    18003      95462
## 6     1000122     2010           2    18183       8216
# Convert all columns to numeric
Max_Year[] <- lapply(Max_Year, as.numeric)
# Create a Data Frame (DF)  not containing rows with missing values

DF <- Outlier_Analysis_Data[rowSums(is.na(Outlier_Analysis_Data)) > 0,]
head(DF)
## # A tibble: 6 × 9
##   INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE maxdata_INDICATORID
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>               <dbl>
## 1     1000122     2000           2    18095      31776                  NA
## 2     1000122     2000           2    18081      31333                  NA
## 3     1000122     2006           2    18095      29994                  NA
## 4     1000122     2006           2    18081      33691                  NA
## 5     1000122     2007           2    18081      34509                  NA
## 6     1000122     2007           2    18095      29947                  NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_GEOGRAPHYID <dbl>,
## #   maxdata_GEOLOCID <dbl>
new_dfs <- subset(DF, select=-c(maxdata_INDICATORID, maxdata_year, maxdata_GEOLOCID, maxdata_GEOGRAPHYID))
head(new_dfs)
## # A tibble: 6 × 5
##   INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1000122     2000           2    18095      31776
## 2     1000122     2000           2    18081      31333
## 3     1000122     2006           2    18095      29994
## 4     1000122     2006           2    18081      33691
## 5     1000122     2007           2    18081      34509
## 6     1000122     2007           2    18095      29947
# Count occurrences for each group

COUNT <- new_dfs %>%
  group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
  count()

head(COUNT)
## # A tibble: 6 × 4
## # Groups:   INDICATORID, GEOGRAPHYID, GEOLOCID [6]
##   INDICATORID GEOGRAPHYID GEOLOCID     n
##         <dbl>       <dbl>    <dbl> <int>
## 1     1000049           2    18001     2
## 2     1000049           2    18003     2
## 3     1000049           2    18005     2
## 4     1000049           2    18007     2
## 5     1000049           2    18009     2
## 6     1000049           2    18011    16
# Convert all columns to numeric
COUNT[] <- lapply(COUNT, as.numeric)

# Convert all columns to numeric
new_dfs[] <- lapply(new_dfs, as.numeric)
# Filter out non-numeric values
new_dfs_numeric <- new_dfs %>%
  filter(is.numeric(DATA_VALUE))

head(new_dfs_numeric)
## # A tibble: 6 × 5
##   INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
##         <dbl>    <dbl>       <dbl>    <dbl>      <dbl>
## 1     1000122     2000           2    18095      31776
## 2     1000122     2000           2    18081      31333
## 3     1000122     2006           2    18095      29994
## 4     1000122     2006           2    18081      33691
## 5     1000122     2007           2    18081      34509
## 6     1000122     2007           2    18095      29947
# Calculate mean and standard deviation for each group
MEAN_STD <- new_dfs_numeric %>%
  group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
  summarize(mean_value = mean(DATA_VALUE), sd_value = sd(DATA_VALUE))
## `summarise()` has grouped output by 'INDICATORID', 'GEOGRAPHYID'. You can
## override using the `.groups` argument.
# View the resulting dataframe
head(MEAN_STD)
## # A tibble: 6 × 5
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl>
## 1     1000049           2    18001     34769      858.
## 2     1000049           2    18003    361954.   15021.
## 3     1000049           2    18005     79168     4685.
## 4     1000049           2    18007      8748.     101.
## 5     1000049           2    18009     12486.     670.
## 6     1000049           2    18011     56719.   10022.
# Merge count, mean, and standard deviation into a single DataFrame

STATS_df <- left_join(MEAN_STD, COUNT, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID'), suffix = c("_left", "_right"))

head(STATS_df)
## # A tibble: 6 × 6
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value     n
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl> <dbl>
## 1     1000049           2    18001     34769      858.     2
## 2     1000049           2    18003    361954.   15021.     2
## 3     1000049           2    18005     79168     4685.     2
## 4     1000049           2    18007      8748.     101.     2
## 5     1000049           2    18009     12486.     670.     2
## 6     1000049           2    18011     56719.   10022.    16
# Calculate Bias and merge with additional information
BIAS <- left_join(STATS_df, Max_Year, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID'))
head(BIAS)
## # A tibble: 6 × 8
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value     n DATAYEAR DATA_VALUE
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl> <dbl>    <dbl>      <dbl>
## 1     1000049           2    18001     34769      858.     2     2020      35544
## 2     1000049           2    18003    361954.   15021.     2     2020     375520
## 3     1000049           2    18005     79168     4685.     2     2020      83280
## 4     1000049           2    18007      8748.     101.     2     2020       8695
## 5     1000049           2    18009     12486.     670.     2     2020      11926
## 6     1000049           2    18011     56719.   10022.    16     2022      71235
BIAS <- BIAS %>%
  mutate(Bias = (DATA_VALUE - mean_value) / sd_value)

head(BIAS)
## # A tibble: 6 × 9
## # Groups:   INDICATORID, GEOGRAPHYID [1]
##   INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value     n DATAYEAR DATA_VALUE
##         <dbl>       <dbl>    <dbl>      <dbl>    <dbl> <dbl>    <dbl>      <dbl>
## 1     1000049           2    18001     34769      858.     2     2020      35544
## 2     1000049           2    18003    361954.   15021.     2     2020     375520
## 3     1000049           2    18005     79168     4685.     2     2020      83280
## 4     1000049           2    18007      8748.     101.     2     2020       8695
## 5     1000049           2    18009     12486.     670.     2     2020      11926
## 6     1000049           2    18011     56719.   10022.    16     2022      71235
## # ℹ 1 more variable: Bias <dbl>
# Merge with DISPLAY_LABEL
BIAS <- merge(Outliers_CPRO_Viz, BIAS, by = "INDICATORID")


head(BIAS)
##   INDICATORID           DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID   GEOLOCID
## 1     1000049 Total Population People          Yes           3 1812905014
## 2     1000049 Total Population People          Yes           3 1812782484
## 3     1000049 Total Population People          Yes           3 1812965052
## 4     1000049 Total Population People          Yes           3 1812905428
## 5     1000049 Total Population People          Yes           3 1806308308
## 6     1000049 Total Population People          Yes           3 1812911584
##   mean_value sd_value  n DATAYEAR DATA_VALUE     Bias
## 1     404.00       NA  1     2020        245       NA
## 2   19265.00       NA  1     2020      20635       NA
## 3    2008.00       NA  1     2020       1743       NA
## 4    9591.00       NA  1     2020       9032       NA
## 5   12059.54  972.485 13     2022      14335 2.339842
## 6    1271.00       NA  1     2020       1479       NA
# Filter Bias values outside the range [-2.0, 2.0]

BIAS <- BIAS %>%
  filter(!is.na(Bias), Bias <= -2.0 | Bias >= 2.0)

head(BIAS)
##   INDICATORID           DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID   GEOLOCID
## 1     1000049 Total Population People          Yes           3 1806308308
## 2     1000049 Total Population People          Yes           3 1814500568
## 3     1000049 Total Population People          Yes           3 1814533070
## 4     1000049 Total Population People          Yes         160    1812628
## 5     1000049 Total Population People          Yes         160    1813492
## 6     1000049 Total Population People          Yes           3 1814578578
##   mean_value  sd_value  n DATAYEAR DATA_VALUE      Bias
## 1  12059.538 972.48501 13     2022      14335  2.339842
## 2  20554.538 272.84019 13     2022      21404  3.113403
## 3   1163.231 104.95726 13     2022       1450  2.732248
## 4   4899.333 127.88086 12     2022       5356  3.571032
## 5   1405.833  48.35444 12     2022       1624  4.511823
## 6   1492.000  73.58895 13     2022       1304 -2.554731
Outlier_Analysis_Geographies$GEOGRAPHYID <- as.numeric(Outlier_Analysis_Geographies$GEOGRAPHYID)
Outlier_Analysis_Geographies$GEOLOCID <- as.numeric(Outlier_Analysis_Geographies$GEOLOCID)


# Join with GEOGRAPHY_DISPLAY_LABEL
GEOGRAPHY_LABEL <- left_join(BIAS, Outlier_Analysis_Geographies, by = c('GEOGRAPHYID', 'GEOLOCID'))

head(GEOGRAPHY_LABEL)
##   INDICATORID           DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID   GEOLOCID
## 1     1000049 Total Population People          Yes           3 1806308308
## 2     1000049 Total Population People          Yes           3 1814500568
## 3     1000049 Total Population People          Yes           3 1814533070
## 4     1000049 Total Population People          Yes         160    1812628
## 5     1000049 Total Population People          Yes         160    1813492
## 6     1000049 Total Population People          Yes           3 1814578578
##   mean_value  sd_value  n DATAYEAR DATA_VALUE      Bias
## 1  12059.538 972.48501 13     2022      14335  2.339842
## 2  20554.538 272.84019 13     2022      21404  3.113403
## 3   1163.231 104.95726 13     2022       1450  2.732248
## 4   4899.333 127.88086 12     2022       5356  3.571032
## 5   1405.833  48.35444 12     2022       1624  4.511823
## 6   1492.000  73.58895 13     2022       1304 -2.554731
##                GEOGRAPHY_DISPLAY_LABEL
## 1  Brown township Hendricks County, IN
## 2   Addison township Shelby County, IN
## 3 Hendricks township Shelby County, IN
## 4                               Cicero
## 5                             Clermont
## 6 Van Buren township Shelby County, IN
GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
  filter(`GEOGRAPHYID` %in% c(2, 55))
head(GEOGRAPHY_LABEL)
##   INDICATORID
## 1     1000049
## 2     1000085
## 3     1000085
## 4     1000085
## 5     1000085
## 6     1000085
##                                                                      DISPLAY_LABEL
## 1                                                          Total Population People
## 2 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 3 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 4 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 5 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 6 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
##   Has_CPRO_Viz GEOGRAPHYID GEOLOCID   mean_value sd_value  n DATAYEAR
## 1          Yes           2    18113 47509.500000 4.949747  2     2020
## 2          Yes          55     5275    39.137402 5.495275 12     2022
## 3          Yes          55     5370    37.313334 5.139149 12     2022
## 4          Yes          55     5385    49.886587 2.880584 12     2022
## 5          Yes          55     7360     0.000000 0.000000  6     2022
## 6          Yes          55      630     5.958245 7.481864 12     2022
##    DATA_VALUE      Bias                         GEOGRAPHY_DISPLAY_LABEL
## 1 47640.00000 26.364981                                            <NA>
## 2    24.73552 -2.620776           Anderson Community School Corporation
## 3    24.87942 -2.419450                       M S D Washington Township
## 4    42.90828 -2.422532                     Indianapolis Public Schools
## 5    42.85714       Inf Southwestern Shelby County Consolidated Schools
## 6    21.71799  2.106393                    Zionsville Community Schools
# Filter out records where n >= 3
GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
  filter(!is.na(n), n >= 3.0)

head(GEOGRAPHY_LABEL)
##   INDICATORID
## 1     1000085
## 2     1000085
## 3     1000085
## 4     1000085
## 5     1000085
## 6     1000085
##                                                                      DISPLAY_LABEL
## 1 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 2 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 3 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 4 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 5 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 6 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
##   Has_CPRO_Viz GEOGRAPHYID GEOLOCID mean_value sd_value  n DATAYEAR DATA_VALUE
## 1          Yes          55     5275  39.137402 5.495275 12     2022   24.73552
## 2          Yes          55     5370  37.313334 5.139149 12     2022   24.87942
## 3          Yes          55     5385  49.886587 2.880584 12     2022   42.90828
## 4          Yes          55     7360   0.000000 0.000000  6     2022   42.85714
## 5          Yes          55      630   5.958245 7.481864 12     2022   21.71799
## 6          Yes          55      670  18.756400 7.883576 12     2022    0.00000
##        Bias                         GEOGRAPHY_DISPLAY_LABEL
## 1 -2.620776           Anderson Community School Corporation
## 2 -2.419450                       M S D Washington Township
## 3 -2.422532                     Indianapolis Public Schools
## 4       Inf Southwestern Shelby County Consolidated Schools
## 5  2.106393                    Zionsville Community Schools
## 6 -2.379174       County School Corporation Of Brown County
# Join Geography_Type_Name with GEOGRAPHY_LABEL

GEOGRAPHY_LABEL <- merge(GEOGRAPHY_LABEL, Geography_Type_Name, by.x = "GEOGRAPHYID", by.y = "ID", all.x = TRUE)

GEOGRAPHY_LABEL <- subset(GEOGRAPHY_LABEL, select = -c(GEOGRAPHY_NAME))


GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
  mutate(Trend = ifelse(Bias > 2, "High", ifelse(Bias < -2, "Low", "Normal")))


head(GEOGRAPHY_LABEL)  
##   GEOGRAPHYID INDICATORID
## 1           2     1003941
## 2           2     1000355
## 3           2     1002004
## 4           2     1004025
## 5           2     1000691
## 6           2     1000561
##                                                                                                                                DISPLAY_LABEL
## 1                                                                       Emergency room visits due to asthma (all ages) per 10,000 population
## 2                                                                                                                           Total households
## 3                                             Students Eligible for Free or Reduced Meals at Public Schools as % of Public School Enrollment
## 4                                                                           Percent population change since 2010 - Population under 18 years
## 5 Families with a Male Head of Household, No Wife Present, Living Below 185% of Poverty Level as % of Families with a Male Head of Household
## 6                                                          Dependency Ratio (Ratio of Population under 18 and over 64 to Population 18 - 64)
##   Has_CPRO_Viz GEOLOCID    mean_value     sd_value  n DATAYEAR    DATA_VALUE
## 1          Yes    18127    43.0000000 8.888194e-01  3     2019    27.4000000
## 2          Yes    18059 26398.0000000 2.246500e+03 17     2022 31424.0000000
## 3          Yes    18059    26.5093770 1.255077e+00 13     2023    32.0878355
## 4          Yes    18035    -4.4556626 3.356004e+00  9     2020   -11.3222755
## 5          Yes    18133    41.6615913 1.025866e+01 12     2022    17.2794118
## 6          Yes    18141     0.6127291 1.221634e-03  5     2015     0.6174084
##         Bias GEOGRAPHY_DISPLAY_LABEL ABBREV Trend
## 1 -17.551371                    <NA>   CNTY   Low
## 2   2.237258                  County   CNTY  High
## 3   4.444714                  County   CNTY  High
## 4  -2.046068                    <NA>   CNTY   Low
## 5  -2.376741                  County   CNTY   Low
## 6   3.830313                    <NA>   CNTY  High
outlier_criteria2 <- 3
GEOGRAPHY_counts <- GEOGRAPHY_LABEL %>%
  group_by(GEOGRAPHYID, DISPLAY_LABEL, INDICATORID) %>%
  filter(Bias > outlier_criteria2) %>%
  summarise(OutlierCount = n(), .groups = "drop")


# Merge additional columns


GEOGRAPHY_counts <- merge(GEOGRAPHY_counts, GEOGRAPHY_LABEL %>%
                            select("GEOGRAPHYID", "GEOLOCID", "INDICATORID", "DISPLAY_LABEL", "mean_value", "sd_value", "n", "DATA_VALUE", "GEOGRAPHY_DISPLAY_LABEL","Bias","Trend") %>%
                            distinct(), 
                          by = c("GEOGRAPHYID", "INDICATORID"), 
                          all.x = TRUE,
                          suffixes = c("", "_LABEL"))



View(GEOGRAPHY_counts)
# Filter rows with Bias > 3 or Bias < -3, excluding infinite values
filtered_geography <- GEOGRAPHY_counts %>%
  filter((Bias > 3 | Bias < -3) & is.finite(Bias)) %>%
  arrange(desc(Bias))  # Sorting by Bias in descending order

# Print the filtered table
print("Geography counts with |Bias| > 3 (excluding infinite values):")
## [1] "Geography counts with |Bias| > 3 (excluding infinite values):"
View(filtered_geography)
# Combining all INDICATORID values to remove (removing duplicates)
ids_to_remove <- c(1003860, 1003401, 1000931, 1000204, 1003672, 
                  1003996, 1003407, 1012339, 1000260, 1003949, 
                  1004430, 1003952, 1003860, 1011638, 1000561, 
                  1000931, 1002004, 100436, 1003954,1011628,1003947,1004436)

# Remove duplicates from the list
ids_to_remove <- unique(ids_to_remove)

# Filter the dataframe to remove these INDICATORID values
filtered_geography <- filtered_geography[!filtered_geography[, "INDICATORID"] %in% ids_to_remove, ]
#goal 1
filtered_geography <- filtered_geography[filtered_geography$GEOLOCID != 365, ]

#creating separate Dataframes for counties and school corporations
GEOGRAPHY_counts_county <- filtered_geography %>%
  filter(GEOGRAPHYID == 2)

GEOGRAPHY_counts_SC <- filtered_geography %>%
  filter(GEOGRAPHYID == 55)

View(GEOGRAPHY_counts_county)
View(GEOGRAPHY_counts_SC)
#goal2
GOAL_2_COUNTY <- GEOGRAPHY_counts_county %>%
  group_by(INDICATORID, DISPLAY_LABEL, OutlierCount, Trend) %>%
  summarise() %>%
  mutate(percent_of_COUNTIES = (OutlierCount / 12) * 100)
## `summarise()` has grouped output by 'INDICATORID', 'DISPLAY_LABEL',
## 'OutlierCount'. You can override using the `.groups` argument.
GOAL_2_SC <- GEOGRAPHY_counts_SC %>%
  group_by(INDICATORID, DISPLAY_LABEL, OutlierCount, Trend) %>%
  summarise() %>%
  mutate(percent_of_School_corps = (OutlierCount / 25) * 100)
## `summarise()` has grouped output by 'INDICATORID', 'DISPLAY_LABEL',
## 'OutlierCount'. You can override using the `.groups` argument.
View(GOAL_2_SC)
View(GOAL_2_COUNTY)
library(dplyr)
# For GEOGRAPHYID = 2
geo_2_high <- filtered_geography %>%
  filter(GEOGRAPHYID == 2, Trend == "High") %>%
  group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
  summarise(
    total_outliers = sum(OutlierCount)
  ) %>%
  arrange(desc(total_outliers))

geo_2_low <- filtered_geography %>%
  filter(GEOGRAPHYID == 2, Trend == "Low") %>%
  group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
  summarise(
    total_outliers = sum(OutlierCount)
  ) %>%
  arrange(desc(total_outliers))

# For GEOGRAPHYID = 55
geo_55_high <- filtered_geography %>%
  filter(GEOGRAPHYID == 55, Trend == "High") %>%
  group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
  summarise(
    total_outliers = sum(OutlierCount)
  ) %>%
  arrange(desc(total_outliers))

geo_55_low <- filtered_geography %>%
  filter(GEOGRAPHYID == 55, Trend == "Low") %>%
  group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
  summarise(
    total_outliers = sum(OutlierCount)
  ) %>%
  arrange(desc(total_outliers)) 

# Print all tables with clear headers
print("GEOGRAPHYID = 2, Trend = High:")
## [1] "GEOGRAPHYID = 2, Trend = High:"
View(geo_2_high)

print("\nGEOGRAPHYID = 2, Trend = Low:")
## [1] "\nGEOGRAPHYID = 2, Trend = Low:"
View(geo_2_low)

print("\nGEOGRAPHYID = 55, Trend = High:")
## [1] "\nGEOGRAPHYID = 55, Trend = High:"
View(geo_55_high)

print("\nGEOGRAPHYID = 55, Trend = Low:")
## [1] "\nGEOGRAPHYID = 55, Trend = Low:"
View(geo_55_low)