library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 4.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(htmltools)
packageVersion("htmltools")
## [1] '0.5.8.1'
library(readxl)
# Load files from Documents
Outlier_Analysis <- read_csv("/Users/woodsprocise/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 743370 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): INDICATORID, DATAYEAR, GEOGRAPHYID, GEOLOCID, DATA_VALUE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Outlier_Analysis_Geographies <-read_csv("~/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Geographies.csv")
## Rows: 4711 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GEOGRAPHY_DISPLAY_LABEL
## dbl (2): GEOGRAPHYID, GEOLOCID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Outliers_CPRO_Viz <-read_excel("/Users/woodsprocise/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Indicators.xlsx")
Geography_Type_Name <-read_csv("~/Documents/IU Indy - Fall '25/SAVI Outliers Repo/SAVI-Outliers/data/SAVI_data/qry_Outlier_Analysis_Geography_Types.csv")
## Rows: 26 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ABBREV, GEOGRAPHY_NAME
## dbl (1): ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Filter Max_year for each group
max_year_by_group <- Outlier_Analysis %>%
group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
summarise(DATAYEAR = max(DATAYEAR))
## `summarise()` has grouped output by 'INDICATORID', 'GEOGRAPHYID'. You can
## override using the `.groups` argument.
head(max_year_by_group)
## # A tibble: 6 × 4
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID DATAYEAR
## <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 2020
## 2 1000049 2 18003 2020
## 3 1000049 2 18005 2020
## 4 1000049 2 18007 2020
## 5 1000049 2 18009 2020
## 6 1000049 2 18011 2022
# Duplicate relevant columns for later use
max_year_by_group$maxdata_INDICATORID <- max_year_by_group$INDICATORID
max_year_by_group$maxdata_year <- max_year_by_group$DATAYEAR
max_year_by_group$maxdata_GEOGRAPHYID <- max_year_by_group$GEOGRAPHYID
max_year_by_group$maxdata_GEOLOCID <- max_year_by_group$GEOLOCID
head(max_year_by_group)
## # A tibble: 6 × 8
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID DATAYEAR maxdata_INDICATORID maxdata_year
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 2020 1000049 2020
## 2 1000049 2 18003 2020 1000049 2020
## 3 1000049 2 18005 2020 1000049 2020
## 4 1000049 2 18007 2020 1000049 2020
## 5 1000049 2 18009 2020 1000049 2020
## 6 1000049 2 18011 2022 1000049 2022
## # ℹ 2 more variables: maxdata_GEOGRAPHYID <dbl>, maxdata_GEOLOCID <dbl>
# Perform an outer join between max_year_by_group and Outlier_Analysis
Outlier_Analysis_Data <- full_join(Outlier_Analysis, max_year_by_group, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID', 'DATAYEAR'))
head(Outlier_Analysis_Data)
## # A tibble: 6 × 9
## INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE maxdata_INDICATORID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000122 2000 2 18095 31776 NA
## 2 1000122 2000 2 18081 31333 NA
## 3 1000122 2006 2 18095 29994 NA
## 4 1000122 2006 2 18081 33691 NA
## 5 1000122 2007 2 18081 34509 NA
## 6 1000122 2007 2 18095 29947 NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_GEOGRAPHYID <dbl>,
## # maxdata_GEOLOCID <dbl>
# Select rows where DATAYEAR is not null (years other than Max_year)
Max_Year <- Outlier_Analysis_Data[complete.cases(Outlier_Analysis_Data),]
Max_Year <- subset(Max_Year, select=-c(maxdata_INDICATORID, maxdata_year, maxdata_GEOLOCID, maxdata_GEOGRAPHYID))
head(Max_Year)
## # A tibble: 6 × 5
## INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000122 2010 2 18047 6148
## 2 1000122 2010 2 18131 3389
## 3 1000122 2010 2 18147 5064
## 4 1000122 2010 2 18151 7985
## 5 1000122 2010 2 18003 95462
## 6 1000122 2010 2 18183 8216
# Convert all columns to numeric
Max_Year[] <- lapply(Max_Year, as.numeric)
# Create a Data Frame (DF) not containing rows with missing values
DF <- Outlier_Analysis_Data[rowSums(is.na(Outlier_Analysis_Data)) > 0,]
head(DF)
## # A tibble: 6 × 9
## INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE maxdata_INDICATORID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000122 2000 2 18095 31776 NA
## 2 1000122 2000 2 18081 31333 NA
## 3 1000122 2006 2 18095 29994 NA
## 4 1000122 2006 2 18081 33691 NA
## 5 1000122 2007 2 18081 34509 NA
## 6 1000122 2007 2 18095 29947 NA
## # ℹ 3 more variables: maxdata_year <dbl>, maxdata_GEOGRAPHYID <dbl>,
## # maxdata_GEOLOCID <dbl>
new_dfs <- subset(DF, select=-c(maxdata_INDICATORID, maxdata_year, maxdata_GEOLOCID, maxdata_GEOGRAPHYID))
head(new_dfs)
## # A tibble: 6 × 5
## INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000122 2000 2 18095 31776
## 2 1000122 2000 2 18081 31333
## 3 1000122 2006 2 18095 29994
## 4 1000122 2006 2 18081 33691
## 5 1000122 2007 2 18081 34509
## 6 1000122 2007 2 18095 29947
# Count occurrences for each group
COUNT <- new_dfs %>%
group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
count()
head(COUNT)
## # A tibble: 6 × 4
## # Groups: INDICATORID, GEOGRAPHYID, GEOLOCID [6]
## INDICATORID GEOGRAPHYID GEOLOCID n
## <dbl> <dbl> <dbl> <int>
## 1 1000049 2 18001 2
## 2 1000049 2 18003 2
## 3 1000049 2 18005 2
## 4 1000049 2 18007 2
## 5 1000049 2 18009 2
## 6 1000049 2 18011 16
# Convert all columns to numeric
COUNT[] <- lapply(COUNT, as.numeric)
# Convert all columns to numeric
new_dfs[] <- lapply(new_dfs, as.numeric)
# Filter out non-numeric values
new_dfs_numeric <- new_dfs %>%
filter(is.numeric(DATA_VALUE))
head(new_dfs_numeric)
## # A tibble: 6 × 5
## INDICATORID DATAYEAR GEOGRAPHYID GEOLOCID DATA_VALUE
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000122 2000 2 18095 31776
## 2 1000122 2000 2 18081 31333
## 3 1000122 2006 2 18095 29994
## 4 1000122 2006 2 18081 33691
## 5 1000122 2007 2 18081 34509
## 6 1000122 2007 2 18095 29947
# Calculate mean and standard deviation for each group
MEAN_STD <- new_dfs_numeric %>%
group_by(INDICATORID, GEOGRAPHYID, GEOLOCID) %>%
summarize(mean_value = mean(DATA_VALUE), sd_value = sd(DATA_VALUE))
## `summarise()` has grouped output by 'INDICATORID', 'GEOGRAPHYID'. You can
## override using the `.groups` argument.
# View the resulting dataframe
head(MEAN_STD)
## # A tibble: 6 × 5
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 34769 858.
## 2 1000049 2 18003 361954. 15021.
## 3 1000049 2 18005 79168 4685.
## 4 1000049 2 18007 8748. 101.
## 5 1000049 2 18009 12486. 670.
## 6 1000049 2 18011 56719. 10022.
# Merge count, mean, and standard deviation into a single DataFrame
STATS_df <- left_join(MEAN_STD, COUNT, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID'), suffix = c("_left", "_right"))
head(STATS_df)
## # A tibble: 6 × 6
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value n
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 34769 858. 2
## 2 1000049 2 18003 361954. 15021. 2
## 3 1000049 2 18005 79168 4685. 2
## 4 1000049 2 18007 8748. 101. 2
## 5 1000049 2 18009 12486. 670. 2
## 6 1000049 2 18011 56719. 10022. 16
# Calculate Bias and merge with additional information
BIAS <- left_join(STATS_df, Max_Year, by = c('INDICATORID', 'GEOGRAPHYID', 'GEOLOCID'))
head(BIAS)
## # A tibble: 6 × 8
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value n DATAYEAR DATA_VALUE
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 34769 858. 2 2020 35544
## 2 1000049 2 18003 361954. 15021. 2 2020 375520
## 3 1000049 2 18005 79168 4685. 2 2020 83280
## 4 1000049 2 18007 8748. 101. 2 2020 8695
## 5 1000049 2 18009 12486. 670. 2 2020 11926
## 6 1000049 2 18011 56719. 10022. 16 2022 71235
BIAS <- BIAS %>%
mutate(Bias = (DATA_VALUE - mean_value) / sd_value)
head(BIAS)
## # A tibble: 6 × 9
## # Groups: INDICATORID, GEOGRAPHYID [1]
## INDICATORID GEOGRAPHYID GEOLOCID mean_value sd_value n DATAYEAR DATA_VALUE
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000049 2 18001 34769 858. 2 2020 35544
## 2 1000049 2 18003 361954. 15021. 2 2020 375520
## 3 1000049 2 18005 79168 4685. 2 2020 83280
## 4 1000049 2 18007 8748. 101. 2 2020 8695
## 5 1000049 2 18009 12486. 670. 2 2020 11926
## 6 1000049 2 18011 56719. 10022. 16 2022 71235
## # ℹ 1 more variable: Bias <dbl>
# Merge with DISPLAY_LABEL
BIAS <- merge(Outliers_CPRO_Viz, BIAS, by = "INDICATORID")
head(BIAS)
## INDICATORID DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID GEOLOCID
## 1 1000049 Total Population People Yes 3 1812905014
## 2 1000049 Total Population People Yes 3 1812782484
## 3 1000049 Total Population People Yes 3 1812965052
## 4 1000049 Total Population People Yes 3 1812905428
## 5 1000049 Total Population People Yes 3 1806308308
## 6 1000049 Total Population People Yes 3 1812911584
## mean_value sd_value n DATAYEAR DATA_VALUE Bias
## 1 404.00 NA 1 2020 245 NA
## 2 19265.00 NA 1 2020 20635 NA
## 3 2008.00 NA 1 2020 1743 NA
## 4 9591.00 NA 1 2020 9032 NA
## 5 12059.54 972.485 13 2022 14335 2.339842
## 6 1271.00 NA 1 2020 1479 NA
# Filter Bias values outside the range [-2.0, 2.0]
BIAS <- BIAS %>%
filter(!is.na(Bias), Bias <= -2.0 | Bias >= 2.0)
head(BIAS)
## INDICATORID DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID GEOLOCID
## 1 1000049 Total Population People Yes 3 1806308308
## 2 1000049 Total Population People Yes 3 1814500568
## 3 1000049 Total Population People Yes 3 1814533070
## 4 1000049 Total Population People Yes 160 1812628
## 5 1000049 Total Population People Yes 160 1813492
## 6 1000049 Total Population People Yes 3 1814578578
## mean_value sd_value n DATAYEAR DATA_VALUE Bias
## 1 12059.538 972.48501 13 2022 14335 2.339842
## 2 20554.538 272.84019 13 2022 21404 3.113403
## 3 1163.231 104.95726 13 2022 1450 2.732248
## 4 4899.333 127.88086 12 2022 5356 3.571032
## 5 1405.833 48.35444 12 2022 1624 4.511823
## 6 1492.000 73.58895 13 2022 1304 -2.554731
Outlier_Analysis_Geographies$GEOGRAPHYID <- as.numeric(Outlier_Analysis_Geographies$GEOGRAPHYID)
Outlier_Analysis_Geographies$GEOLOCID <- as.numeric(Outlier_Analysis_Geographies$GEOLOCID)
# Join with GEOGRAPHY_DISPLAY_LABEL
GEOGRAPHY_LABEL <- left_join(BIAS, Outlier_Analysis_Geographies, by = c('GEOGRAPHYID', 'GEOLOCID'))
head(GEOGRAPHY_LABEL)
## INDICATORID DISPLAY_LABEL Has_CPRO_Viz GEOGRAPHYID GEOLOCID
## 1 1000049 Total Population People Yes 3 1806308308
## 2 1000049 Total Population People Yes 3 1814500568
## 3 1000049 Total Population People Yes 3 1814533070
## 4 1000049 Total Population People Yes 160 1812628
## 5 1000049 Total Population People Yes 160 1813492
## 6 1000049 Total Population People Yes 3 1814578578
## mean_value sd_value n DATAYEAR DATA_VALUE Bias
## 1 12059.538 972.48501 13 2022 14335 2.339842
## 2 20554.538 272.84019 13 2022 21404 3.113403
## 3 1163.231 104.95726 13 2022 1450 2.732248
## 4 4899.333 127.88086 12 2022 5356 3.571032
## 5 1405.833 48.35444 12 2022 1624 4.511823
## 6 1492.000 73.58895 13 2022 1304 -2.554731
## GEOGRAPHY_DISPLAY_LABEL
## 1 Brown township Hendricks County, IN
## 2 Addison township Shelby County, IN
## 3 Hendricks township Shelby County, IN
## 4 Cicero
## 5 Clermont
## 6 Van Buren township Shelby County, IN
GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
filter(`GEOGRAPHYID` %in% c(2, 55))
head(GEOGRAPHY_LABEL)
## INDICATORID
## 1 1000049
## 2 1000085
## 3 1000085
## 4 1000085
## 5 1000085
## 6 1000085
## DISPLAY_LABEL
## 1 Total Population People
## 2 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 3 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 4 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 5 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 6 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## Has_CPRO_Viz GEOGRAPHYID GEOLOCID mean_value sd_value n DATAYEAR
## 1 Yes 2 18113 47509.500000 4.949747 2 2020
## 2 Yes 55 5275 39.137402 5.495275 12 2022
## 3 Yes 55 5370 37.313334 5.139149 12 2022
## 4 Yes 55 5385 49.886587 2.880584 12 2022
## 5 Yes 55 7360 0.000000 0.000000 6 2022
## 6 Yes 55 630 5.958245 7.481864 12 2022
## DATA_VALUE Bias GEOGRAPHY_DISPLAY_LABEL
## 1 47640.00000 26.364981 <NA>
## 2 24.73552 -2.620776 Anderson Community School Corporation
## 3 24.87942 -2.419450 M S D Washington Township
## 4 42.90828 -2.422532 Indianapolis Public Schools
## 5 42.85714 Inf Southwestern Shelby County Consolidated Schools
## 6 21.71799 2.106393 Zionsville Community Schools
# Filter out records where n >= 3
GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
filter(!is.na(n), n >= 3.0)
head(GEOGRAPHY_LABEL)
## INDICATORID
## 1 1000085
## 2 1000085
## 3 1000085
## 4 1000085
## 5 1000085
## 6 1000085
## DISPLAY_LABEL
## 1 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 2 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 3 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 4 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 5 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## 6 Hispanic Population 25+ Without High School Diploma as % of Hisp Pop Age 25 Over
## Has_CPRO_Viz GEOGRAPHYID GEOLOCID mean_value sd_value n DATAYEAR DATA_VALUE
## 1 Yes 55 5275 39.137402 5.495275 12 2022 24.73552
## 2 Yes 55 5370 37.313334 5.139149 12 2022 24.87942
## 3 Yes 55 5385 49.886587 2.880584 12 2022 42.90828
## 4 Yes 55 7360 0.000000 0.000000 6 2022 42.85714
## 5 Yes 55 630 5.958245 7.481864 12 2022 21.71799
## 6 Yes 55 670 18.756400 7.883576 12 2022 0.00000
## Bias GEOGRAPHY_DISPLAY_LABEL
## 1 -2.620776 Anderson Community School Corporation
## 2 -2.419450 M S D Washington Township
## 3 -2.422532 Indianapolis Public Schools
## 4 Inf Southwestern Shelby County Consolidated Schools
## 5 2.106393 Zionsville Community Schools
## 6 -2.379174 County School Corporation Of Brown County
# Join Geography_Type_Name with GEOGRAPHY_LABEL
GEOGRAPHY_LABEL <- merge(GEOGRAPHY_LABEL, Geography_Type_Name, by.x = "GEOGRAPHYID", by.y = "ID", all.x = TRUE)
GEOGRAPHY_LABEL <- subset(GEOGRAPHY_LABEL, select = -c(GEOGRAPHY_NAME))
GEOGRAPHY_LABEL <- GEOGRAPHY_LABEL %>%
mutate(Trend = ifelse(Bias > 2, "High", ifelse(Bias < -2, "Low", "Normal")))
head(GEOGRAPHY_LABEL)
## GEOGRAPHYID INDICATORID
## 1 2 1003941
## 2 2 1000355
## 3 2 1002004
## 4 2 1004025
## 5 2 1000691
## 6 2 1000561
## DISPLAY_LABEL
## 1 Emergency room visits due to asthma (all ages) per 10,000 population
## 2 Total households
## 3 Students Eligible for Free or Reduced Meals at Public Schools as % of Public School Enrollment
## 4 Percent population change since 2010 - Population under 18 years
## 5 Families with a Male Head of Household, No Wife Present, Living Below 185% of Poverty Level as % of Families with a Male Head of Household
## 6 Dependency Ratio (Ratio of Population under 18 and over 64 to Population 18 - 64)
## Has_CPRO_Viz GEOLOCID mean_value sd_value n DATAYEAR DATA_VALUE
## 1 Yes 18127 43.0000000 8.888194e-01 3 2019 27.4000000
## 2 Yes 18059 26398.0000000 2.246500e+03 17 2022 31424.0000000
## 3 Yes 18059 26.5093770 1.255077e+00 13 2023 32.0878355
## 4 Yes 18035 -4.4556626 3.356004e+00 9 2020 -11.3222755
## 5 Yes 18133 41.6615913 1.025866e+01 12 2022 17.2794118
## 6 Yes 18141 0.6127291 1.221634e-03 5 2015 0.6174084
## Bias GEOGRAPHY_DISPLAY_LABEL ABBREV Trend
## 1 -17.551371 <NA> CNTY Low
## 2 2.237258 County CNTY High
## 3 4.444714 County CNTY High
## 4 -2.046068 <NA> CNTY Low
## 5 -2.376741 County CNTY Low
## 6 3.830313 <NA> CNTY High
outlier_criteria2 <- 3
GEOGRAPHY_counts <- GEOGRAPHY_LABEL %>%
group_by(GEOGRAPHYID, DISPLAY_LABEL, INDICATORID) %>%
filter(Bias > outlier_criteria2) %>%
summarise(OutlierCount = n(), .groups = "drop")
# Merge additional columns
GEOGRAPHY_counts <- merge(GEOGRAPHY_counts, GEOGRAPHY_LABEL %>%
select("GEOGRAPHYID", "GEOLOCID", "INDICATORID", "DISPLAY_LABEL", "mean_value", "sd_value", "n", "DATA_VALUE", "GEOGRAPHY_DISPLAY_LABEL","Bias","Trend") %>%
distinct(),
by = c("GEOGRAPHYID", "INDICATORID"),
all.x = TRUE,
suffixes = c("", "_LABEL"))
View(GEOGRAPHY_counts)
# Filter rows with Bias > 3 or Bias < -3, excluding infinite values
filtered_geography <- GEOGRAPHY_counts %>%
filter((Bias > 3 | Bias < -3) & is.finite(Bias)) %>%
arrange(desc(Bias)) # Sorting by Bias in descending order
# Print the filtered table
print("Geography counts with |Bias| > 3 (excluding infinite values):")
## [1] "Geography counts with |Bias| > 3 (excluding infinite values):"
View(filtered_geography)
# Combining all INDICATORID values to remove (removing duplicates)
ids_to_remove <- c(1003860, 1003401, 1000931, 1000204, 1003672,
1003996, 1003407, 1012339, 1000260, 1003949,
1004430, 1003952, 1003860, 1011638, 1000561,
1000931, 1002004, 100436, 1003954,1011628,1003947,1004436)
# Remove duplicates from the list
ids_to_remove <- unique(ids_to_remove)
# Filter the dataframe to remove these INDICATORID values
filtered_geography <- filtered_geography[!filtered_geography[, "INDICATORID"] %in% ids_to_remove, ]
#goal 1
filtered_geography <- filtered_geography[filtered_geography$GEOLOCID != 365, ]
#creating separate Dataframes for counties and school corporations
GEOGRAPHY_counts_county <- filtered_geography %>%
filter(GEOGRAPHYID == 2)
GEOGRAPHY_counts_SC <- filtered_geography %>%
filter(GEOGRAPHYID == 55)
View(GEOGRAPHY_counts_county)
View(GEOGRAPHY_counts_SC)
#goal2
GOAL_2_COUNTY <- GEOGRAPHY_counts_county %>%
group_by(INDICATORID, DISPLAY_LABEL, OutlierCount, Trend) %>%
summarise() %>%
mutate(percent_of_COUNTIES = (OutlierCount / 12) * 100)
## `summarise()` has grouped output by 'INDICATORID', 'DISPLAY_LABEL',
## 'OutlierCount'. You can override using the `.groups` argument.
GOAL_2_SC <- GEOGRAPHY_counts_SC %>%
group_by(INDICATORID, DISPLAY_LABEL, OutlierCount, Trend) %>%
summarise() %>%
mutate(percent_of_School_corps = (OutlierCount / 25) * 100)
## `summarise()` has grouped output by 'INDICATORID', 'DISPLAY_LABEL',
## 'OutlierCount'. You can override using the `.groups` argument.
View(GOAL_2_SC)
View(GOAL_2_COUNTY)
library(dplyr)
# For GEOGRAPHYID = 2
geo_2_high <- filtered_geography %>%
filter(GEOGRAPHYID == 2, Trend == "High") %>%
group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
summarise(
total_outliers = sum(OutlierCount)
) %>%
arrange(desc(total_outliers))
geo_2_low <- filtered_geography %>%
filter(GEOGRAPHYID == 2, Trend == "Low") %>%
group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
summarise(
total_outliers = sum(OutlierCount)
) %>%
arrange(desc(total_outliers))
# For GEOGRAPHYID = 55
geo_55_high <- filtered_geography %>%
filter(GEOGRAPHYID == 55, Trend == "High") %>%
group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
summarise(
total_outliers = sum(OutlierCount)
) %>%
arrange(desc(total_outliers))
geo_55_low <- filtered_geography %>%
filter(GEOGRAPHYID == 55, Trend == "Low") %>%
group_by(GEOGRAPHY_DISPLAY_LABEL) %>%
summarise(
total_outliers = sum(OutlierCount)
) %>%
arrange(desc(total_outliers))
# Print all tables with clear headers
print("GEOGRAPHYID = 2, Trend = High:")
## [1] "GEOGRAPHYID = 2, Trend = High:"
View(geo_2_high)
print("\nGEOGRAPHYID = 2, Trend = Low:")
## [1] "\nGEOGRAPHYID = 2, Trend = Low:"
View(geo_2_low)
print("\nGEOGRAPHYID = 55, Trend = High:")
## [1] "\nGEOGRAPHYID = 55, Trend = High:"
View(geo_55_high)
print("\nGEOGRAPHYID = 55, Trend = Low:")
## [1] "\nGEOGRAPHYID = 55, Trend = Low:"
View(geo_55_low)