Load Libraries

1. Read & Clean Input Files

2. Analyze Data

Numerical Columns

  • data_raw: datayear, geographyid, geolocid, indicatorid, data_value

  • geos_raw: geographyid, geolocid

  • geotypes_raw: id

  • indicators_raw: indicatorid

Data Raw

# Statistical Summary
summary(data_raw[, sapply(data_raw, is.numeric)]) 
##     datayear     geographyid        geolocid          indicatorid     
##  Min.   :2010   Min.   :   2.0   Min.   :0.000e+00   Min.   :1000049  
##  1st Qu.:2014   1st Qu.:   2.0   1st Qu.:2.725e+03   1st Qu.:1001255  
##  Median :2018   Median :  55.0   Median :6.590e+03   Median :1004730  
##  Mean   :2018   Mean   : 103.2   Mean   :3.903e+06   Mean   :1006535  
##  3rd Qu.:2021   3rd Qu.:  55.0   3rd Qu.:1.810e+04   3rd Qu.:1011831  
##  Max.   :2060   Max.   :1000.0   Max.   :2.110e+10   Max.   :1013342  
##    data_value        
##  Min.   :-9.154e+04  
##  1st Qu.: 7.000e+00  
##  Median : 5.400e+01  
##  Mean   : 4.578e+09  
##  3rd Qu.: 6.950e+02  
##  Max.   : 7.129e+14
# Visualize year distributions
data_raw %>%
  ggplot(aes(x = datayear)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  scale_x_continuous(limits = c(2005,2026)) +
  labs(title = "Distribution of Data Years")
## Warning: Removed 930 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Visualize year outliers
data_raw %>%
  ggplot(aes(y = datayear)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Outlier Scan: Data Years")

Geos Raw

# Statistical Summary
summary(geos_raw[, sapply(geos_raw, is.numeric)]) 
##   geographyid      geolocid        
##  Min.   :   2   Min.   :1.800e+01  
##  1st Qu.: 348   1st Qu.:1.813e+10  
##  Median : 349   Median :1.805e+11  
##  Mean   : 337   Mean   :1.165e+11  
##  3rd Qu.: 349   3rd Qu.:1.816e+11  
##  Max.   :1000   Max.   :2.110e+11

Geotypes Raw

# Statistical Summary
summary(geotypes_raw[, sapply(geotypes_raw, is.numeric)]) 
##        id        
##  Min.   :   2.0  
##  1st Qu.: 190.5  
##  Median : 341.5  
##  Mean   : 401.0  
##  3rd Qu.: 575.5  
##  Max.   :1002.0

Indicators Raw

# Statistical Summary
summary(indicators_raw[, sapply(indicators_raw, is.numeric)]) 
##   indicatorid     
##  Min.   :1000043  
##  1st Qu.:1003946  
##  Median :1005733  
##  Mean   :1007270  
##  3rd Qu.:1011828  
##  Max.   :1013342
# Correlation Matrix for outlier detection
data_raw %>%
  select(datayear, geographyid, geolocid, indicatorid, data_value) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(2)
##             datayear geographyid geolocid indicatorid data_value
## datayear        1.00        0.04     0.01        0.04          0
## geographyid     0.04        1.00     0.01        0.08          0
## geolocid        0.01        0.01     1.00        0.01          0
## indicatorid     0.04        0.08     0.01        1.00          0
## data_value      0.00        0.00     0.00        0.00          1

Categorical Columns

  • geos_raw: geography_display_label

  • geotypes_raw: abbrev, geography_name

  • indicators_raw: source_name, source_abbrev, display_label, has_cpro_viz

Frequency Tables

# counting occurences for each categorical variable
geos_raw %>% count(geography_display_label, sort = TRUE)
## # A tibble: 751 × 2
##    geography_display_label     n
##    <chr>                   <int>
##  1 42406                       1
##  2 42420                       1
##  3 42451                       1
##  4 42452                       1
##  5 42458                       1
##  6 47501                       1
##  7 47512                       1
##  8 47513                       1
##  9 47514                       1
## 10 47515                       1
## # ℹ 741 more rows
geotypes_raw %>% count(abbrev, sort = TRUE)
## # A tibble: 14 × 2
##    abbrev     n
##    <chr>  <int>
##  1 AAA        1
##  2 BG10       1
##  3 BG20       1
##  4 CNTY       1
##  5 INST       1
##  6 MS13       1
##  7 PLC        1
##  8 PM20       1
##  9 SCRP       1
## 10 TR10       1
## 11 TR20       1
## 12 TWSP       1
## 13 USA        1
## 14 ZC10       1
geotypes_raw %>% count(geography_name, sort = TRUE)
## # A tibble: 14 × 2
##    geography_name                          n
##    <chr>                               <int>
##  1 Area Agency on Aging                    1
##  2 Blockgroups                             1
##  3 Blockgroups 2010                        1
##  4 Census Places (Cities/Towns)            1
##  5 Census Tracts                           1
##  6 Census Tracts 2010                      1
##  7 Counties                                1
##  8 Metropolitan Statistical Areas 2013     1
##  9 Public Use Microdata Area               1
## 10 School Corporations                     1
## 11 State                                   1
## 12 Townships                               1
## 13 United States                           1
## 14 ZIP Codes                               1
indicators_raw %>% count(source_name, sort = TRUE)
## # A tibble: 76 × 2
##    source_name                                                                n
##    <chr>                                                                  <int>
##  1 American Community Survey 5-year Averages                               2151
##  2 Census Public Use Microdata Area Data                                    835
##  3 Indiana Department of Education                                          546
##  4 United for ALICE                                                         424
##  5 Indiana Department of Health                                             160
##  6 Marion County Public Health Department                                   126
##  7 Indiana Commission for Higher Education                                  109
##  8 Federal Financial Institutions Examination Council via Urban Institute    96
##  9 U.S. Census Bureau - Decennial Census                                     91
## 10 SAVI Community Information System                                         79
## # ℹ 66 more rows
indicators_raw %>% count(source_abbrev, sort = TRUE)
## # A tibble: 76 × 2
##    source_abbrev     n
##    <chr>         <int>
##  1 ACS5           2151
##  2 PUMA            835
##  3 IDOE            546
##  4 ALICE           424
##  5 IDOH            160
##  6 MCPHD           126
##  7 CHE             109
##  8 HMDA             96
##  9 CENSUS           91
## 10 SAVI             79
## # ℹ 66 more rows
indicators_raw %>% count(display_label, sort = TRUE)
## # A tibble: 5,518 × 2
##    display_label                                                               n
##    <chr>                                                                   <int>
##  1 Aggravated Assaults per 1,000 Population                                    3
##  2 Births Where Mother Smoked During Pregnancy as % of All Births              3
##  3 Births Where Prenatal Care Began in the First Trimester as % of All Bi…     3
##  4 Burglaries per 1,000 Population                                             3
##  5 Larcenies per 1,000 Population                                              3
##  6 Low Weight Births as % of All Births                                        3
##  7 Robberies per 1,000 Population                                              3
##  8 Students who are homeless as a % of total enrollment                        3
##  9 Total Juvenile Offense Charges per 1000 Population Age 5-17                 3
## 10 Vehicle Thefts per 1,000 Population                                         3
## # ℹ 5,508 more rows
indicators_raw %>% count(has_cpro_viz, sort = TRUE)
## # A tibble: 2 × 2
##   has_cpro_viz     n
##   <chr>        <int>
## 1 No            3968
## 2 Yes           1671

Geos Raw

geos_raw %>%
  count(geographyid) %>%
  slice_max(n = 5, order_by = n) %>%
  ggplot(aes(x = reorder(geographyid, n), y = n)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Count of Geography Types",
       x = "Geography Type ID",
       y = "Top 5 Count")

Top 3 geography type occurrences in geos_raw: Blockgroups, census tracks, zip codes

Data Raw

indicators_raw %>%
  count(source_name) %>%
  slice_max(n = 10, order_by = n) %>%
  ggplot(aes(x = reorder(source_name, n), y = n)) +
  geom_col(fill = "orange") +
  coord_flip() +
  labs(title = "Indicators by Source",
       x = "Source Name",
       y = "Top 10 Count")

# further statistical analysis
# standard deviation
# bias