Load Libraries
data_raw: datayear, geographyid, geolocid, indicatorid, data_value
geos_raw: geographyid, geolocid
geotypes_raw: id
indicators_raw: indicatorid
# Statistical Summary
summary(data_raw[, sapply(data_raw, is.numeric)])
## datayear geographyid geolocid indicatorid
## Min. :2010 Min. : 2.0 Min. :0.000e+00 Min. :1000049
## 1st Qu.:2014 1st Qu.: 2.0 1st Qu.:2.725e+03 1st Qu.:1001255
## Median :2018 Median : 55.0 Median :6.590e+03 Median :1004730
## Mean :2018 Mean : 103.2 Mean :3.903e+06 Mean :1006535
## 3rd Qu.:2021 3rd Qu.: 55.0 3rd Qu.:1.810e+04 3rd Qu.:1011831
## Max. :2060 Max. :1000.0 Max. :2.110e+10 Max. :1013342
## data_value
## Min. :-9.154e+04
## 1st Qu.: 7.000e+00
## Median : 5.400e+01
## Mean : 4.578e+09
## 3rd Qu.: 6.950e+02
## Max. : 7.129e+14
# Visualize year distributions
data_raw %>%
ggplot(aes(x = datayear)) +
geom_histogram(binwidth = 1, fill = "steelblue") +
scale_x_continuous(limits = c(2005,2026)) +
labs(title = "Distribution of Data Years")
## Warning: Removed 930 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Visualize year outliers
data_raw %>%
ggplot(aes(y = datayear)) +
geom_boxplot(fill = "steelblue") +
labs(title = "Outlier Scan: Data Years")
# Statistical Summary
summary(geos_raw[, sapply(geos_raw, is.numeric)])
## geographyid geolocid
## Min. : 2 Min. :1.800e+01
## 1st Qu.: 348 1st Qu.:1.813e+10
## Median : 349 Median :1.805e+11
## Mean : 337 Mean :1.165e+11
## 3rd Qu.: 349 3rd Qu.:1.816e+11
## Max. :1000 Max. :2.110e+11
# Statistical Summary
summary(geotypes_raw[, sapply(geotypes_raw, is.numeric)])
## id
## Min. : 2.0
## 1st Qu.: 190.5
## Median : 341.5
## Mean : 401.0
## 3rd Qu.: 575.5
## Max. :1002.0
# Statistical Summary
summary(indicators_raw[, sapply(indicators_raw, is.numeric)])
## indicatorid
## Min. :1000043
## 1st Qu.:1003946
## Median :1005733
## Mean :1007270
## 3rd Qu.:1011828
## Max. :1013342
# Correlation Matrix for outlier detection
data_raw %>%
select(datayear, geographyid, geolocid, indicatorid, data_value) %>%
cor(use = "pairwise.complete.obs") %>%
round(2)
## datayear geographyid geolocid indicatorid data_value
## datayear 1.00 0.04 0.01 0.04 0
## geographyid 0.04 1.00 0.01 0.08 0
## geolocid 0.01 0.01 1.00 0.01 0
## indicatorid 0.04 0.08 0.01 1.00 0
## data_value 0.00 0.00 0.00 0.00 1
geos_raw: geography_display_label
geotypes_raw: abbrev, geography_name
indicators_raw: source_name, source_abbrev, display_label, has_cpro_viz
# counting occurences for each categorical variable
geos_raw %>% count(geography_display_label, sort = TRUE)
## # A tibble: 751 × 2
## geography_display_label n
## <chr> <int>
## 1 42406 1
## 2 42420 1
## 3 42451 1
## 4 42452 1
## 5 42458 1
## 6 47501 1
## 7 47512 1
## 8 47513 1
## 9 47514 1
## 10 47515 1
## # ℹ 741 more rows
geotypes_raw %>% count(abbrev, sort = TRUE)
## # A tibble: 14 × 2
## abbrev n
## <chr> <int>
## 1 AAA 1
## 2 BG10 1
## 3 BG20 1
## 4 CNTY 1
## 5 INST 1
## 6 MS13 1
## 7 PLC 1
## 8 PM20 1
## 9 SCRP 1
## 10 TR10 1
## 11 TR20 1
## 12 TWSP 1
## 13 USA 1
## 14 ZC10 1
geotypes_raw %>% count(geography_name, sort = TRUE)
## # A tibble: 14 × 2
## geography_name n
## <chr> <int>
## 1 Area Agency on Aging 1
## 2 Blockgroups 1
## 3 Blockgroups 2010 1
## 4 Census Places (Cities/Towns) 1
## 5 Census Tracts 1
## 6 Census Tracts 2010 1
## 7 Counties 1
## 8 Metropolitan Statistical Areas 2013 1
## 9 Public Use Microdata Area 1
## 10 School Corporations 1
## 11 State 1
## 12 Townships 1
## 13 United States 1
## 14 ZIP Codes 1
indicators_raw %>% count(source_name, sort = TRUE)
## # A tibble: 76 × 2
## source_name n
## <chr> <int>
## 1 American Community Survey 5-year Averages 2151
## 2 Census Public Use Microdata Area Data 835
## 3 Indiana Department of Education 546
## 4 United for ALICE 424
## 5 Indiana Department of Health 160
## 6 Marion County Public Health Department 126
## 7 Indiana Commission for Higher Education 109
## 8 Federal Financial Institutions Examination Council via Urban Institute 96
## 9 U.S. Census Bureau - Decennial Census 91
## 10 SAVI Community Information System 79
## # ℹ 66 more rows
indicators_raw %>% count(source_abbrev, sort = TRUE)
## # A tibble: 76 × 2
## source_abbrev n
## <chr> <int>
## 1 ACS5 2151
## 2 PUMA 835
## 3 IDOE 546
## 4 ALICE 424
## 5 IDOH 160
## 6 MCPHD 126
## 7 CHE 109
## 8 HMDA 96
## 9 CENSUS 91
## 10 SAVI 79
## # ℹ 66 more rows
indicators_raw %>% count(display_label, sort = TRUE)
## # A tibble: 5,518 × 2
## display_label n
## <chr> <int>
## 1 Aggravated Assaults per 1,000 Population 3
## 2 Births Where Mother Smoked During Pregnancy as % of All Births 3
## 3 Births Where Prenatal Care Began in the First Trimester as % of All Bi… 3
## 4 Burglaries per 1,000 Population 3
## 5 Larcenies per 1,000 Population 3
## 6 Low Weight Births as % of All Births 3
## 7 Robberies per 1,000 Population 3
## 8 Students who are homeless as a % of total enrollment 3
## 9 Total Juvenile Offense Charges per 1000 Population Age 5-17 3
## 10 Vehicle Thefts per 1,000 Population 3
## # ℹ 5,508 more rows
indicators_raw %>% count(has_cpro_viz, sort = TRUE)
## # A tibble: 2 × 2
## has_cpro_viz n
## <chr> <int>
## 1 No 3968
## 2 Yes 1671
geos_raw %>%
count(geographyid) %>%
slice_max(n = 5, order_by = n) %>%
ggplot(aes(x = reorder(geographyid, n), y = n)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Count of Geography Types",
x = "Geography Type ID",
y = "Top 5 Count")
Top 3 geography type occurrences in geos_raw:
Blockgroups, census tracks, zip codes
indicators_raw %>%
count(source_name) %>%
slice_max(n = 10, order_by = n) %>%
ggplot(aes(x = reorder(source_name, n), y = n)) +
geom_col(fill = "orange") +
coord_flip() +
labs(title = "Indicators by Source",
x = "Source Name",
y = "Top 10 Count")
# further statistical analysis
# standard deviation
# bias