# Install the packages this script depends on, but only when they are not
# already available. An unconditional install.packages() re-downloads both
# packages on every run and needs a reachable package repository each time.
required_packages <- c("tidycensus", "tidyverse")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(tidycensus)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the CA_MSA.csv extract into `ca_msa`.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where the file exists at exactly this location.
ca_msa <- readr::read_csv(file = "C:/Users/Campo/Downloads/CA_MSA.csv")
## Rows: 5568 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): GEOID.x, NAME, NAMELSAD, LSAD
## dbl (27): tpop, tpopr, pnhwhite, pnhasn, pnhblk, phisp, nhwhite, nhasn, nhbl...
## lgl (1): geometry
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the available column names before building any summaries.
print(names(ca_msa))
## [1] "GEOID.x" "tpop" "tpopr" "pnhwhite" "pnhasn" "pnhblk"
## [7] "phisp" "nhwhite" "nhasn" "nhblk" "hisp" "nonwhite"
## [13] "pnonwhite" "oth" "poth" "CSAFP" "CBSAFP" "GEOID.y"
## [19] "NAME" "NAMELSAD" "LSAD" "geometry" "nhwhitec" "nonwhitec"
## [25] "nhasnc" "nhblkc" "othc" "hispc" "tpoprc" "wb"
## [31] "wa" "wh"
# Total population per NAME: add up the `tpop` column over all rows that share
# a NAME, ignoring missing values. Inside sum(), `tpop` refers to the column
# of `ca_msa`, not to the result object being assigned here.
tpop <- summarise(
  group_by(ca_msa, NAME),
  Total_Population = sum(tpop, na.rm = TRUE),
  .groups = "drop"
)
print(tpop)
## # A tibble: 6 × 2
## NAME Total_Population
## <chr> <dbl>
## 1 Fresno, CA 831368
## 2 Los Angeles-Long Beach-Anaheim, CA 12757713
## 3 Riverside-San Bernardino-Ontario, CA 4343189
## 4 San Diego-Chula Vista-Carlsbad, CA 3130711
## 5 San Francisco-Oakland-Berkeley, CA 4168987
## 6 San Jose-Sunnyvale-Santa Clara, CA 1934045
# Asian/White dissimilarity index per NAME group:
#   D = 0.5 * sum(|nhasn_i / sum(nhasn) - nhwhite_i / sum(nhwhite)|)
# where i runs over the rows of a group (presumably census tracts -- confirm
# against the data source). The group totals are computed inside the grouped
# mutate(), so each row's share is taken relative to its own NAME group.
dissimilarity_index <- ca_msa %>%
  group_by(NAME) %>%
  mutate(
    Asian_Share = nhasn / sum(nhasn, na.rm = TRUE),
    White_Share = nhwhite / sum(nhwhite, na.rm = TRUE)
  ) %>%
  summarise(
    # Rows whose share difference is missing are skipped rather than
    # turning the whole group's index into NA.
    D_index = 0.5 * sum(abs(Asian_Share - White_Share), na.rm = TRUE),
    .groups = "drop"
  )
print(dissimilarity_index)
## # A tibble: 6 × 2
## NAME D_index
## <chr> <dbl>
## 1 Fresno, CA 0.378
## 2 Los Angeles-Long Beach-Anaheim, CA 0.476
## 3 Riverside-San Bernardino-Ontario, CA 0.421
## 4 San Diego-Chula Vista-Carlsbad, CA 0.480
## 5 San Francisco-Oakland-Berkeley, CA 0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA 0.428
# Load the HOLC / census-tract table into `holc_data`.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where the file exists at exactly this location.
holc_data <- readr::read_csv(
  file = "C:/Users/Campo/Downloads/holc_census_tracts.csv"
)
## Rows: 39750 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): holc_id, holc_grade, name, state_code, county_cod, census_tra, geoi...
## dbl (9): id, polygon_id, sheets, holc_area, year, msamd, tract_prop, holc_pr...
## lgl (1): municipali
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the available column names before building any summaries.
print(names(holc_data))
## [1] "holc_id" "holc_grade" "id" "polygon_id" "sheets"
## [6] "name" "municipali" "holc_area" "year" "msamd"
## [11] "state_code" "county_cod" "census_tra" "geoid" "tract_prop"
## [16] "holc_prop" "map_id" "st_name" "state"
# Mean of `holc_area` for each value of `state`, ignoring missing areas.
average_holc_area <- summarise(
  group_by(holc_data, state),
  Average_HOLC_Area = mean(holc_area, na.rm = TRUE),
  .groups = "drop"
)
print(average_holc_area)
## # A tibble: 38 × 2
## state Average_HOLC_Area
## <chr> <dbl>
## 1 AL 1.43
## 2 AR 0.626
## 3 AZ 0.773
## 4 CA 1.30
## 5 CO 0.704
## 6 CT 0.763
## 7 FL 1.38
## 8 GA 0.507
## 9 IA 1.99
## 10 IL 0.575
## # ℹ 28 more rows
# Box plot of `holc_area` for every state in `holc_data`.
ggplot(data = holc_data, mapping = aes(x = state, y = holc_area)) +
  geom_boxplot() +
  # Rotate the state labels so they do not overlap along the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Distribution of HOLC Area by State",
    x = "State",
    y = "HOLC Area"
  )

# Count the grade-"D" rows in Texas for each value of `municipali`.
# The `state` column stores two-letter abbreviations (the per-state summary
# printed above lists "AL", "AR", "AZ", ...), so the previous filter on the
# full name "Texas" matched no rows and always produced an empty table.
holc_grade_d_count <- holc_data %>%
  filter(state == "TX", holc_grade == "D") %>%
  group_by(municipali) %>%
  summarise(count = n(), .groups = "drop")
# NOTE(review): `municipali` was parsed as a logical column and is NA in the
# rows shown by head(); if it is NA throughout, this yields a single NA group
# and another column may be the intended city identifier -- confirm.
print(holc_grade_d_count)
## (Re-run to regenerate this output; the transcript recorded before the
## filter was corrected showed an empty 0 x 2 tibble.)
# ACS variables to request, keyed by the column names used below.
# B17001_001 is the total of the poverty-status table (people for whom
# poverty status is determined), so it is the right denominator for the
# poverty rate but not for a race share. B02001_001, the total of the race
# table, is requested as the denominator for the Black population share.
variables <- c(
  poverty = "B17001_002",
  total_population = "B17001_001",
  black_population = "B02001_003",
  race_total = "B02001_001"
)

# Tract-level estimates for Bexar County, TX from the 2016-2020 5-year ACS.
san_antonio_data <- get_acs(
  geography = "tract",
  variables = variables,
  state = "TX",
  county = "Bexar",
  year = 2020,
  survey = "acs5"
)
## Getting data from the 2016-2020 5-year ACS

# Reshape to one row per tract with one column per requested variable, then
# derive both percentages, each against the total of its own table.
san_antonio_data <- san_antonio_data %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  mutate(
    poverty_percentage = 100 * poverty / total_population,
    black_percentage = 100 * black_population / race_total
  ) %>%
  # Keep the same six columns, in the same order, that this table had before
  # the race-table total was added as a helper.
  select(
    GEOID,
    black_population,
    total_population,
    poverty,
    poverty_percentage,
    black_percentage
  )
print(san_antonio_data)
## # A tibble: 375 × 6
## GEOID black_population total_population poverty poverty_percentage
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 48029110100 151 2721 536 19.7
## 2 48029110300 71 3039 798 26.3
## 3 48029110500 140 2425 1646 67.9
## 4 48029110600 718 3076 1242 40.4
## 5 48029110700 44 1079 421 39.0
## 6 48029111000 107 2767 811 29.3
## 7 48029111100 123 3485 539 15.5
## 8 48029120100 616 3863 205 5.31
## 9 48029120301 82 3836 256 6.67
## 10 48029120302 29 4601 77 1.67
## # ℹ 365 more rows
## # ℹ 1 more variable: black_percentage <dbl>
# Show the column names of both tables to find the key they share: the tract
# identifier is `geoid` in holc_data and `GEOID` in san_antonio_data.
print(names(holc_data))
## [1] "holc_id" "holc_grade" "id" "polygon_id" "sheets"
## [6] "name" "municipali" "holc_area" "year" "msamd"
## [11] "state_code" "county_cod" "census_tra" "geoid" "tract_prop"
## [16] "holc_prop" "map_id" "st_name" "state"
print(names(san_antonio_data))
## [1] "GEOID" "black_population" "total_population"
## [4] "poverty" "poverty_percentage" "black_percentage"
# Give the tract identifier the same name in both tables. any_of() skips the
# rename when `geoid` is no longer present, so re-running this step in the
# same session does not fail.
holc_data <- holc_data %>%
  rename(any_of(c(GEOID = "geoid")))

# Keep only the HOLC rows whose tract appears in the Bexar County ACS pull.
# A left join would retain every HOLC row from every state (with NA ACS
# columns), so the plots below labelled "in San Antonio" would actually
# describe the nationwide data.
# Several HOLC rows can share one tract, while san_antonio_data has one row
# per GEOID; `relationship` makes the join fail loudly if that ever changes.
merged_data <- inner_join(
  holc_data,
  san_antonio_data,
  by = "GEOID",
  relationship = "many-to-one"
)
print(head(merged_data))
## (Re-run to regenerate this output; the transcript recorded before the join
## was restricted to matching tracts showed rows from outside the ACS pull.)
# Mean of `black_percentage` for each HOLC grade in merged_data, ignoring
# rows where the percentage is missing.
average_black_percentage <- summarise(
  group_by(merged_data, holc_grade),
  Average_Black_Percentage = mean(black_percentage, na.rm = TRUE),
  .groups = "drop"
)

# One bar per grade; the bar height is the precomputed average, hence
# stat = "identity" rather than a row count.
ggplot(
  data = average_black_percentage,
  mapping = aes(
    x = holc_grade,
    y = Average_Black_Percentage,
    fill = holc_grade
  )
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Average Black Percentage by HOLC Grade in San Antonio",
    x = "HOLC Grade",
    y = "Average Black Percentage"
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Box plot of `holc_area` per HOLC grade, drawn from every row of merged_data.
ggplot(
  data = merged_data,
  mapping = aes(x = holc_grade, y = holc_area, fill = holc_grade)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Distribution of HOLC Area by Grade in San Antonio",
    x = "HOLC Grade",
    y = "HOLC Area"
  )
