# Install the packages this script depends on, but only when they are not
# already available. Calling install.packages() unconditionally re-downloads
# the packages on every run and fails when there is no network access.
for (pkg in c("tidycensus", "tidyverse")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tidycensus)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the California MSA table from a local CSV.
# NOTE(review): absolute, user-specific path; the script only runs where this
# exact file location exists.
ca_msa <- read_csv(file = "C:/Users/Campo/Downloads/CA_MSA.csv")
## Rows: 5568 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): GEOID.x, NAME, NAMELSAD, LSAD
## dbl (27): tpop, tpopr, pnhwhite, pnhasn, pnhblk, phisp, nhwhite, nhasn, nhbl...
## lgl  (1): geometry
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the column names of the MSA table.
print(names(ca_msa))
##  [1] "GEOID.x"   "tpop"      "tpopr"     "pnhwhite"  "pnhasn"    "pnhblk"   
##  [7] "phisp"     "nhwhite"   "nhasn"     "nhblk"     "hisp"      "nonwhite" 
## [13] "pnonwhite" "oth"       "poth"      "CSAFP"     "CBSAFP"    "GEOID.y"  
## [19] "NAME"      "NAMELSAD"  "LSAD"      "geometry"  "nhwhitec"  "nonwhitec"
## [25] "nhasnc"    "nhblkc"    "othc"      "hispc"     "tpoprc"    "wb"       
## [31] "wa"        "wh"
# Total population per metro area: sum the `tpop` column within each NAME,
# ignoring missing values. The result object shares its name with the column;
# inside summarise() the column is the one being referenced.
tpop <- summarise(
  group_by(ca_msa, NAME),
  Total_Population = sum(tpop, na.rm = TRUE)
)

print(tpop)
## # A tibble: 6 × 2
##   NAME                                 Total_Population
##   <chr>                                           <dbl>
## 1 Fresno, CA                                     831368
## 2 Los Angeles-Long Beach-Anaheim, CA           12757713
## 3 Riverside-San Bernardino-Ontario, CA          4343189
## 4 San Diego-Chula Vista-Carlsbad, CA            3130711
## 5 San Francisco-Oakland-Berkeley, CA            4168987
## 6 San Jose-Sunnyvale-Santa Clara, CA            1934045
# Asian-white dissimilarity index per metro area (NAME):
#   D = 0.5 * sum_i | nhasn_i / sum(nhasn) - nhwhite_i / sum(nhwhite) |
# where i runs over the rows of ca_msa belonging to that metro area and the
# sums in the denominators are the metro-wide totals (missing values ignored).
dissimilarity_index <- ca_msa %>%
  group_by(NAME) %>%
  summarise(
    D_index = 0.5 * sum(
      abs(
        nhasn / sum(nhasn, na.rm = TRUE) -
          nhwhite / sum(nhwhite, na.rm = TRUE)
      ),
      na.rm = TRUE
    )
  )


print(dissimilarity_index)
## # A tibble: 6 × 2
##   NAME                                 D_index
##   <chr>                                  <dbl>
## 1 Fresno, CA                             0.378
## 2 Los Angeles-Long Beach-Anaheim, CA     0.476
## 3 Riverside-San Bernardino-Ontario, CA   0.421
## 4 San Diego-Chula Vista-Carlsbad, CA     0.480
## 5 San Francisco-Oakland-Berkeley, CA     0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA     0.428
# Load the HOLC / census-tract table from a local CSV.
# NOTE(review): absolute, user-specific path; the script only runs where this
# exact file location exists.
holc_data <- read_csv(file = "C:/Users/Campo/Downloads/holc_census_tracts.csv")
## Rows: 39750 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): holc_id, holc_grade, name, state_code, county_cod, census_tra, geoi...
## dbl (9): id, polygon_id, sheets, holc_area, year, msamd, tract_prop, holc_pr...
## lgl (1): municipali
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the column names of the HOLC table.
print(names(holc_data))
##  [1] "holc_id"    "holc_grade" "id"         "polygon_id" "sheets"    
##  [6] "name"       "municipali" "holc_area"  "year"       "msamd"     
## [11] "state_code" "county_cod" "census_tra" "geoid"      "tract_prop"
## [16] "holc_prop"  "map_id"     "st_name"    "state"
# Mean `holc_area` per state, ignoring missing areas.
average_holc_area <- summarise(
  group_by(holc_data, state),
  Average_HOLC_Area = mean(holc_area, na.rm = TRUE)
)


print(average_holc_area)
## # A tibble: 38 × 2
##    state Average_HOLC_Area
##    <chr>             <dbl>
##  1 AL                1.43 
##  2 AR                0.626
##  3 AZ                0.773
##  4 CA                1.30 
##  5 CO                0.704
##  6 CT                0.763
##  7 FL                1.38 
##  8 GA                0.507
##  9 IA                1.99 
## 10 IL                0.575
## # ℹ 28 more rows
# Box plot of `holc_area` for every state; the x-axis labels are rotated
# 90 degrees so the state codes do not overlap.
ggplot(data = holc_data, mapping = aes(x = state, y = holc_area)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Distribution of HOLC Area by State",
    x = "State",
    y = "HOLC Area"
  )

# Count grade-D HOLC records in Texas, broken down by `municipali`.
#
# The `state` column stores two-letter postal codes ("AL", "AR", "CA", ...),
# so the filter must compare against "TX". The previous comparison against
# "Texas" matched nothing and always produced an empty tibble.
#
# NOTE(review): read_csv() parsed `municipali` as logical, which suggests it is
# largely or entirely NA; if so, every Texas row falls into a single NA group.
# A different column (possibly `st_name`) may be the intended breakdown --
# confirm against the data dictionary.
holc_grade_d_count <- holc_data %>%
  filter(state == "TX", holc_grade == "D") %>%
  count(municipali, name = "count")

print(holc_grade_d_count)
# ACS variables to download, keyed by the column names used below.
#   B17001_002 / B17001_001: people below the poverty level / everyone for whom
#                            poverty status is determined (table B17001).
#   B02001_003 / B02001_001: Black or African American alone / total population
#                            (table B02001).
# Each share must use the total from its own table: B17001_001 excludes people
# whose poverty status is not determined, so it is not a valid denominator for
# the race count. `total_population` keeps its original name for compatibility
# even though it is the poverty-universe total.
variables <- c(
  poverty = "B17001_002",
  total_population = "B17001_001",
  black_population = "B02001_003",
  race_total = "B02001_001"
)

# Tract-level 5-year ACS estimates for Bexar County, TX.
# NOTE(review): the previewed rows of holc_data show `year` = 2019. If its
# GEOIDs follow an earlier tract vintage than the 2020 ACS, some tracts will
# not match in the later join -- confirm which vintage the HOLC file uses.
san_antonio_data <- get_acs(
  geography = "tract",
  variables = variables,
  state = "TX",
  county = "Bexar",
  year = 2020,
  survey = "acs5"
)
## Getting data from the 2016-2020 5-year ACS

# Reshape to one row per tract with one column per variable, then derive the
# percentages. Tracts whose denominator is 0 yield NaN.
san_antonio_data <- san_antonio_data %>%
  select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  mutate(
    poverty_percentage = 100 * poverty / total_population,
    black_percentage = 100 * black_population / race_total
  )


# The recorded printout that follows in this file predates the `race_total`
# column; re-knit to refresh it.
print(san_antonio_data)
## # A tibble: 375 × 6
##    GEOID       black_population total_population poverty poverty_percentage
##    <chr>                  <dbl>            <dbl>   <dbl>              <dbl>
##  1 48029110100              151             2721     536              19.7 
##  2 48029110300               71             3039     798              26.3 
##  3 48029110500              140             2425    1646              67.9 
##  4 48029110600              718             3076    1242              40.4 
##  5 48029110700               44             1079     421              39.0 
##  6 48029111000              107             2767     811              29.3 
##  7 48029111100              123             3485     539              15.5 
##  8 48029120100              616             3863     205               5.31
##  9 48029120301               82             3836     256               6.67
## 10 48029120302               29             4601      77               1.67
## # ℹ 365 more rows
## # ℹ 1 more variable: black_percentage <dbl>
# Re-check the HOLC column names before choosing the join key.
print(names(holc_data))
##  [1] "holc_id"    "holc_grade" "id"         "polygon_id" "sheets"    
##  [6] "name"       "municipali" "holc_area"  "year"       "msamd"     
## [11] "state_code" "county_cod" "census_tra" "geoid"      "tract_prop"
## [16] "holc_prop"  "map_id"     "st_name"    "state"
# Column names of the ACS table, to compare against the HOLC table.
print(names(san_antonio_data))
## [1] "GEOID"              "black_population"   "total_population"  
## [4] "poverty"            "poverty_percentage" "black_percentage"
# Align the join key name with the ACS table. any_of() makes the rename a
# no-op when `geoid` has already been renamed, so re-running this chunk in the
# same session no longer fails with "column `geoid` doesn't exist".
holc_data <- holc_data %>%
  rename(any_of(c(GEOID = "geoid")))

# Attach the ACS tract measures to every HOLC row. This is a left join, so
# HOLC rows with no matching GEOID in san_antonio_data are kept with NA in the
# ACS columns. `relationship = "many-to-one"` makes the join fail loudly if a
# GEOID is ever duplicated on the ACS side instead of silently multiplying
# HOLC rows.
merged_data <- left_join(
  holc_data,
  san_antonio_data,
  by = "GEOID",
  relationship = "many-to-one"
)


print(head(merged_data))
## # A tibble: 6 × 24
##   holc_id holc_grade    id polygon_id sheets name     municipali holc_area  year
##   <chr>   <chr>      <dbl>      <dbl>  <dbl> <chr>    <lgl>          <dbl> <dbl>
## 1 C2      C             10        230      1 West En… NA             0.842  2019
## 2 C1      C             13       8504      0 <NA>     NA             2.14   2019
## 3 D63     D             16       7493      1 <NA>     NA             5.16   2019
## 4 C33     C             16       7612      1 <NA>     NA             0.621  2019
## 5 B9      B             16       7760      1 <NA>     NA             0.886  2019
## 6 C46     C             16       7792      1 <NA>     NA             3.30   2019
## # ℹ 15 more variables: msamd <dbl>, state_code <chr>, county_cod <chr>,
## #   census_tra <chr>, GEOID <chr>, tract_prop <dbl>, holc_prop <dbl>,
## #   map_id <dbl>, st_name <chr>, state <chr>, black_population <dbl>,
## #   total_population <dbl>, poverty <dbl>, poverty_percentage <dbl>,
## #   black_percentage <dbl>
# Mean Black percentage per HOLC grade.
#
# merged_data is a left join, so rows without a matching ACS tract carry NA in
# `black_percentage`. They are dropped before grouping; otherwise a grade with
# no matched rows produces a NaN mean, which ggplot then discards with a
# "Removed 1 row" warning.
average_black_percentage <- merged_data %>%
  filter(!is.na(black_percentage)) %>%
  group_by(holc_grade) %>%
  summarize(Average_Black_Percentage = mean(black_percentage))

# geom_col() draws bars whose height is the y value; it is the direct
# equivalent of geom_bar(stat = "identity").
ggplot(
  average_black_percentage,
  aes(x = holc_grade, y = Average_Black_Percentage, fill = holc_grade)
) +
  geom_col() +
  labs(
    title = "Average Black Percentage by HOLC Grade in San Antonio",
    x = "HOLC Grade",
    y = "Average Black Percentage"
  ) +
  theme_minimal()

# Distribution of HOLC area by grade, restricted to San Antonio.
#
# merged_data keeps every HOLC row from every state (it is a left join), so
# plotting it directly shows the national distribution under a San Antonio
# title. semi_join() keeps only the rows whose GEOID appears in the Bexar
# County ACS pull, without adding any columns.
merged_data %>%
  semi_join(san_antonio_data, by = "GEOID") %>%
  ggplot(aes(x = holc_grade, y = holc_area, fill = holc_grade)) +
  geom_boxplot() +
  labs(
    title = "Distribution of HOLC Area by Grade in San Antonio",
    x = "HOLC Grade",
    y = "HOLC Area"
  ) +
  theme_minimal()