# Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Read CA_MSA data
df <- fread("C:/Users/dagob/OneDrive/Desktop/Data/CA_MSA.csv")  # Replace with your file path
names(df)
##  [1] "GEOID.x"   "tpop"      "tpopr"     "pnhwhite"  "pnhasn"    "pnhblk"   
##  [7] "phisp"     "nhwhite"   "nhasn"     "nhblk"     "hisp"      "nonwhite" 
## [13] "pnonwhite" "oth"       "poth"      "CSAFP"     "CBSAFP"    "GEOID.y"  
## [19] "NAME"      "NAMELSAD"  "LSAD"      "geometry"  "nhwhitec"  "nonwhitec"
## [25] "nhasnc"    "nhblkc"    "othc"      "hispc"     "tpoprc"    "wb"       
## [31] "wa"        "wh"
# Q1 Calculate total population by metropolitan area
total_population <- df %>%
  group_by(NAME) %>%
  summarize(total_pop = sum(tpop))  

# Q2 Calculate dissimilarity index
# Calculate Asian-White proportion difference for each row
df$asian_white_diff <- abs(df$nhasn / df$nhasnc - df$nhwhite / df$nhwhitec)

# Group by metropolitan area and calculate the dissimilarity index
dissimilarity_results <- df %>%
  group_by(NAME) %>%
  summarize(AWD = 0.5 * sum(asian_white_diff, na.rm = TRUE))

# View the result
print(dissimilarity_results)
## # A tibble: 6 × 2
##   NAME                                   AWD
##   <chr>                                <dbl>
## 1 Fresno, CA                           0.378
## 2 Los Angeles-Long Beach-Anaheim, CA   0.476
## 3 Riverside-San Bernardino-Ontario, CA 0.421
## 4 San Diego-Chula Vista-Carlsbad, CA   0.480
## 5 San Francisco-Oakland-Berkeley, CA   0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA   0.428
# Q3 Calculate the average HOLC area for each state
# Read HOLC data
HOLC <- fread("C:/Users/dagob/OneDrive/Desktop/Data/holc_census_tracts.csv")  
names(HOLC)
##  [1] "holc_id"    "holc_grade" "id"         "polygon_id" "sheets"    
##  [6] "name"       "municipali" "holc_area"  "year"       "msamd"     
## [11] "state_code" "county_cod" "census_tra" "geoid"      "tract_prop"
## [16] "holc_prop"  "map_id"     "st_name"    "state"
# Calculate average HOLC area by state
average_holc_area <- HOLC %>%
  group_by(state) %>%
  summarize(avg_area = mean(holc_area, na.rm = TRUE))

# View the result
print(average_holc_area)
## # A tibble: 38 × 2
##    state avg_area
##    <chr>    <dbl>
##  1 AL       1.43 
##  2 AR       0.626
##  3 AZ       0.773
##  4 CA       1.30 
##  5 CO       0.704
##  6 CT       0.763
##  7 FL       1.38 
##  8 GA       0.507
##  9 IA       1.99 
## 10 IL       0.575
## # ℹ 28 more rows
# Q4 Make a boxplot that shows the data distribution of holc_area each state, i.e., x axis for 50 states, and y axis indicate the boxplot distribution
# Load ggplot2
library(ggplot2)
# Create a boxplot for HOLC area distribution by state
ggplot(HOLC, aes(x = state, y = holc_area)) +
  geom_boxplot() +
  labs(title = "HOLC Area Distribution by State", x = "State", y = "HOLC Area")

# Q5 Create a dataframe that summarizes the count of HOLC grade D each Texas city
# Filter HOLC data for Texas and grade D, then count by city
TX_HOLC <- HOLC %>%
  filter(state == "TX" & holc_grade == "D") %>%
  group_by(st_name) %>%
  summarize(grade_D_count = n())

# View the result
print(TX_HOLC)
## # A tibble: 8 × 2
##   st_name     grade_D_count
##   <chr>               <int>
## 1 Austin                 29
## 2 Dallas                 41
## 3 El Paso                28
## 4 Fort Worth             74
## 5 Galveston               6
## 6 Houston                42
## 7 San Antonio            49
## 8 Waco                    4
# Q6 Use Census API to get the poverty and black percentage of each census tract in San Antonio
# Load tidycensus
library(tidycensus)

# Define Census API variables
vars <- c(
  poptotal = 'B03002_001E', 
  black = 'B03002_004E', 
  poverty = 'B17017_002E'
)

# Set your Census API key (you only need to do this once)
#census_api_key("your_api_key_here", install = TRUE)

# Get data for Bexar County, San Antonio
sa_data <- get_acs(  geography = "tract",   state = "TX",   county = "Bexar", 
  year = 2021,   output = "wide", variables = vars,)
## Getting data from the 2017-2021 5-year ACS
# Calculate percentages for Black population and poverty
sa_data <- sa_data %>%
  mutate(
    black_pct = black / poptotal * 100,
    poverty_pct = poverty / poptotal * 100
  )

# View the result
print(sa_data)
## # A tibble: 375 × 10
##    GEOID       NAME   poptotal B03002_001M black B03002_004M poverty B17017_002M
##    <chr>       <chr>     <dbl>       <dbl> <dbl>       <dbl>   <dbl>       <dbl>
##  1 48029110100 Censu…     2934         565    58          47     400         147
##  2 48029110300 Censu…     2930         652    49          48     342         132
##  3 48029110500 Censu…     2201         309   152         111     638         131
##  4 48029110600 Censu…     5384        1620   862         490     683         177
##  5 48029110700 Censu…      982         246    24          29     254          88
##  6 48029111000 Censu…     2709         404    86          83     241          74
##  7 48029111100 Censu…     3717         518   125          61     300          94
##  8 48029120100 Censu…     4655         751   609         216      69          40
##  9 48029120301 Censu…     3294         480    46          67      93          80
## 10 48029120302 Censu…     4066         478     8          15     121         101
## # ℹ 365 more rows
## # ℹ 2 more variables: black_pct <dbl>, poverty_pct <dbl>
# Q7 Merge the census data with HOLC data (1'), and make a bar chart showing the average black percentage for different HOLC grades for all San Antonio tracts
# Filter HOLC data for San Antonio
SA_HOLC <- HOLC %>%
  filter(st_name == "San Antonio")

# Ensure GEOID is character type for both datasets
SA_HOLC$geoid <- as.character(SA_HOLC$geoid)
sa_data$GEOID <- as.character(sa_data$GEOID)

# Merge HOLC data with census data
SA_HOLC_demo <- merge(SA_HOLC, sa_data, by.x = "geoid", by.y = "GEOID")

# Calculate average Black percentage by HOLC grade
avg_black_pct <- SA_HOLC_demo %>%
  group_by(holc_grade) %>%
  summarize(avg_black_pct = mean(black_pct, na.rm = TRUE))

# Plot average Black percentage by HOLC grade
ggplot(avg_black_pct, aes(x = holc_grade, y = avg_black_pct, fill = holc_grade)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Black Percentage by HOLC Grade in San Antonio", x = "HOLC Grade", y = "Average Black Percentage")

# Q8 Make a boxplot showing the HOLC_area data structure (y axis)  of each grade (x axis) for San Antonio
# Boxplot for HOLC area in San Antonio by grade
ggplot(SA_HOLC, aes(x = holc_grade, y = holc_area, fill = holc_grade)) +
  geom_boxplot() +
  labs(title = "HOLC Area Distribution by Grade in San Antonio", x = "HOLC Grade", y = "HOLC Area")