# Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Read CA_MSA data
df <- fread("C:/Users/dagob/OneDrive/Desktop/Data/CA_MSA.csv") # Replace with your file path
names(df)
## [1] "GEOID.x" "tpop" "tpopr" "pnhwhite" "pnhasn" "pnhblk"
## [7] "phisp" "nhwhite" "nhasn" "nhblk" "hisp" "nonwhite"
## [13] "pnonwhite" "oth" "poth" "CSAFP" "CBSAFP" "GEOID.y"
## [19] "NAME" "NAMELSAD" "LSAD" "geometry" "nhwhitec" "nonwhitec"
## [25] "nhasnc" "nhblkc" "othc" "hispc" "tpoprc" "wb"
## [31] "wa" "wh"
# Q1 Calculate total population by metropolitan area
total_population <- df %>%
group_by(NAME) %>%
summarize(total_pop = sum(tpop))
# Q2 Calculate dissimilarity index
# Calculate Asian-White proportion difference for each row
df$asian_white_diff <- abs(df$nhasn / df$nhasnc - df$nhwhite / df$nhwhitec)
# Group by metropolitan area and calculate the dissimilarity index
dissimilarity_results <- df %>%
group_by(NAME) %>%
summarize(AWD = 0.5 * sum(asian_white_diff, na.rm = TRUE))
# View the result
print(dissimilarity_results)
## # A tibble: 6 × 2
## NAME AWD
## <chr> <dbl>
## 1 Fresno, CA 0.378
## 2 Los Angeles-Long Beach-Anaheim, CA 0.476
## 3 Riverside-San Bernardino-Ontario, CA 0.421
## 4 San Diego-Chula Vista-Carlsbad, CA 0.480
## 5 San Francisco-Oakland-Berkeley, CA 0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA 0.428
# Q3 Calculate the average HOLC area for each state
# Read HOLC data
HOLC <- fread("C:/Users/dagob/OneDrive/Desktop/Data/holc_census_tracts.csv")
names(HOLC)
## [1] "holc_id" "holc_grade" "id" "polygon_id" "sheets"
## [6] "name" "municipali" "holc_area" "year" "msamd"
## [11] "state_code" "county_cod" "census_tra" "geoid" "tract_prop"
## [16] "holc_prop" "map_id" "st_name" "state"
# Calculate average HOLC area by state
average_holc_area <- HOLC %>%
group_by(state) %>%
summarize(avg_area = mean(holc_area, na.rm = TRUE))
# View the result
print(average_holc_area)
## # A tibble: 38 × 2
## state avg_area
## <chr> <dbl>
## 1 AL 1.43
## 2 AR 0.626
## 3 AZ 0.773
## 4 CA 1.30
## 5 CO 0.704
## 6 CT 0.763
## 7 FL 1.38
## 8 GA 0.507
## 9 IA 1.99
## 10 IL 0.575
## # ℹ 28 more rows
# Q4 Make a boxplot that shows the data distribution of holc_area each state, i.e., x axis for 50 states, and y axis indicate the boxplot distribution
# Load ggplot2
library(ggplot2)
# Create a boxplot for HOLC area distribution by state
ggplot(HOLC, aes(x = state, y = holc_area)) +
geom_boxplot() +
labs(title = "HOLC Area Distribution by State", x = "State", y = "HOLC Area")

# Q5 Create a dataframe that summarizes the count of HOLC grade D each Texas city
# Filter HOLC data for Texas and grade D, then count by city
TX_HOLC <- HOLC %>%
filter(state == "TX" & holc_grade == "D") %>%
group_by(st_name) %>%
summarize(grade_D_count = n())
# View the result
print(TX_HOLC)
## # A tibble: 8 × 2
## st_name grade_D_count
## <chr> <int>
## 1 Austin 29
## 2 Dallas 41
## 3 El Paso 28
## 4 Fort Worth 74
## 5 Galveston 6
## 6 Houston 42
## 7 San Antonio 49
## 8 Waco 4
# Q6 Use Census API to get the poverty and black percentage of each census tract in San Antonio
# Load tidycensus
library(tidycensus)
# Define Census API variables
vars <- c(
poptotal = 'B03002_001E',
black = 'B03002_004E',
poverty = 'B17017_002E'
)
# Set your Census API key (you only need to do this once)
#census_api_key("your_api_key_here", install = TRUE)
# Get data for Bexar County, San Antonio
sa_data <- get_acs( geography = "tract", state = "TX", county = "Bexar",
year = 2021, output = "wide", variables = vars,)
## Getting data from the 2017-2021 5-year ACS
# Calculate percentages for Black population and poverty
sa_data <- sa_data %>%
mutate(
black_pct = black / poptotal * 100,
poverty_pct = poverty / poptotal * 100
)
# View the result
print(sa_data)
## # A tibble: 375 × 10
## GEOID NAME poptotal B03002_001M black B03002_004M poverty B17017_002M
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 48029110100 Censu… 2934 565 58 47 400 147
## 2 48029110300 Censu… 2930 652 49 48 342 132
## 3 48029110500 Censu… 2201 309 152 111 638 131
## 4 48029110600 Censu… 5384 1620 862 490 683 177
## 5 48029110700 Censu… 982 246 24 29 254 88
## 6 48029111000 Censu… 2709 404 86 83 241 74
## 7 48029111100 Censu… 3717 518 125 61 300 94
## 8 48029120100 Censu… 4655 751 609 216 69 40
## 9 48029120301 Censu… 3294 480 46 67 93 80
## 10 48029120302 Censu… 4066 478 8 15 121 101
## # ℹ 365 more rows
## # ℹ 2 more variables: black_pct <dbl>, poverty_pct <dbl>
# Q7 Merge the census data with HOLC data (1'), and make a bar chart showing the average black percentage for different HOLC grades for all San Antonio tracts
# Filter HOLC data for San Antonio
SA_HOLC <- HOLC %>%
filter(st_name == "San Antonio")
# Ensure GEOID is character type for both datasets
SA_HOLC$geoid <- as.character(SA_HOLC$geoid)
sa_data$GEOID <- as.character(sa_data$GEOID)
# Merge HOLC data with census data
SA_HOLC_demo <- merge(SA_HOLC, sa_data, by.x = "geoid", by.y = "GEOID")
# Calculate average Black percentage by HOLC grade
avg_black_pct <- SA_HOLC_demo %>%
group_by(holc_grade) %>%
summarize(avg_black_pct = mean(black_pct, na.rm = TRUE))
# Plot average Black percentage by HOLC grade
ggplot(avg_black_pct, aes(x = holc_grade, y = avg_black_pct, fill = holc_grade)) +
geom_bar(stat = "identity") +
labs(title = "Average Black Percentage by HOLC Grade in San Antonio", x = "HOLC Grade", y = "Average Black Percentage")

# Q8 Make a boxplot showing the HOLC_area data structure (y axis) of each grade (x axis) for San Antonio
# Boxplot for HOLC area in San Antonio by grade
ggplot(SA_HOLC, aes(x = holc_grade, y = holc_area, fill = holc_grade)) +
geom_boxplot() +
labs(title = "HOLC Area Distribution by Grade in San Antonio", x = "HOLC Grade", y = "HOLC Area")
