#install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("readxl")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
#Calculate the total population for each metropolitan areas in CA_MSA.csv using the chain approach (%>%) (1’)
df<- fread("C:/Users/witht/Documents/2024MethodsI/A3/CA_MSA.csv")
totalpop <- df %>%
group_by(NAMELSAD) %>%
summarize(TotalPopulation = sum(tpop, na.rm = TRUE))
print(totalpop)
## # A tibble: 6 × 2
## NAMELSAD TotalPopulation
## <chr> <int>
## 1 Fresno, CA Metro Area 831368
## 2 Los Angeles-Long Beach-Anaheim, CA Metro Area 12757713
## 3 Riverside-San Bernardino-Ontario, CA Metro Area 4343189
## 4 San Diego-Chula Vista-Carlsbad, CA Metro Area 3130711
## 5 San Francisco-Oakland-Berkeley, CA Metro Area 4168987
## 6 San Jose-Sunnyvale-Santa Clara, CA Metro Area 1934045
#Calculate the asian-white dissimilarity index for each metropolitan areas in CA_MSA.csv (1’)
df$wa <- abs(df$nhasn/df$nhasnc-df$nhwhite/df$nhwhitec)
Diss <- df %>%
group_by(NAMELSAD) %>%
summarize(AWD = 0.5*sum(wa))
print(Diss)
## # A tibble: 6 × 2
## NAMELSAD AWD
## <chr> <dbl>
## 1 Fresno, CA Metro Area 0.378
## 2 Los Angeles-Long Beach-Anaheim, CA Metro Area 0.476
## 3 Riverside-San Bernardino-Ontario, CA Metro Area 0.421
## 4 San Diego-Chula Vista-Carlsbad, CA Metro Area 0.480
## 5 San Francisco-Oakland-Berkeley, CA Metro Area 0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA Metro Area 0.428
#Calculate the average HOLC area for each state using holc_census_tracts.csv(1’)
HOLC <- fread("C:/Users/witht/Documents/2024MethodsI/A3/holc_census_tracts.csv")
ave_area <-HOLC %>%
group_by(state) %>%
summarize(avg=mean(holc_area))
print(ave_area)
## # A tibble: 38 × 2
## state avg
## <chr> <dbl>
## 1 AL 1.43
## 2 AR 0.626
## 3 AZ 0.773
## 4 CA 1.30
## 5 CO 0.704
## 6 CT 0.763
## 7 FL 1.38
## 8 GA 0.507
## 9 IA 1.99
## 10 IL 0.575
## # ℹ 28 more rows
#Make a boxplot that shows the data distribution of holc_area each state, i.e., x axis for all states, and y axis indicate the boxplot distribution (1’)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
ggplot(HOLC, aes(x=state, y=holc_area, fill = state)) +
geom_boxplot() +
labs(title = "HOLC Area by State",
x = "State",
y = "HOLC Area")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
#Create a dataframe that summarizes the count of HOLC grade D each Texas city using the chain approach. Hint: summarize(count=n()) can be used to get the count of conditioned rows in the summarize function of chain approach.(2’)
TX_HOLC<- HOLC[HOLC$state=="TX" & HOLC$holc_grade=="D",] %>%
group_by(st_name) %>%
summarize(count=n())
print(TX_HOLC)
## # A tibble: 8 × 2
## st_name count
## <chr> <int>
## 1 Austin 29
## 2 Dallas 41
## 3 El Paso 28
## 4 Fort Worth 74
## 5 Galveston 6
## 6 Houston 42
## 7 San Antonio 49
## 8 Waco 4
#Use Census API to get the poverty and black percentage of each census tract in San Antonio (1’)
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.4.2
census_api_key("be13863f38ebdc792d7bf1fdbd7f8089b2a708f2", install = FALSE)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
var <- c(poptotal='B03002_001E',
white='B03002_003E',
black='B03002_004E',
poverty='B17017_002E')
st <-"TX"
ct <-"Bexar"
cbg <- get_acs(geography = "tract", variables = var, count=ct,
state = st,output="wide", year = 2021, geometry = TRUE)
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## | | | 0% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 15% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 20% | |=============== | 22% | |================ | 22% | |================ | 23% | |================= | 24% | |================== | 26% | |=================== | 27% | |==================== | 29% | |===================== | 30% | |======================= | 32% | |========================= | 36% | |=========================== | 39% | |============================= | 42% | |=============================== | 45% | |================================== | 48% | |==================================== | 51% | |====================================== | 55% | |======================================== | 58% | |=========================================== | 61% | |============================================= | 64% | |=============================================== | 67% | |================================================= | 70% | |=================================================== | 73% | |====================================================== | 77% | |====================================================== | 78% | |======================================================== | 80% | |========================================================== | 83% | |============================================================ | 86% | |=============================================================== | 90% | |================================================================ | 91% | |================================================================= | 93% | |================================================================== | 94% | |==================================================================== | 97% | |======================================================================| 100%
cbg$poverty_pct <- 100 * (cbg$poverty /cbg$poptotal)
cbg$black_pct <- 100 * (cbg$black/cbg$poptotal)
#Merge the census data with HOLC data (1’), and make a bar chart showing the average black percentage for different HOLC grades for all San Antonio tracts (1’)
SA <- HOLC[HOLC$st_name=="San Antonio",]
SA <- SA[,c(2,8,14,18)]
SA$geoid <- as.character(SA$geoid)
names(SA)[3] <-"GEOID"
cbg<-cbg[,c(1,12,13)]
SA_HOLC <-merge(SA,cbg, by="GEOID")
avg_black_pct <- SA_HOLC %>%
group_by(holc_grade) %>%
summarise(avg=mean(black_pct))
ggplot(avg_black_pct, aes(x = holc_grade, y = avg, fill = holc_grade)) +
geom_bar(stat = "identity") +
labs(title = "Average Black Population Percent by HOLC Grade",
x= "HOLC Grade",
y= "Average Black Population Percentage")
#Make a boxplot showing the HOLC_area data structure (y axis) of each grade (x axis) for San Antonio (1’).
ggplot(SA_HOLC, aes(x=holc_grade, y=holc_area, fill = holc_grade)) +
geom_boxplot() +
labs(title = "San Antonio HOLC Area by Grade",
x = "Holc Grade",
y = "HOLC Area")