#install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("readxl")
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

#Calculate the total population for each metropolitan areas in CA_MSA.csv using the chain approach (%>%) (1’)

df<- fread("C:/Users/witht/Documents/2024MethodsI/A3/CA_MSA.csv")

totalpop <- df %>%
  group_by(NAMELSAD) %>%
  summarize(TotalPopulation = sum(tpop, na.rm = TRUE))

print(totalpop)
## # A tibble: 6 × 2
##   NAMELSAD                                        TotalPopulation
##   <chr>                                                     <int>
## 1 Fresno, CA Metro Area                                    831368
## 2 Los Angeles-Long Beach-Anaheim, CA Metro Area          12757713
## 3 Riverside-San Bernardino-Ontario, CA Metro Area         4343189
## 4 San Diego-Chula Vista-Carlsbad, CA Metro Area           3130711
## 5 San Francisco-Oakland-Berkeley, CA Metro Area           4168987
## 6 San Jose-Sunnyvale-Santa Clara, CA Metro Area           1934045

#Calculate the asian-white dissimilarity index for each metropolitan areas in CA_MSA.csv (1’)

df$wa <- abs(df$nhasn/df$nhasnc-df$nhwhite/df$nhwhitec)

Diss <- df %>%
  group_by(NAMELSAD) %>%
  summarize(AWD = 0.5*sum(wa))

print(Diss)
## # A tibble: 6 × 2
##   NAMELSAD                                          AWD
##   <chr>                                           <dbl>
## 1 Fresno, CA Metro Area                           0.378
## 2 Los Angeles-Long Beach-Anaheim, CA Metro Area   0.476
## 3 Riverside-San Bernardino-Ontario, CA Metro Area 0.421
## 4 San Diego-Chula Vista-Carlsbad, CA Metro Area   0.480
## 5 San Francisco-Oakland-Berkeley, CA Metro Area   0.455
## 6 San Jose-Sunnyvale-Santa Clara, CA Metro Area   0.428

#Calculate the average HOLC area for each state using holc_census_tracts.csv(1’)

HOLC <- fread("C:/Users/witht/Documents/2024MethodsI/A3/holc_census_tracts.csv")

ave_area <-HOLC %>%
  group_by(state) %>%
  summarize(avg=mean(holc_area))

print(ave_area)
## # A tibble: 38 × 2
##    state   avg
##    <chr> <dbl>
##  1 AL    1.43 
##  2 AR    0.626
##  3 AZ    0.773
##  4 CA    1.30 
##  5 CO    0.704
##  6 CT    0.763
##  7 FL    1.38 
##  8 GA    0.507
##  9 IA    1.99 
## 10 IL    0.575
## # ℹ 28 more rows

#Make a boxplot that shows the data distribution of holc_area each state, i.e., x axis for all states, and y axis indicate the boxplot distribution (1’)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
ggplot(HOLC, aes(x=state, y=holc_area, fill = state)) + 
  geom_boxplot() +
  labs(title = "HOLC Area by State",
       x = "State",
       y = "HOLC Area")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

#Create a dataframe that summarizes the count of HOLC grade D each Texas city using the chain approach. Hint: summarize(count=n()) can be used to get the count of conditioned rows in the summarize function of chain approach.(2’)

TX_HOLC<- HOLC[HOLC$state=="TX" & HOLC$holc_grade=="D",] %>%
  group_by(st_name) %>%
  summarize(count=n())

print(TX_HOLC)
## # A tibble: 8 × 2
##   st_name     count
##   <chr>       <int>
## 1 Austin         29
## 2 Dallas         41
## 3 El Paso        28
## 4 Fort Worth     74
## 5 Galveston       6
## 6 Houston        42
## 7 San Antonio    49
## 8 Waco            4

#Use Census API to get the poverty and black percentage of each census tract in San Antonio (1’)

library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.4.2
census_api_key("be13863f38ebdc792d7bf1fdbd7f8089b2a708f2", install = FALSE)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
var <- c(poptotal='B03002_001E', 
         white='B03002_003E',
         black='B03002_004E', 
         poverty='B17017_002E') 

st <-"TX"
ct <-"Bexar"

cbg <- get_acs(geography = "tract", variables = var, count=ct,
               state = st,output="wide", year = 2021, geometry = TRUE)
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |=================                                                     |  24%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |=======================                                               |  32%  |                                                                              |=========================                                             |  36%  |                                                                              |===========================                                           |  39%  |                                                                              |=============================                                         |  42%  |                                                                              |===============================                                       |  45%  |                                                                              |==================================                                    |  48%  |                                                                              |====================================                                  |  51%  |                                                                              |======================================                                |  55%  |                                                                              |========================================                              |  58%  |                                                                              |===========================================                           |  61%  |                                                                              |=============================================                         |  64%  |                                                                              |===============================================                       |  67%  |                                                                              |=================================================                     |  70%  |                                                                              |===================================================                   |  73%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |========================================================              |  80%  |                                                                              |==========================================================            |  83%  |                                                                              |============================================================          |  86%  |                                                                              |===============================================================       |  90%  |                                                                              |================================================================      |  91%  |                                                                              |=================================================================     |  93%  |                                                                              |==================================================================    |  94%  |                                                                              |====================================================================  |  97%  |                                                                              |======================================================================| 100%
cbg$poverty_pct <- 100 * (cbg$poverty /cbg$poptotal)
cbg$black_pct <- 100 * (cbg$black/cbg$poptotal)

#Merge the census data with HOLC data (1’), and make a bar chart showing the average black percentage for different HOLC grades for all San Antonio tracts (1’)

SA <- HOLC[HOLC$st_name=="San Antonio",]

SA <- SA[,c(2,8,14,18)]

SA$geoid <- as.character(SA$geoid)

names(SA)[3] <-"GEOID"

cbg<-cbg[,c(1,12,13)]

SA_HOLC <-merge(SA,cbg, by="GEOID")

avg_black_pct <- SA_HOLC %>%
  group_by(holc_grade) %>%
  summarise(avg=mean(black_pct))

ggplot(avg_black_pct, aes(x = holc_grade, y = avg, fill = holc_grade)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Black Population Percent by HOLC Grade",
       x= "HOLC Grade",
       y= "Average Black Population Percentage")

#Make a boxplot showing the HOLC_area data structure (y axis) of each grade (x axis) for San Antonio (1’).

ggplot(SA_HOLC, aes(x=holc_grade, y=holc_area, fill = holc_grade)) + 
  geom_boxplot() +
  labs(title = "San Antonio HOLC Area by Grade",
       x = "Holc Grade",
       y = "HOLC Area")