Dem 7283 HW 5

dat17 <- read_excel("acs5_2017_comp.xlsx", sheet = "2017acs5yr", skip = 1)
dat12 <- read_excel("acs5_2017_comp.xlsx", sheet = "2012acs5yr", skip = 1)
#replace.value(dat12, from = "-", to = as.integer(0), verbose = F, names = c(""))
dat12 <- lapply(dat12, FUN = function(x) recode(x, "'-'=0"))
dat12 <- as.data.frame(dat12)
dat17 <- lapply(dat17, FUN = function(x) recode(x, "'-'=0"))
dat17 <- as.data.frame(dat17)

dat17 <- dat17 %>% 
  dplyr::transmute(
    tract = Id2,
    pop_tot17 = Total..Estimate..Total.population,
    pop_mal17 = Male..Estimate..Total.population,
    pop_fem17 = Female..Estimate..Total.population,
    age_med17 = Total..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_med_mal17 = Male..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_med_fem17 = Female..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_dep17 = Total..Estimate..SUMMARY.INDICATORS...Age.dependency.ratio,
    age_dep_old17 = Total..Estimate..SUMMARY.INDICATORS...Old.age.dependency.ratio,
    age_dep_chld17 = Total..Estimate..SUMMARY.INDICATORS...Child.dependency.ratio,
    hh17 = Number..Estimate..Households,
    hhinc17 = Median.income..dollars...Estimate..Households,
    race_hisp17 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Hispanic.or.Latino..of.any.race.,
    race_nhw17 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...White.alone,
    race_nhb17 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...Black.or.African.American.alone,
    race_othr17 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...American.Indian.and.Alaska.Native.alone+
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...Asian.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...Native.Hawaiian.and.Other.Pacific.Islander.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...Some.other.race.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Total.population...Not.Hispanic.or.Latino...Two.or.more.races,
    hh_occ17 = Occupied.housing.units..Estimate..Occupied.housing.units,
    hh_occ_own17 = Owner.occupied.housing.units..Estimate..Occupied.housing.units,
    hh_occ_rnt17 = Renter.occupied.housing.units..Estimate..Occupied.housing.units,
    pop25_tot17 = Total..Estimate..Population.25.years.and.over,
    pop25_mal17 = Male..Estimate..Population.25.years.and.over,
    pop25_fem17 = Female..Estimate..Population.25.years.and.over,
    pop25_tot17_lths = Total..Estimate..Population.25.years.and.over...Less.than.9th.grade + Total..Estimate..Population.25.years.and.over...9th.to.12th.grade..no.diploma,
    pop25_tot17_hs = Total..Estimate..Population.25.years.and.over...High.school.graduate..includes.equivalency.,
    pop25_tot17_coll = Total..Estimate..Population.25.years.and.over...Some.college..no.degree + Total..Estimate..Population.25.years.and.over...Associate.s.degree,
    pop25_tot17_bach = Total..Estimate..Population.25.years.and.over...Bachelor.s.degree,
    pop25_tot17_grad = Total..Estimate..Population.25.years.and.over...Graduate.or.professional.degree,
    pop25_mal17_lths = Male..Estimate..Population.25.years.and.over...Less.than.9th.grade + Male..Estimate..Population.25.years.and.over...9th.to.12th.grade..no.diploma,
    pop25_mal17_hs = Male..Estimate..Population.25.years.and.over...High.school.graduate..includes.equivalency.,
    pop25_mal17_coll = Male..Estimate..Population.25.years.and.over...Some.college..no.degree + Male..Estimate..Population.25.years.and.over...Associate.s.degree,
    pop25_mal17_bach = Male..Estimate..Population.25.years.and.over...Bachelor.s.degree,
    pop25_mal17_grad = Male..Estimate..Population.25.years.and.over...Graduate.or.professional.degree,
    pop25_fem17_lths = Female..Estimate..Population.25.years.and.over...Less.than.9th.grade + Female..Estimate..Population.25.years.and.over...9th.to.12th.grade..no.diploma,
    pop25_fem17_hs = Female..Estimate..Population.25.years.and.over...High.school.graduate..includes.equivalency.,
    pop25_fem17_coll = Female..Estimate..Population.25.years.and.over...Some.college..no.degree + Female..Estimate..Population.25.years.and.over...Associate.s.degree,
    pop25_fem17_bach = Female..Estimate..Population.25.years.and.over...Bachelor.s.degree,
    pop25_fem17_grad = Female..Estimate..Population.25.years.and.over...Graduate.or.professional.degree
  )

dat12 <- dat12 %>% 
  dplyr::transmute(
    tract = Id2,
    pop_tot12 = Total..Estimate..Total.population,
    pop_mal12 = Male..Estimate..Total.population,
    pop_fem12 = Female..Estimate..Total.population,
    age_med12 = Total..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_med_mal12 = Male..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_med_fem12 = Female..Estimate..SUMMARY.INDICATORS...Median.age..years.,
    age_dep12 = Total..Estimate..SUMMARY.INDICATORS...Age.dependency.ratio,
    age_dep_old12 = Total..Estimate..SUMMARY.INDICATORS...Age.dependency.ratio...Old.age.dependency.ratio,
    age_dep_chld12 = Total..Estimate..SUMMARY.INDICATORS...Age.dependency.ratio...Child.dependency.ratio,
    hh12 = Total..Estimate..Households,
    hhinc12 = Median.income..dollars...Estimate..Households,
    race_hisp12 =Estimate..HISPANIC.OR.LATINO.AND.RACE...Hispanic.or.Latino..of.any.race.,
    race_nhw12 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...White.alone,
    race_nhb12 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...Black.or.African.American.alone,
    race_othr12 = Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...American.Indian.and.Alaska.Native.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...Asian.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...Native.Hawaiian.and.Other.Pacific.Islander.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...Some.other.race.alone +
      Estimate..HISPANIC.OR.LATINO.AND.RACE...Not.Hispanic.or.Latino...Two.or.more.races,
    hh_occ12 = Occupied.housing.total,
    hh_occ_own12 = Estimate..HOUSING.TENURE...Owner.occupied,
    hh_occ_rnt12 = Estimate..HOUSING.TENURE...Renter.occupied,
    pop25_tot12 = Total..Estimate..Population.25.years.and.over,
    pop25_mal12 = Male..Estimate..Population.25.years.and.over,
    pop25_fem12 = Female..Estimate..Population.25.years.and.over,
    pop25_tot12_lths = Total..Estimate..Less.than.9th.grade + Total..Estimate..9th.to.12th.grade..no.diploma,
    pop25_tot12_hs = Total..Estimate..High.school.graduate..includes.equivalency.,
    pop25_tot12_coll = Total..Estimate..Some.college..no.degree + Total..Estimate..Associate.s.degree,
    pop25_tot12_bach = Total..Estimate..Bachelor.s.degree,
    pop25_tot12_grad = Total..Estimate..Graduate.or.professional.degree,
    pop25_mal12_lths =Male..Estimate..Less.than.9th.grade + Male..Estimate..9th.to.12th.grade..no.diploma,
    pop25_mal12_hs = Male..Estimate..High.school.graduate..includes.equivalency.,
    pop25_mal12_coll = Male..Estimate..Some.college..no.degree + Male..Estimate..Associate.s.degree,
    pop25_mal12_bach = Male..Estimate..Bachelor.s.degree,
    pop25_mal12_grad = Male..Estimate..Graduate.or.professional.degree,
    pop25_fem12_lths = Female..Estimate..Less.than.9th.grade + Female..Estimate..9th.to.12th.grade..no.diploma,
    pop25_fem12_hs = Female..Estimate..High.school.graduate..includes.equivalency.,
    pop25_fem12_coll = Female..Estimate..Some.college..no.degree +Female..Estimate..Associate.s.degree,
    pop25_fem12_bach = Female..Estimate..Bachelor.s.degree,
    pop25_fem12_grad = Female..Estimate..Graduate.or.professional.degree
  )

#head(dat12)
#adjusting hhinc for inflation
# 2012 cpi = 229.6
# 2017 cpi = 245.1
dat12 <- dat12%>% 
  dplyr::mutate(hhinc12_adj = (hhinc12 * 245.1) / 229.6 )
#str(dat12)
dat <- dplyr::full_join(dat12, dat17)

## Joining, by = "tract"

dat <- dat %>%
  dplyr::mutate(
    pop_tot_delta = pop_tot17 - pop_tot12,
    pop_mal_delta = pop_mal17 - pop_mal12,
    pop_fem_delta = pop_fem17 - pop_fem12,
    age_med_delta = age_med17 - age_med12,
    age_med_mal_delta = age_med_mal17 - age_med_mal12,
    age_med_fem_delta = age_med_fem17 - age_med_fem12,
    age_dep_rat = age_dep12/age_dep17,
    age_dep_old_rat = age_dep_old12/age_dep_old17,
    age_dep_chld_rat = age_dep_chld12/ age_dep_chld17,
    hh_delta = hh12 - hh17,
    hhinc_delta = hhinc17 - hhinc12_adj,
    race_hisp_delta = race_hisp17 / pop_tot17 - race_hisp12 / pop_tot12,
    race_nhw_delta = race_nhw17  / pop_tot17 - race_nhw12 / pop_tot12,
    race_nhb_delta = race_nhb17  / pop_tot17 - race_nhb12 / pop_tot12,
    race_othr_delta = race_othr17  / pop_tot17 - race_othr12 / pop_tot12,
    hh_occ_delta = hh_occ17 - hh_occ12,
    hh_occ_own_delta = hh_occ_own17 - hh_occ_own12,
    hh_occ_rnt_delta = hh_occ_rnt17 - hh_occ_rnt12,
    pop25_tot_delta = pop25_tot17 - pop25_tot12,
    pop25_mal_delta = pop25_mal17 - pop25_mal12,
    pop25_fem_delta = pop25_fem17 - pop25_fem12,
    pop25_tot_lths_delta = pop25_tot17_lths - pop25_tot12_lths,
    pop25_tot_hs_delta = pop25_tot17_hs - pop25_tot12_hs,
    pop25_tot_coll_delta = pop25_tot17_coll - pop25_tot12_coll,
    pop25_tot_bach_delta = pop25_tot17_bach - pop25_tot12_bach,
    pop25_tot_grad_delta = pop25_tot17_grad - pop25_tot12_grad
  )

###Finding tracts that may have undergone gentrification
#dat12 <- dat12%>% 
 # dplyr::arrange(hhinc12)
quint_trct12 <- Hmisc::cut2(dat12$hhinc12, g=5)
levels(quint_trct12)

## [1] "[    0, 30988)" "[30988, 38056)" "[38056, 54308)" "[54308, 72132)"
## [5] "[72132,185867]"

cutpoints <- Hmisc::cut2(dat12$hhinc12, g = 5, onlycuts = T)
quint_df12 <- data.frame(dat12, quint_trct12)
#names(quint_df_17) <- c("value", "quint group")
#quint_df_17
table(quint_df12$quint_trct12)

## 
## [    0, 30988) [30988, 38056) [38056, 54308) [54308, 72132) [72132,185867] 
##             74             73             73             73             73

gent12 <- subset(quint_df12, quint_trct12 == "[    0, 30988)" | quint_trct12 == "[30988, 38056)")


#dat17 %>% 
 # dplyr::arrange(hhinc17)
quint_trct17 <- Hmisc::cut2(dat17$hhinc17, g=5)
levels(quint_trct17)

## [1] "[    0, 33929)" "[33929, 43125)" "[43125, 57676)" "[57676, 78404)"
## [5] "[78404,207188]"

cutpoints <- Hmisc::cut2(dat17$hhinc17, g = 5, onlycuts = T)
quint_df17 <- data.frame(dat17, quint_trct17)
#names(quint_df_17) <- c("value", "quint group")
#quint_df_17
#dplyr::arrange(quint_df12, tract)
#dplyr::arrange(quint_df17, tract)
table(quint_df17$quint_trct17)

## 
## [    0, 33929) [33929, 43125) [43125, 57676) [57676, 78404) [78404,207188] 
##             74             73             73             73             73

quint_df <- dplyr::full_join(quint_df12,quint_df17)

## Joining, by = "tract"

gent12 <- data.frame(0)
gent <- data.frame(0)

gent12 <- subset(quint_df, quint_trct12 == "[    0, 30988)" | quint_trct12 == "[30988, 38056)")

gent <- subset(gent12, quint_trct17 == "[43125, 57676)" | quint_df17 == "[57676, 78404)" | 
                    quint_df17 == "[78404,207188]")

## Warning in quint_trct17 == "[43125, 57676)" | quint_df17 == "[57676,
## 78404)": longer object length is not a multiple of shorter object length

gent <- gent[complete.cases(gent),]

gent17 <- subset(quint_df12, quint_trct17 == "[41010, 53611)" | 
  quint_trct17 == "[53611, 74069)" | quint_trct17 == "[74069,250001]")

intersect <- function(x, y) y[match(x, y, nomatch = 0)]
#intersect # the R function in base is slightly more careful
intersect(1:10, 7:20)

## [1]  7  8  9 10

#gent <- intersect(gent_12$GEOID,gent_17$GEOID)

Idea for Latent construct

For this Homework, I’m going to work on the idea of gentrification as a latent construct. Forgive the code above, I went a little above and beyond (working a bit for my paper) and tidied up some data from the 2012 and 2017 ACS 5 yr surveys that I got from American Fact Finder.

I’ll start by saying I tried both tidycensus and the acs packages/libraries, but after much frustration was unable to extract the data I needed for the years I was looking for. I ended up doing the painstaking fact finder tables and put together a number of variable that I thought would be useful for descriptive statistics and possibly to help identify how far a neighborhood (census tract) was in terms of gentrification.

Bexar county was my focus.

I pulled two 5yr samples and then concentrated on either the delta between 2017 and 2012 or the ratio of the two years (as in the case of age dependency). Also of note, I had issues with the dplyr package (usually autoloaded through tidyverse), so to use dplyr commands I had to force them with dplyr::.

I ended up choosing 11 variables, all z-scaled:
inc = differrence in 2017 and 2012 median incomes by census tract
age = differrence in 2017 and 2012 median age by census tract
white = differrence in 2017 and 2012 proportion non-Hispanic white population by census tract (increasing proportion of white population should be a sign of gentrification) black = differrence in 2017 and 2012 proportion non-Hispanic black population by census tract (multiplied by -1 with the thought that a lower proportion of non-Hispanic black population in a census tract would be a sign of gentrification)
hisp = differrence in 2017 and 2012 proportion HIspanic population by census tract (multiplied by -1 with the thought that a lower proportion of hispanic population in a census tract would be a sign of gentrification) grad = differrence in 2017 and 2012 with graduate/professional degrees
bach = differrence in 2017 and 2012 with bachelors degrees
lths = differrence in 2017 and 2012 who have do not have a high school degree (multiplied by -1 thinking that gentrification would show signs of higher education within a neighborhood)
owner = differrence in 2017 and 2012 occupied homes owned
dep_old = ratio of 2012/2017 old age dependency (thinking that gentrification would bring with it a younger population and therefore decrease the number of elderly)
dep_chld = ratio of 2012/2017 child age dependency (multiplied by -1 due to the above comment)

Scree plot

The Scree plot indicates that we should choose no more than 5 principal components, however, when looking at the summary table, and with the thought that we’d like to be able to come close to explaining nearly 70 percent of the variation, and reduce the number of variables, we will work with the first four principal components.

####now working on PCA
dat <- dat[complete.cases(dat),]

dat_pc <- dat %>% 
  dplyr::filter(age_dep_old_rat <= 100) %>% 
  dplyr::transmute(
    inc = scale(hhinc_delta),
    age = -1 * scale(age_med_delta),
    white = scale(race_nhw_delta),
    black = -1 * scale(race_nhb_delta),
    hisp = -1 * scale(race_hisp_delta),
    grad = scale(pop25_tot_grad_delta),
    bach = scale(pop25_tot_bach_delta),
    lths = -1 * scale(pop25_tot_lths_delta),
    owner = scale(hh_occ_own_delta),
    dep_old = scale(age_dep_old_rat),#when I scaled this it returned all NaN's
    dep_chld = -1 * scale(age_dep_chld_rat))###expecting gentrified areas to have less elderly and more children
dat_pc.pc <- prcomp(~ inc + age + white + black +hisp + grad + bach + lths + owner + dep_old + dep_chld, data = dat_pc, 
                    center = T, scale = T, retx = T)
#Screeplot
screeplot(dat_pc.pc, type = "l", main = "Scree Plot")
abline(h=1)

summary(dat_pc.pc)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.6144 1.3274 1.2408 1.1188 1.04350 0.95184 0.89153
## Proportion of Variance 0.2369 0.1602 0.1399 0.1138 0.09899 0.08236 0.07226
## Cumulative Proportion  0.2369 0.3971 0.5371 0.6509 0.74986 0.83223 0.90448
##                            PC8     PC9    PC10   PC11
## Standard deviation     0.74272 0.53840 0.34811 0.2967
## Proportion of Variance 0.05015 0.02635 0.01102 0.0080
## Cumulative Proportion  0.95463 0.98098 0.99200 1.0000

PC rotation

When exmamining the first four PCs impact, we’ll use |.30| as the the bar to determine the characteristics of each of the PC’s

The first PC is the primarily focused on education and is inline with expectations. An increase in the number of bachelors degrees and beyond as well as less people who have not completed high school is the strongest component of our analysis.

The second PC shows that an increase in the median age of a census tract(unexpected, could possibily be accounted for by the difference in the age of white incomers and hispanic outgoers?) as well as an increase in the proportion of white and decrease of the proportion of hispanic population is the second strongest component.

The third PC shows that the median income and lower overall age and a correspondingly decrease in the dependency on the elderly and children (the lower dependency on children is slightly unexpected, but possibly explained for by the decreasing proportion of hispanic population in our second PC)

The fourth PC shows a decrease in the median age, increasing proportion of black population and decreasing proportion of Hispanic population. The increasing proportion of black population is unexpected, but could possibly be explained by the relatively low proportion (as compared to the US) within Bexar county and perhaps, census tracts that have Hispanics being displaced by non-Hispanic black is an indicator?

This last PC is what I’d like to explore further studies.

dat_pc.pc$rotation

##                   PC1         PC2         PC3          PC4         PC5
## inc      -0.005498607  0.14911234 -0.44247962  0.171165156 -0.08871078
## age       0.030710203 -0.31914498 -0.33917429 -0.316493630 -0.60669408
## white    -0.240996034  0.59413704  0.05996152 -0.002176869 -0.30305099
## black    -0.060730187  0.06858280  0.14371492  0.730798914 -0.39111727
## hisp     -0.171476227  0.57705030 -0.04531274 -0.471883221 -0.04447253
## grad      0.547787256  0.16037605 -0.11849276  0.020477304 -0.01539899
## bach      0.554325758  0.16419205 -0.08822108  0.025775661 -0.02685614
## lths      0.396051571  0.02397296 -0.10789816 -0.101679692  0.18084847
## owner     0.291139497  0.26650568 -0.04046461  0.139411005 -0.25578614
## dep_old  -0.189420190 -0.12349356 -0.65423261  0.048361266 -0.09773856
## dep_chld  0.151684947 -0.21375774  0.44693040 -0.283887303 -0.51828404
##                  PC6         PC7         PC8         PC9         PC10
## inc       0.59444145 -0.57498009  0.23327616 -0.05209086 -0.037602456
## age      -0.16718745 -0.01176028 -0.15836212 -0.51029550 -0.016886356
## white    -0.26576570 -0.08771424 -0.02187757  0.05327309 -0.641052254
## black    -0.28774886 -0.15030947 -0.03197276 -0.03680344  0.419345364
## hisp     -0.04755128 -0.02328592  0.03876207 -0.01290602  0.637106876
## grad     -0.16654026  0.06731708  0.37597670 -0.05573136 -0.051086078
## bach     -0.15334358  0.14716710  0.31309674 -0.03430183  0.011384242
## lths     -0.29533438 -0.54135408 -0.59801928  0.22305326  0.013777164
## owner     0.49925702  0.46712523 -0.52872507  0.08400794  0.005541665
## dep_old  -0.21208656  0.24919669  0.06460623  0.62898239  0.045190788
## dep_chld  0.17402641 -0.19715901  0.19173162  0.52518037  0.024394535
##                  PC11
## inc      -0.047665840
## age      -0.007943093
## white    -0.029870164
## black     0.017171685
## hisp      0.022944160
## grad      0.692943117
## bach     -0.715602637
## lths     -0.005023965
## owner     0.060094634
## dep_old   0.006531921
## dep_chld  0.003844385

hist(dat_pc.pc$x[,1])

hist(dat_pc.pc$x[,2])

hist(dat_pc.pc$x[,3])

hist(dat_pc.pc$x[,4])

Here we can see the histograms of the first four principal components.

cor(dat_pc.pc$x[,1:4])

##               PC1           PC2           PC3           PC4
## PC1  1.000000e+00 -4.792526e-16 -6.273293e-16 -3.004241e-18
## PC2 -4.792526e-16  1.000000e+00  4.426671e-16  2.691316e-16
## PC3 -6.273293e-16  4.426671e-16  1.000000e+00  1.482957e-16
## PC4 -3.004241e-18  2.691316e-16  1.482957e-16  1.000000e+00

From the correlation matrix we can see that the PCs are indeed orthagonal

scores <- data.frame(dat_pc.pc$x)
scores$name <- rownames(dat_pc.pc$x)
dat_pc$name <- rownames(dat_pc)
dat_pc <- merge(dat_pc, scores, by.x = "name", by.y = "name", all.x = F)
tail(names(dat_pc), 20)

##  [1] "white"    "black"    "hisp"     "grad"     "bach"     "lths"    
##  [7] "owner"    "dep_old"  "dep_chld" "PC1"      "PC2"      "PC3"     
## [13] "PC4"      "PC5"      "PC6"      "PC7"      "PC8"      "PC9"     
## [19] "PC10"     "PC11"

#round(cor(dat_pc[, c("inc", "age", "white", "black", "hisp", "grad", "bach", "lths", "owner",
                  #   "dep_old", "dep_chld")],3))
#round(cor(dat_pc[, c("inc", "age", "white", "black", "hisp", "grad", "bach", "lths", "owner",
                 #    "dep_old", "dep_chld", "PC1", "PC2", "PC3", "PC4")], method = "spearman"),3)

If deemed appropriate….

Well, I think it would have been appropriate to conduct some testing, but I now realize that I coded my variables incorrectly. I’ll need to recode so that I can stratify the years so that I can apply the different weights for each of the 5 yr ACS surveys. I do plan on recoding, however, I have spent more time than I planned on this home work and many other duteis are calling, so i am now going to deem this as inappropriate.