Some background on segregation

The index of dissimilarity has long been the industry standard for measuring segregation. Since 1976, a host of new measures, rediscovered old indices, and a variety of definitions for segregation have been proposed. There is little agreement about which measure to use under which circumstances Massey and Denton (1988) attempted to quell the disagreement by incorporating many indices under one conceptual framework.

Methodological issues in the measurement of spatial segregation Segregation can be thought of as the extent to which individuals of different groups occupy or experience different social environments. A measure of segregation, then, requires that we define the social environment of each individual that we quantify the extent to which these social environments differ across individuals.

The Dimensions of Residential Segregation Segregatin can be thought of as the degree to which two or more groups live separately from one another. Living apart could imply that groups are segregated in a variety of ways. Researchers argue for the adoption of one index and exclude others—fruitless according to Massey and Denton. Massey and Denton (1988) identify 5 distinct dimensions of residential segregation:

Evenness is the degree to which the percentage of minority members within residential areas approaches the minority percentage of the entire neighborhood

Exposure is the degree of potential contact between minority and majority members in neighborhoods

Concentration is the relative amount of physical space occupied by a minority group

Centralization is the degree to which minority members settle in and around the center of a neighborhood

Clustering is the extent to which minority areas adjoin one another in space

Doing this in R

First we need to load some libraries. We will use R to get all of our census data for us, either from the 2010 100% Summary File 1 (via the UScensus2010 library suite), or the acs package.

library(spdep)
library(seg)
library(UScensus2010)
library(UScensus2010tract)
## 
## UScensus2010tract: US Census 2010 Tract Level Shapefiles and Additional Demographic
## Data 
## Version 1.00 created on 2011-11-06 
## copyright (c) 2011, Zack W. Almquist, University of California-Irvine
## Type help(package="UScensus2010tract") to get started.
## 
## For citation information, type citation("UScensus2010tract").
library(UScensus2010county)
## 
## UScensus2010county: US Census 2010 County Level Shapefiles and Additional
## Demographic Data 
## Version 1.00 created on 2011-11-06 
## copyright (c) 2011, Zack W. Almquist, University of California-Irvine
## Type help(package="UScensus2010county") to get started.
## 
## For citation information, type citation("UScensus2010county").
library(acs)
library(sp)
library(RColorBrewer)

Before we can use the SF 1 data, you must install the various geographies you want to use to make your segregation indices. I’ll be using tracts, summed up to counties. So, i’ll do: install.tract("osx") and install.county("osx") since i’m on a Mac. I’ve had bad luck using install.*("windows") on my Windows box, and typically use install.*("linux") on either linux or windows.

The first example mimics that from Sparks (2014), Where I show how to calculate three common measures of segregation: Dissimiliarity, Isolation/Interaction and Theile’s Entropy index for multiple groups.

The general process is 1. Create a lower-level unit data set, that has your populations subdivided by race, SES, etc. Here i’m using census tracts. + 1a do any calculations, such as percentages on this data set. 2. Sum the populations up to the higher level unit, such as the county below + 2a Again, do any calculations you need to do 3. Use the formula to put the various parts together. 4. Use tapply() to sum your index across lower level units within higher level units 5. congratulate youself.

#create a set of three race variables, total, white and black for each census tract in the state of Texas from the 2010 SF1 data
mydem<-demographics(dem=c("P0030001", "P0030002", "P0030003","P0030004","P0030005","P0030006","P0030007","P0030008"), state="48", statefips=TRUE, level="tract")

#assemble a data frame with sensical names and create a county FIPS code for each tract
#First, I assemble the "other" race group, which in the SF1 is composed of 5 tables P0030004 to P0030008
other<-data.frame(cofips=substr(rownames(mydem), 1,5),fips=rownames(mydem),oth1=unlist(mydem[1,"P0030004"]),
                  oth2=unlist(mydem[1,"P0030005"]),
                  oth3=unlist(mydem[1,"P0030006"]),
                  oth4=unlist(mydem[1,"P0030007"]),
                  oth5=unlist(mydem[1,"P0030008"]))

#build the tract population data
trdat<-data.frame(cofips=substr(rownames(mydem), 1,5),fips=rownames(mydem), total=unlist(mydem[1,"P0030001"]), white=unlist(mydem[1,"P0030002"]), black=unlist(mydem[1,"P0030003"]),other=apply(other[,3:7],1,sum))

#sort the data by county and tract
trdat<-trdat[order(trdat$cofips, trdat$fips),]
#look at the first few cases
head(trdat)
##      cofips        fips total white black other
## 4686  48001 48001950100  4685  4012   452   221
## 4676  48001 48001950401  5422  1825  2266  1331
## 4677  48001 48001950402  7535  2591  3248  1696
## 4683  48001 48001950500  4377  2737   800   840
## 4682  48001 48001950600  6405  3831  1674   900
## 4681  48001 48001950700  2640  1051  1164   425
#We need the county-level totals for the total population and each race group
co_total<-tapply(trdat$total, trdat$cofips, sum)
co_total<-data.frame(cofips=names(unlist(co_total)), pop=unlist(co_total))
co_wht<-tapply(trdat$white, trdat$cofips, sum)
co_wht<-data.frame(cofips=names(unlist(co_wht)), pop=unlist(co_wht))
co_blk<-tapply(trdat$black, trdat$cofips, sum)
co_blk<-data.frame(cofips=names(unlist(co_blk)), pop=unlist(co_blk))
co_oth<-tapply(trdat$other, trdat$cofips, sum)
co_oth<-data.frame(cofips=names(unlist(co_oth)), pop=unlist(co_oth))

#For the multi-group measure of segregation, we also need population proportions
c_pwhite<-co_wht$pop/co_total$pop
c_pblack<-co_blk$pop/co_total$pop
c_pother<-co_oth$pop/co_total$pop

#we assemble them into a county-level data frame with easy names
county_dat<-data.frame(cofips=co_total$cofips, co_total=co_total$pop, co_wht_total=co_wht$pop, co_blk_total=co_blk$pop, co_oth_total=co_oth$pop, c_pwhite=c_pwhite, c_pblack=c_pblack, c_pother=c_pother)

#Now I make the county-level Entropy measure to be used later, it's easier to do it before
#merging back to the tracts.
county_dat$c_ent<-county_dat$c_pwhite*(log(1/county_dat$c_pwhite))+county_dat$c_pblack*(log(1/county_dat$c_pblack))+county_dat$c_pother*(log(1/county_dat$c_pother))
county_dat$c_ent<-ifelse(is.na(county_dat$c_ent)==T, 0,county_dat$c_ent)


#we merge the county data back to the tract data by the county FIPS code
merged<-merge(x=trdat,y=county_dat, by="cofips", all.x=T )
#have a look and make sure it looks ok
head(merged)
##   cofips        fips total white black other co_total co_wht_total
## 1  48001 48001950100  4685  4012   452   221    58458        38632
## 2  48001 48001950401  5422  1825  2266  1331    58458        38632
## 3  48001 48001950402  7535  2591  3248  1696    58458        38632
## 4  48001 48001950500  4377  2737   800   840    58458        38632
## 5  48001 48001950600  6405  3831  1674   900    58458        38632
## 6  48001 48001950700  2640  1051  1164   425    58458        38632
##   co_blk_total co_oth_total  c_pwhite  c_pblack  c_pother     c_ent
## 1        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 2        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 3        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 4        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 5        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 6        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364

That is the mechanics for setting up the data. Now we will calculate the indices:

#2 group segregation measures
#Now we begin the segregation calculations
#Calculate the tract-specific contribution to the county dissimilarity index
merged$d.wb<-.5*(abs(merged$white/merged$co_wht_total - merged$black/merged$co_blk_total))
#The county-level dissimilarity index is just the sum of the tract values within a county, this is easily done with the tapply() function, which in this case will sum the tract-specific contributions to the index within each county.
dissim.wb.tr<-tapply(merged$d.wb, merged$cofips, sum, na.rm=T)
hist(dissim.wb.tr)

#this in interaction index  for blacks and white
#first population is minority population, second is non-minority population
merged$int.wb<-(merged$black/merged$co_blk_total * merged$white/merged$total)
#Again, we sum these within each county
int.wb.tr<-tapply(merged$int.wb, merged$cofips, sum, na.rm=T)
head(int.wb.tr)
##     48001     48003     48005     48007     48009     48011 
## 0.5198444 0.7840112 0.5427863 0.8582850 0.9498711 0.9331931
hist(int.wb.tr)

#this is the isolation index for blacks
merged$iso.b<-(merged$black/merged$co_blk_total * merged$black/merged$total)
#Again, we sum these within each county
isol.b.tr<-tapply(merged$iso.b, merged$cofips, sum, na.rm=T)
head(isol.b.tr)
##       48001       48003       48005       48007       48009       48011 
## 0.307966255 0.018252381 0.290047066 0.018043037 0.006946279 0.005786428
hist(isol.b.tr)

So, those are the typical dissimilarity, interaction and isolation indices for two groups. Next, we calculate Theile’s multi-group Entropy index using whites, blacks and all others, but of course we could break this into as many groups as you want, with only a few more lines of code.

#Multi-group segregation
#Here, I calculate the Theile Entropy index
#Just like for the counties, we also need population proportions at the tract level
merged$tr_pwhite<-merged$white/merged$total
merged$tr_pblk<-merged$black/merged$total
merged$tr_pother<-merged$other/merged$total

#This is the tract-level entropy measure
merged$tr_ent<-merged$tr_pwhite*(log(1/merged$tr_pwhite))+merged$tr_pblk*(log(1/merged$tr_pblk))+merged$tr_pother*(log(1/merged$tr_pother))
merged$tr_ent<-ifelse(is.na(merged$tr_ent)==T, 0,merged$tr_ent)

head(merged)
##   cofips        fips total white black other co_total co_wht_total
## 1  48001 48001950100  4685  4012   452   221    58458        38632
## 2  48001 48001950401  5422  1825  2266  1331    58458        38632
## 3  48001 48001950402  7535  2591  3248  1696    58458        38632
## 4  48001 48001950500  4377  2737   800   840    58458        38632
## 5  48001 48001950600  6405  3831  1674   900    58458        38632
## 6  48001 48001950700  2640  1051  1164   425    58458        38632
##   co_blk_total co_oth_total  c_pwhite  c_pblack  c_pother     c_ent
## 1        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 2        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 3        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 4        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 5        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
## 6        12310         7516 0.6608505 0.2105785 0.1285709 0.8655364
##          d.wb     int.wb       iso.b tr_pwhite    tr_pblk  tr_pother
## 1 0.033566807 0.03144356 0.003542495 0.8563501 0.09647812 0.04717182
## 2 0.068418678 0.06195912 0.076931154 0.3365917 0.41792696 0.24548137
## 3 0.098390888 0.09072816 0.113734109 0.3438620 0.43105508 0.22508295
## 4 0.002930093 0.04063780 0.011878056 0.6253141 0.18277359 0.19191227
## 5 0.018410254 0.08133742 0.035541334 0.5981265 0.26135831 0.14051522
## 6 0.033675922 0.03764382 0.041691160 0.3981061 0.44090909 0.16098485
##      tr_ent
## 1 0.5024684
## 2 1.0759163
## 3 1.0654821
## 4 0.9210035
## 5 0.9338686
## 6 1.0217681
#Now I calculate each tract's contribution to the H index
merged$Hcalc<-(merged$total*(merged$c_ent-merged$tr_ent))/(merged$co_total*merged$c_ent)
merged$Hcalc<-ifelse(is.na(merged$Hcalc)==T, 0, merged$Hcalc)
hist(merged$Hcalc)

#Then, we sum across tracts within counties to get the H index for each county
hindex<-unlist(tapply(merged$Hcalc, merged$cofips, sum))
head(hindex)
##      48001      48003      48005      48007      48009      48011 
## 0.11995495 0.01286551 0.13729605 0.01428389 0.03167802 0.00000000
hist(hindex)

That gives us four indices. Now we turn to more specialized indices: The spatial clustering index, that myself and some colleagues used in this paper and the general neighborhood sorting index derived by Paul Jargowsky.

Both of these are custom-written functions that work but I would minimize the amount of tinkering with them. I plan on putting these on github in a more proper form, but here they are in the raw. They both involve a series of checks that ensure that spatial weights are correct for counties with only 1 tract, etc.

#spatial proximity index
data(texas.tract10)
dat<-texas.tract10[, c(1,2,3,4,12:19)]

#this is my own function to calculate the spatial proximity index
spatseg<- function(dat) {
  nameco<-unique(dat$county)
  
  #####need to change fips and county for particular data set#####
  nwncounty<-tapply(as.character(dat$fips), as.character(dat$county), table)
  nwnco<-sapply(nwncounty, sum, USE.NAMES=T,simplify=T)
  ncos<-ifelse(nwnco>1,1,0)
  counties<-match(nameco, names(nwnco))
  t1<-names(ncos)
  t2<-sort(as.character(nameco))
  dat.sub<-data.frame(county=t2, gt1=ncos)
  subsamp<-subset(dat.sub, dat.sub$gt1>0)
  nco<-dim(subsamp) [1]
  dat.new<-dat[which(as.character(dat$county) %in% as.character(subsamp$county)),]
  nwncounty2<-tapply(dat.new$fips, dat.new$county, table)
  nwnco2<-sapply(nwncounty2, sum)
  nconew<- length(nwnco2[nwnco2>0])
  
  ####potentially need to change these to suite your population subgroups.
  Ptt<-numeric(nconew)
  Ptw <-numeric(nconew)
  Ptb <-numeric(nconew)
  Ptot<-numeric(nconew)
  Wtot<-numeric(nconew)
  Btot<-numeric(nconew)
  
  for (i in 1:nconew)
  {
    
    coi<-dat.new[as.character(dat.new$county)==as.character(subsamp$county[i]),]
#####You will need to customize these names, per your dataset #####
    popt<-coi$P0030001
    popw<-coi$P0030002
    popb<-coi$P0030003
    Ptot  <-aggregate(coi$P0030001, by=list(coi$county), sum, na.rm=T)$x
    Wtot  <-aggregate(coi$P0030002, by=list(coi$county), sum, na.rm=T)$x
    Btot   <-aggregate(coi$P0030003, by=list(coi$county), sum, na.rm=T)$x
    co.nb<-poly2nb(coi, queen=T)
    co.lw<-nb2listw(co.nb, zero.policy=T, style="B")
    co.sm<-nb2mat(co.nb, style="B", zero.policy=T)
     Ptt[i]<-(t(popt)%*%co.sm%*%popt)/Ptot^2
    Ptw[i]<-(t(popw)%*%co.sm%*%popw)/Wtot^2
    Ptb[i]<-(t(popb)%*%co.sm%*%popb)/Btot^2
   #  print (unique(as.character(coi$county)))
  }   
  return(list(Ptt=Ptt , Ptw=Ptw, Ptb=Ptb))
}
spatdat<-spatseg(dat)
Ptt<-spatdat$Ptt
Ptw<-spatdat$Ptw
Ptb<-spatdat$Ptb

####Change fips and county for particular data set####
nameco<-unique(dat$county)
nwncounty<-tapply(as.character(dat$fips), as.character(dat$county), table)
nwnco<-sapply(nwncounty, sum, USE.NAMES=T,simplify=T)
ncos<-ifelse(nwnco>1,1,0)

t1<-names(ncos)
t2<-sort(as.character(nameco))

dat.sub<-data.frame(county=t2, gt1=ncos)
goodcos<-dat.sub[dat.sub$gt1>0,]
dim(goodcos)[1]
## [1] 210
dat.new<-dat[which(as.character(dat$county) %in% as.character(goodcos$county)),]

popdat<-slot(dat.new, "data")

dat1<-subset(popdat, popdat$county %in%goodcos$county)

co.pop.t<-tapply(dat1$P0030001, as.character(dat1$county), sum)   
co.pop.w<- tapply(dat1$P0030002, as.character(dat1$county), sum)
co.pop.b<- tapply(dat1$P0030003, as.character(dat1$county), sum)    


P.tt<-Ptt
P.ww<-Ptw
P.bb<-Ptb

#repeat for white and black
Sp.wb<-((P.ww*co.pop.w)+(P.bb*co.pop.b))/(P.tt*co.pop.t)

Spprox<-data.frame(cofips=paste("48", names(Sp.wb), sep=""), spsegr=Sp.wb) 
head(Spprox)
##     cofips    spsegr
## 001  48001 0.8965521
## 003  48003 0.8163956
## 005  48005 0.9118763
## 007  48007 0.8893582
## 009  48009 0.9576126
## 013  48013 0.8602982
hist(Spprox$spsegr)