In this project we use First Name, Last Name, and Census Ethnicity Data from the American Community Survey to predict a person’s ethnicity. For professional purposes, this model has been used to predict the primary ethnicity of a property owner for a granular analysis of neighborhood ethnicity distribution.

Data

The datasets in this project are obtained from multiple sources, both private and public.

Target Variable - Ethnicity

Our dependent variable, ethnicity, is a nominal variable with 6 categories:

  • White = 1
  • Asian = 2
  • Hispanic = 3
  • Black = 4
  • Other = 5
  • Arab = 6

Load Libraries

library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringdist)
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:strucchange':
## 
##     boundary

Import Data

df.DataTree <- read.csv("~/Models/DataTree/Raw/06065.csv", sep="|", na.strings=c(""," ","NA", "<NA>", "\\N"), stringsAsFactors = FALSE, fill=TRUE, quote="", colClasses = "character")

#Source scripts to format Data Tree file
source("~/Models/Source/APN_Formatting.R") #Format APN for by county
## 
## Attaching package: 'taRifx'
## The following objects are masked from 'package:dplyr':
## 
##     between, distinct, first, last
source("~/Models/Source/Extract_Residential.R") #Extract residential properties
source("~/Models/Source/GeoID_processing.R") #Import GeoID and merge by apn

#Training Data
train.Ethnicity <- read.csv("~/Models/Ethnicity/train_ethnicity.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")

#Census Demographics
df.CA_Block_Group <- read.csv("~/Models/Ethnicity/Census/ethnicity_block_group.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")

df.CA_Census_Tract <- read.csv("~/Models/Ethnicity/Census/ethnicity_tract.csv" , na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")

#Format geoTract and geoBlockGroup ID
df.CA_Block_Group$geoBlockGroup <- sprintf("%012.0f", df.CA_Block_Group$geoBlockGroup)
df.CA_Census_Tract$geoTract <- sprintf("%011.0f", df.CA_Census_Tract$geoTract) 

df.CA_Block_Group$geoBlockGroup <- as.character(df.CA_Block_Group$geoBlockGroup)
df.CA_Census_Tract$geoTract <- as.character(df.CA_Census_Tract$geoTract)

#Percent Ethnicity by Last Name
df.NameLastEthnicityPct <- read.csv("~/Models/Ethnicity/Name Classification/nameLastEthnicityPct.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
#Percent Ethnicity by First Name
df.NameFirstEthnicityPct <- read.csv("~/Models/Ethnicity/Name Classification/nameFirstEthnicityPct.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")

Data Pre-processing

Extract and Engineer Name Features

df.OwnerNamesMaster <- select(df.DataTree, apn, OwnerStd1FirstName, OwnerStd1LastName, CurrSaleBuyer1FullName, OwnerStd1CorpInd, OwnerStdNAME1FULL, geoID, SitusStdZIP5, county)

If OwnerStd1LastName is missing, replace with first word (last name) in CurrSaleBuyer1FullName

df.OwnerNamesMaster$OwnerStd1LastName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1LastName), word(df.OwnerNamesMaster$CurrSaleBuyer1FullName, 1), df.OwnerNamesMaster$OwnerStd1LastName)

If OwnerStd1FirstName is missing, replace with second word (first name) in CurrSaleBuyer1FullName

df.OwnerNamesMaster$OwnerStd1FirstName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1FirstName), word(df.OwnerNamesMaster$CurrSaleBuyer1FullName, 2), df.OwnerNamesMaster$OwnerStd1FirstName)

If OwnerStd1LastName is missing, replace with first word (last name) in OwnerStdNAME1FULL

df.OwnerNamesMaster$OwnerStd1LastName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1LastName), word(df.OwnerNamesMaster$OwnerStdNAME1FULL, 1), df.OwnerNamesMaster$OwnerStd1LastName)

If OwnerStd1FirstName is missing, replace with second word (first name) in OwnerStdNAME1FULL

df.OwnerNamesMaster$OwnerStd1FirstName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1FirstName), word(df.OwnerNamesMaster$OwnerStdNAME1FULL, 2), df.OwnerNamesMaster$OwnerStd1FirstName)

Merging Datasets

Merge Last Names - Simple Merge

df.OwnerNamesMergeLast <- inner_join(df.OwnerNamesMaster, df.NameLastEthnicityPct, by = c("OwnerStd1LastName" = "nameLast"))
#Obtain set of names that were not matched in simple merge
df.OwnerNamesMissingLast <- anti_join(df.OwnerNamesMaster, df.OwnerNamesMergeLast, by = "OwnerStd1LastName")

Perform Approximate String Matching on Last Name

We use approximate string matching to match mispelled surnames or names that are uniquely spelled, yet can still be said to retain the ethnic origin of the majority of characters in the name i.e. Gonzales, Gonzáles, Gonzalés, and Gonzalez are all of Spanish origin.

strMatch <- amatch(df.OwnerNamesMissingLast$OwnerStd1LastName, df.NameLastEthnicityPct$nameLast, maxDist = 2)
strMatch <- as.data.frame(strMatch)
df.OwnerNamesMissingLast <- bind_cols(df.OwnerNamesMissingLast, strMatch)
df.NameLastEthnicityPct$rownum <- row(df.NameLastEthnicityPct)
df.OwnerNamesMissingLast <- left_join(df.OwnerNamesMissingLast, df.NameLastEthnicityPct, by=c("strMatch" = "rownum"))
## Warning: Column `strMatch`/`rownum` has different attributes on LHS and RHS
## of join
df.OwnerNamesMissingLast$strMatch <- NULL
df.OwnerNamesMissingLast$nameLast <- NULL
df.OwnerNamesLast <- bind_rows(df.OwnerNamesMergeLast, df.OwnerNamesMissingLast)
df.OwnerNamesLast <- df.OwnerNamesLast %>%
  select(-CurrSaleBuyer1FullName, -OwnerStdNAME1FULL )

Merge First Names - Simple

df.OwnerNamesMergeFirst <- inner_join(df.OwnerNamesMaster, df.NameFirstEthnicityPct, by = c("OwnerStd1FirstName" = "nameFirst"))
df.OwnerNamesMissingFirst <- anti_join(df.OwnerNamesMaster, df.OwnerNamesMergeFirst, by = "OwnerStd1FirstName")

Perform Approximate String Matching on First Name

Approximate String Matching First Name Matched string returns row number of first name in nameFirstEthnicityPct. Max string distance = 2

strMatch <- amatch(df.OwnerNamesMissingFirst$OwnerStd1FirstName, df.NameFirstEthnicityPct$nameFirst, maxDist = 1)
strMatch <- as.data.frame(strMatch)
df.OwnerNamesMissingFirst <- bind_cols(df.OwnerNamesMissingFirst, strMatch)
df.NameFirstEthnicityPct$rownum <- row(df.NameFirstEthnicityPct)
df.OwnerNamesMissingFirst <- left_join(df.OwnerNamesMissingFirst, df.NameFirstEthnicityPct, by=c("strMatch" = "rownum"))
## Warning: Column `strMatch`/`rownum` has different attributes on LHS and RHS
## of join
df.OwnerNamesFirst <- bind_rows(df.OwnerNamesMergeFirst, df.OwnerNamesMissingFirst)
df.OwnerNamesFirst <- df.OwnerNamesFirst %>%
  select(-CurrSaleBuyer1FullName, -OwnerStdNAME1FULL )

GeoID Formatting & Extracting Census BG and Tract Features

df.OwnerNamesLast$geoBlockGroup <- substr(df.OwnerNamesLast$geoID, 1, 12)
df.OwnerNamesLast$geoTract <- substr(df.OwnerNamesLast$geoID, 1, 11)

Merge Census Demographic Data

df.ethnicityComplete <- left_join(df.OwnerNamesLast, df.CA_Block_Group, by = "geoBlockGroup")
df.ethnicityComplete <- left_join(df.ethnicityComplete, df.CA_Census_Tract, by = "geoTract")

Missing Census Data Imputation

If census tract ethnicity data is missing we use the zip code ethnicity distribution to replace the missing data. Then we replace missing census block group data with census tract ethnicity data.

pctTractWhite

df.ethnicityComplete <- df.ethnicityComplete %>%
  group_by(SitusStdZIP5) %>%
  mutate(mean_pctZipWhite = mean(as.numeric(pctTractWhite), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)

df.ethnicityComplete$pctTractWhite[is.na(df.ethnicityComplete$pctTractWhite)] <- df.ethnicityComplete$mean_pctZipWhite

pctTractAsian

df.ethnicityComplete <- df.ethnicityComplete %>%
  group_by(SitusStdZIP5) %>%
  mutate(mean_pctZipAsian = mean(as.numeric(pctTractAsian), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)

df.ethnicityComplete$pctTractAsian[is.na(df.ethnicityComplete$pctTractAsian)] <- df.ethnicityComplete$mean_pctZipAsian

pctTractBlack

df.ethnicityComplete <- df.ethnicityComplete %>%
  group_by(SitusStdZIP5) %>%
  mutate(mean_pctZipBlack = mean(as.numeric(pctTractBlack), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)

df.ethnicityComplete$pctTractBlack[is.na(df.ethnicityComplete$pctTractBlack)] <- df.ethnicityComplete$mean_pctZipBlack

pctTractHispanic

df.ethnicityComplete <- df.ethnicityComplete %>%
  group_by(SitusStdZIP5) %>%
  mutate(mean_pctZipHispanic = mean(as.numeric(pctTracHispanic), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)

df.ethnicityComplete$pctTracHispanic[is.na(df.ethnicityComplete$pctTracHispanic)] <- df.ethnicityComplete$mean_pctZipHispanic

pctTractOther

df.ethnicityComplete <- df.ethnicityComplete %>%
  group_by(SitusStdZIP5) %>%
  mutate(mean_pctZipOther = mean(as.numeric(pctTractOther), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)

df.ethnicityComplete$pctTractOther[is.na(df.ethnicityComplete$pctTractOther)] <- df.ethnicityComplete$mean_pctZipOther

df.ethnicityComplete$mean_pctZipWhite <- NULL
df.ethnicityComplete$mean_pctZipAsian <- NULL
df.ethnicityComplete$mean_pctZipOther <- NULL
df.ethnicityComplete$mean_pctZipHispanic <- NULL
df.ethnicityComplete$mean_pctZipBlack <- NULL

Now we’ll replace missing census block group data with the census tract ethnicity data.

df.ethnicityComplete$pctBlockGroupAsian[is.na(df.ethnicityComplete$pctBlockGroupAsian)] <- df.ethnicityComplete$pctTractAsian

df.ethnicityComplete$pctBlockGroupOther[is.na(df.ethnicityComplete$pctBlockGroupOther)] <- df.ethnicityComplete$pctTractOther

df.ethnicityComplete$pctBlockGroupWhite[is.na(df.ethnicityComplete$pctBlockGroupWhite)] <- df.ethnicityComplete$pctTractWhite

df.ethnicityComplete$pctBlockGroupBlack[is.na(df.ethnicityComplete$pctBlockGroupBlack)] <- df.ethnicityComplete$pctTractBlack

df.ethnicityComplete$pctBlockGroupHispanic[is.na(df.ethnicityComplete$pctBlockGroupHispanic)] <- df.ethnicityComplete$pctTracHispanic

Replace Remaining NA’s with 0

df.ethnicityComplete$pctLstNameOther <- 0
df.ethnicityComplete$pctLstNameOther[is.na(df.ethnicityComplete$pctLastNameBlack)] <- 1
df.ethnicityComplete$pctLstNameWhite[is.na(df.ethnicityComplete$pctLstNameWhite)] <- 0
df.ethnicityComplete$pctLastNameBlack[is.na(df.ethnicityComplete$pctLastNameBlack)] <- 0
df.ethnicityComplete$pctLastNameAsian[is.na(df.ethnicityComplete$pctLastNameAsian)] <- 0
df.ethnicityComplete$pctLastNameHispanic[is.na(df.ethnicityComplete$pctLastNameHispanic)] <- 0
df.ethnicityComplete$pctLstNameArab[is.na(df.ethnicityComplete$pctLstNameArab)] <- 0

Modeling

Extract ethnicity features

train.Ethnicity <- train.Ethnicity %>%
  select(-OwnerStd1FirstName, -OwnerStd1LastName)
train.Ethnicity$ownerEthnicity <- as.factor(train.Ethnicity$ownerEthnicity)

Build Ethnicity Tree

We use a conditional inference tree to build our ethnicity prediction model on the training data.

ethnicity_ctree <- ctree(ownerEthnicity ~ ., data=train.Ethnicity)

print(ethnicity_ctree)
## 
##   Conditional inference tree with 7 terminal nodes
## 
## Response:  ownerEthnicity 
## Inputs:  pctLstNameArab, pctLstNameWhite, pctLastNameBlack, pctLastNameAsian, pctLastNameHispanic, pctBlockGroupHispanic, pctBlockGroupAsian, pctBlockGroupWhite, pctBlockGroupBlack, pctBlockGroupOther, pctTractWhite, pctTractAsian, pctTractBlack, pctTractOther, pctTracHispanic 
## Number of observations:  110 
## 
## 1) pctLstNameArab <= 0.01; criterion = 1, statistic = 108.997
##   2) pctLstNameWhite <= 0.0329; criterion = 1, statistic = 82.188
##     3)*  weights = 12 
##   2) pctLstNameWhite > 0.0329
##     4) pctLastNameHispanic <= 0.0353; criterion = 1, statistic = 73.91
##       5) pctBlockGroupBlack <= 0.1535714; criterion = 1, statistic = 23.411
##         6)*  weights = 39 
##       5) pctBlockGroupBlack > 0.1535714
##         7)*  weights = 9 
##     4) pctLastNameHispanic > 0.0353
##       8) pctTractAsian <= 0.1190354; criterion = 1, statistic = 33.281
##         9) pctLastNameBlack <= 0.006; criterion = 1, statistic = 18.176
##           10)*  weights = 26 
##         9) pctLastNameBlack > 0.006
##           11)*  weights = 8 
##       8) pctTractAsian > 0.1190354
##         12)*  weights = 7 
## 1) pctLstNameArab > 0.01
##   13)*  weights = 9
plot(ethnicity_ctree)

Perform Predictions

ethnicityPredictionsInSample <- predict(ethnicity_ctree)

#Predict ethnicity for new data
df.ethnicityComplete$ownerEthnicity = predict(ethnicity_ctree, df.ethnicityComplete, type="response")