In this project we use First Name, Last Name, and Census Ethnicity Data from the American Community Survey to predict a person’s ethnicity. For professional purposes, this model has been used to predict the primary ethnicity of a property owner for a granular analysis of neighborhood ethnicity distribution.
The datasets in this project are obtained from multiple sources, both private and public.
Our dependent variable, ethnicity, is a nominal variable with 6 categories:
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringdist)
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:strucchange':
##
## boundary
df.DataTree <- read.csv("~/Models/DataTree/Raw/06065.csv", sep="|", na.strings=c(""," ","NA", "<NA>", "\\N"), stringsAsFactors = FALSE, fill=TRUE, quote="", colClasses = "character")
#Source scripts to format Data Tree file
source("~/Models/Source/APN_Formatting.R") #Format APN for by county
##
## Attaching package: 'taRifx'
## The following objects are masked from 'package:dplyr':
##
## between, distinct, first, last
source("~/Models/Source/Extract_Residential.R") #Extract residential properties
source("~/Models/Source/GeoID_processing.R") #Import GeoID and merge by apn
#Training Data
train.Ethnicity <- read.csv("~/Models/Ethnicity/train_ethnicity.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
#Census Demographics
df.CA_Block_Group <- read.csv("~/Models/Ethnicity/Census/ethnicity_block_group.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
df.CA_Census_Tract <- read.csv("~/Models/Ethnicity/Census/ethnicity_tract.csv" , na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
#Format geoTract and geoBlockGroup ID
df.CA_Block_Group$geoBlockGroup <- sprintf("%012.0f", df.CA_Block_Group$geoBlockGroup)
df.CA_Census_Tract$geoTract <- sprintf("%011.0f", df.CA_Census_Tract$geoTract)
df.CA_Block_Group$geoBlockGroup <- as.character(df.CA_Block_Group$geoBlockGroup)
df.CA_Census_Tract$geoTract <- as.character(df.CA_Census_Tract$geoTract)
#Percent Ethnicity by Last Name
df.NameLastEthnicityPct <- read.csv("~/Models/Ethnicity/Name Classification/nameLastEthnicityPct.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
#Percent Ethnicity by First Name
df.NameFirstEthnicityPct <- read.csv("~/Models/Ethnicity/Name Classification/nameFirstEthnicityPct.csv", na.strings=c(""," ","NA", "<NA>"), stringsAsFactors = FALSE, fill=TRUE, quote="")
df.OwnerNamesMaster <- select(df.DataTree, apn, OwnerStd1FirstName, OwnerStd1LastName, CurrSaleBuyer1FullName, OwnerStd1CorpInd, OwnerStdNAME1FULL, geoID, SitusStdZIP5, county)
If OwnerStd1LastName is missing, replace with first word (last name) in CurrSaleBuyer1FullName
df.OwnerNamesMaster$OwnerStd1LastName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1LastName), word(df.OwnerNamesMaster$CurrSaleBuyer1FullName, 1), df.OwnerNamesMaster$OwnerStd1LastName)
If OwnerStd1FirstName is missing, replace with second word (first name) in CurrSaleBuyer1FullName
df.OwnerNamesMaster$OwnerStd1FirstName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1FirstName), word(df.OwnerNamesMaster$CurrSaleBuyer1FullName, 2), df.OwnerNamesMaster$OwnerStd1FirstName)
If OwnerStd1LastName is missing, replace with first word (last name) in OwnerStdNAME1FULL
df.OwnerNamesMaster$OwnerStd1LastName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1LastName), word(df.OwnerNamesMaster$OwnerStdNAME1FULL, 1), df.OwnerNamesMaster$OwnerStd1LastName)
If OwnerStd1FirstName is missing, replace with second word (first name) in OwnerStdNAME1FULL
df.OwnerNamesMaster$OwnerStd1FirstName <- ifelse(is.na(df.OwnerNamesMaster$OwnerStd1FirstName), word(df.OwnerNamesMaster$OwnerStdNAME1FULL, 2), df.OwnerNamesMaster$OwnerStd1FirstName)
df.OwnerNamesMergeLast <- inner_join(df.OwnerNamesMaster, df.NameLastEthnicityPct, by = c("OwnerStd1LastName" = "nameLast"))
#Obtain set of names that were not matched in simple merge
df.OwnerNamesMissingLast <- anti_join(df.OwnerNamesMaster, df.OwnerNamesMergeLast, by = "OwnerStd1LastName")
We use approximate string matching to match mispelled surnames or names that are uniquely spelled, yet can still be said to retain the ethnic origin of the majority of characters in the name i.e. Gonzales, Gonzáles, Gonzalés, and Gonzalez are all of Spanish origin.
strMatch <- amatch(df.OwnerNamesMissingLast$OwnerStd1LastName, df.NameLastEthnicityPct$nameLast, maxDist = 2)
strMatch <- as.data.frame(strMatch)
df.OwnerNamesMissingLast <- bind_cols(df.OwnerNamesMissingLast, strMatch)
df.NameLastEthnicityPct$rownum <- row(df.NameLastEthnicityPct)
df.OwnerNamesMissingLast <- left_join(df.OwnerNamesMissingLast, df.NameLastEthnicityPct, by=c("strMatch" = "rownum"))
## Warning: Column `strMatch`/`rownum` has different attributes on LHS and RHS
## of join
df.OwnerNamesMissingLast$strMatch <- NULL
df.OwnerNamesMissingLast$nameLast <- NULL
df.OwnerNamesLast <- bind_rows(df.OwnerNamesMergeLast, df.OwnerNamesMissingLast)
df.OwnerNamesLast <- df.OwnerNamesLast %>%
select(-CurrSaleBuyer1FullName, -OwnerStdNAME1FULL )
df.OwnerNamesMergeFirst <- inner_join(df.OwnerNamesMaster, df.NameFirstEthnicityPct, by = c("OwnerStd1FirstName" = "nameFirst"))
df.OwnerNamesMissingFirst <- anti_join(df.OwnerNamesMaster, df.OwnerNamesMergeFirst, by = "OwnerStd1FirstName")
Approximate String Matching First Name Matched string returns row number of first name in nameFirstEthnicityPct. Max string distance = 2
strMatch <- amatch(df.OwnerNamesMissingFirst$OwnerStd1FirstName, df.NameFirstEthnicityPct$nameFirst, maxDist = 1)
strMatch <- as.data.frame(strMatch)
df.OwnerNamesMissingFirst <- bind_cols(df.OwnerNamesMissingFirst, strMatch)
df.NameFirstEthnicityPct$rownum <- row(df.NameFirstEthnicityPct)
df.OwnerNamesMissingFirst <- left_join(df.OwnerNamesMissingFirst, df.NameFirstEthnicityPct, by=c("strMatch" = "rownum"))
## Warning: Column `strMatch`/`rownum` has different attributes on LHS and RHS
## of join
df.OwnerNamesFirst <- bind_rows(df.OwnerNamesMergeFirst, df.OwnerNamesMissingFirst)
df.OwnerNamesFirst <- df.OwnerNamesFirst %>%
select(-CurrSaleBuyer1FullName, -OwnerStdNAME1FULL )
df.OwnerNamesLast$geoBlockGroup <- substr(df.OwnerNamesLast$geoID, 1, 12)
df.OwnerNamesLast$geoTract <- substr(df.OwnerNamesLast$geoID, 1, 11)
df.ethnicityComplete <- left_join(df.OwnerNamesLast, df.CA_Block_Group, by = "geoBlockGroup")
df.ethnicityComplete <- left_join(df.ethnicityComplete, df.CA_Census_Tract, by = "geoTract")
If census tract ethnicity data is missing we use the zip code ethnicity distribution to replace the missing data. Then we replace missing census block group data with census tract ethnicity data.
df.ethnicityComplete <- df.ethnicityComplete %>%
group_by(SitusStdZIP5) %>%
mutate(mean_pctZipWhite = mean(as.numeric(pctTractWhite), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)
df.ethnicityComplete$pctTractWhite[is.na(df.ethnicityComplete$pctTractWhite)] <- df.ethnicityComplete$mean_pctZipWhite
df.ethnicityComplete <- df.ethnicityComplete %>%
group_by(SitusStdZIP5) %>%
mutate(mean_pctZipAsian = mean(as.numeric(pctTractAsian), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)
df.ethnicityComplete$pctTractAsian[is.na(df.ethnicityComplete$pctTractAsian)] <- df.ethnicityComplete$mean_pctZipAsian
df.ethnicityComplete <- df.ethnicityComplete %>%
group_by(SitusStdZIP5) %>%
mutate(mean_pctZipBlack = mean(as.numeric(pctTractBlack), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)
df.ethnicityComplete$pctTractBlack[is.na(df.ethnicityComplete$pctTractBlack)] <- df.ethnicityComplete$mean_pctZipBlack
df.ethnicityComplete <- df.ethnicityComplete %>%
group_by(SitusStdZIP5) %>%
mutate(mean_pctZipHispanic = mean(as.numeric(pctTracHispanic), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)
df.ethnicityComplete$pctTracHispanic[is.na(df.ethnicityComplete$pctTracHispanic)] <- df.ethnicityComplete$mean_pctZipHispanic
df.ethnicityComplete <- df.ethnicityComplete %>%
group_by(SitusStdZIP5) %>%
mutate(mean_pctZipOther = mean(as.numeric(pctTractOther), na.rm=TRUE))
df.ethnicityComplete <- ungroup(df.ethnicityComplete)
df.ethnicityComplete$pctTractOther[is.na(df.ethnicityComplete$pctTractOther)] <- df.ethnicityComplete$mean_pctZipOther
df.ethnicityComplete$mean_pctZipWhite <- NULL
df.ethnicityComplete$mean_pctZipAsian <- NULL
df.ethnicityComplete$mean_pctZipOther <- NULL
df.ethnicityComplete$mean_pctZipHispanic <- NULL
df.ethnicityComplete$mean_pctZipBlack <- NULL
Now we’ll replace missing census block group data with the census tract ethnicity data.
df.ethnicityComplete$pctBlockGroupAsian[is.na(df.ethnicityComplete$pctBlockGroupAsian)] <- df.ethnicityComplete$pctTractAsian
df.ethnicityComplete$pctBlockGroupOther[is.na(df.ethnicityComplete$pctBlockGroupOther)] <- df.ethnicityComplete$pctTractOther
df.ethnicityComplete$pctBlockGroupWhite[is.na(df.ethnicityComplete$pctBlockGroupWhite)] <- df.ethnicityComplete$pctTractWhite
df.ethnicityComplete$pctBlockGroupBlack[is.na(df.ethnicityComplete$pctBlockGroupBlack)] <- df.ethnicityComplete$pctTractBlack
df.ethnicityComplete$pctBlockGroupHispanic[is.na(df.ethnicityComplete$pctBlockGroupHispanic)] <- df.ethnicityComplete$pctTracHispanic
Replace Remaining NA’s with 0
df.ethnicityComplete$pctLstNameOther <- 0
df.ethnicityComplete$pctLstNameOther[is.na(df.ethnicityComplete$pctLastNameBlack)] <- 1
df.ethnicityComplete$pctLstNameWhite[is.na(df.ethnicityComplete$pctLstNameWhite)] <- 0
df.ethnicityComplete$pctLastNameBlack[is.na(df.ethnicityComplete$pctLastNameBlack)] <- 0
df.ethnicityComplete$pctLastNameAsian[is.na(df.ethnicityComplete$pctLastNameAsian)] <- 0
df.ethnicityComplete$pctLastNameHispanic[is.na(df.ethnicityComplete$pctLastNameHispanic)] <- 0
df.ethnicityComplete$pctLstNameArab[is.na(df.ethnicityComplete$pctLstNameArab)] <- 0
Extract ethnicity features
train.Ethnicity <- train.Ethnicity %>%
select(-OwnerStd1FirstName, -OwnerStd1LastName)
train.Ethnicity$ownerEthnicity <- as.factor(train.Ethnicity$ownerEthnicity)
We use a conditional inference tree to build our ethnicity prediction model on the training data.
ethnicity_ctree <- ctree(ownerEthnicity ~ ., data=train.Ethnicity)
print(ethnicity_ctree)
##
## Conditional inference tree with 7 terminal nodes
##
## Response: ownerEthnicity
## Inputs: pctLstNameArab, pctLstNameWhite, pctLastNameBlack, pctLastNameAsian, pctLastNameHispanic, pctBlockGroupHispanic, pctBlockGroupAsian, pctBlockGroupWhite, pctBlockGroupBlack, pctBlockGroupOther, pctTractWhite, pctTractAsian, pctTractBlack, pctTractOther, pctTracHispanic
## Number of observations: 110
##
## 1) pctLstNameArab <= 0.01; criterion = 1, statistic = 108.997
## 2) pctLstNameWhite <= 0.0329; criterion = 1, statistic = 82.188
## 3)* weights = 12
## 2) pctLstNameWhite > 0.0329
## 4) pctLastNameHispanic <= 0.0353; criterion = 1, statistic = 73.91
## 5) pctBlockGroupBlack <= 0.1535714; criterion = 1, statistic = 23.411
## 6)* weights = 39
## 5) pctBlockGroupBlack > 0.1535714
## 7)* weights = 9
## 4) pctLastNameHispanic > 0.0353
## 8) pctTractAsian <= 0.1190354; criterion = 1, statistic = 33.281
## 9) pctLastNameBlack <= 0.006; criterion = 1, statistic = 18.176
## 10)* weights = 26
## 9) pctLastNameBlack > 0.006
## 11)* weights = 8
## 8) pctTractAsian > 0.1190354
## 12)* weights = 7
## 1) pctLstNameArab > 0.01
## 13)* weights = 9
plot(ethnicity_ctree)
ethnicityPredictionsInSample <- predict(ethnicity_ctree)
#Predict ethnicity for new data
df.ethnicityComplete$ownerEthnicity = predict(ethnicity_ctree, df.ethnicityComplete, type="response")