Data Cleaning

To begin analysis, we first read and clean the UNO survey data. Data was e-mailed by Professor Peter Yaukey. Data requests and questions can be sent to pyaukey@uno.edu.

code.check <- function(vec,code){
    #Compares entries in a vector to a given value
    #vec: a vector of codes
    #code: a code to be evaluated
    #value: a vector of length vec, with entries if vec[i]==1 if check[i]==code and 0 otherwise
    check <- (as.character(vec) %in% code)+0
    return(check)
}

substrRight <- function(x, n){
    #From http://stackoverflow.com/questions/7963898/extracting-the-last-n-characters-from-a-string-in-r,
    #extracts n characters from the end of x
    #x: a character vector
    #n: number of characters from the end of x to be extracted
    #value: a string whose entries are the last n characters of x 
  substr(x, nchar(x)-n+1, nchar(x))
}

setwd("O:\\Projects\\BlightStat\\10000 goal analysis") #your working directory here
packages = "C:\\RWorkingDir\\RPackages" #your package library here
.libPaths(packages)
require(magrittr)

## Loading required package: magrittr

require(plyr)

## Loading required package: plyr

require(dplyr)

## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

options(scipen=999) 
blight.survey <- read.csv("RebuildingDataRawForm.csv",na.strings=c(""," "),
stringsAsFactors=FALSE,strip.white=TRUE,sep=",")
blight.survey <- data.frame(lapply(blight.survey, 
    function(v) {if (is.character(v)) return(as.character(toupper(v))) 
    else return(v)
}))
#Data cleaning and taking out block groups that were removed from the survey
blight.survey$X14renov <- substr(blight.survey$X14renov,1,1)
blight.survey$ID <- paste0(blight.survey$track,blight.survey$bl.grp)
blight.survey <- blight.survey[-which(blight.survey$ID %in%
c(944,93.011,3041,3053,3083,38.071)),]

From the survey codes, we create four categories that can be construed as similar to no-stats: blighted structures (consisting of G-gutted, C-closed, and U-untouched), structures in the process of being rebuilt (P), vacant lots (D), and vacant lots that had previously been considered vacant lots (D in both the year of interest and the previous year). Finally, the codes are aggregated by block group.

blight.survey$blighted10 <- code.check(blight.survey$X10renov,c("G","C","U"))
blight.survey$lot10 <- code.check(blight.survey$X10renov,"D")
blight.survey$lotNC10 <- (blight.survey$lot10==1 & 
    (as.character(blight.survey$X09renov)=="D" | 
    is.na(as.character(blight.survey$X09renov))))+0
blight.survey$proc10 <- code.check(blight.survey$X10renov,c("P"))
blight.survey$blighted12 <- code.check(blight.survey$X12renov,c("G","C","U"))
blight.survey$lot12 <- code.check(blight.survey$X12renov,"D")
blight.survey$lotNC12 <- (blight.survey$lot12==1 & 
    (as.character(blight.survey$X10renov)=="D" | 
    is.na(as.character(blight.survey$X10renov))))+0
blight.survey$proc12 <- code.check(blight.survey$X12renov,c("P"))
blight.survey$blighted14 <- code.check(blight.survey$X14renov,c("G","C","U"))
blight.survey$lot14 <- code.check(blight.survey$X14renov,"D")
blight.survey$lotNC14 <- (blight.survey$lot14==1 & 
    (as.character(blight.survey$X12renov)=="D" | 
    is.na(as.character(blight.survey$X12renov))))+0
blight.survey$proc14 <- code.check(blight.survey$X14renov,c("P"))

#Creating a new data.frame with totals for each block group
blight.counts <- as.data.frame(summarise(group_by(blight.survey,ID), 
    surveyed10=sum(!is.na(as.character(X10renov))),blighted10=sum(blighted10),
    lot10=sum(lot10),lotNC10=sum(lotNC10),proc10=sum(proc10),
    surveyed12=sum(!is.na(as.character(X12renov))),blighted12=sum(blighted12),
    lot12=sum(lot12),lotNC12=sum(lotNC12),proc12=sum(proc12),
    surveyed14=sum(!is.na(as.character(X14renov))),blighted14=sum(blighted14),
    lot14=sum(lot14),lotNC14=sum(lotNC14),proc14=sum(proc14)))
blight.counts$Tract <- substr(blight.counts$ID,1,nchar(blight.counts$ID)-1)

Our estimation depends on \(M\), the number of total addresses per block group. This can be estimated with the USPS data. The USPS data can be downloaded from: http://www.huduser.org/portal/usps/home.html (an account is needed). Go to “Download Quarterly Data: USPS Vacancy Data - 2000 Census Tract Summary Files” and select the “Quarter 3 ending September 30, 2010” file. The data is only available at the tract level, so we make the assumption that each block group within a given tract has roughly the same number of addresses.

USPS <- read.csv("USPSData2010Q3.csv")
USPS <- subset(USPS,select=c(GEOID,AMS_RES,NOSTAT_RES))
USPS$GEOID <- substrRight(USPS$GEOID,5)
USPS$GEOID <- as.numeric(paste0(substr(USPS$GEOID, 1, 4-1), ".", substr(USPS$GEOID, 4, nchar(USPS$GEOID))))
names(USPS)[1] <- "Tract"
#Finding the number of BGs in each tract. This uses the Census table H001 using 2000 BG geography,
#obtained from the Census Bureau's American FactFinder Tool,
#though any Census table can be used, as long as the same geography is applied. 
BG.table <- read.csv("BGInfo00.csv",skip=1,stringsAsFactors=FALSE)
split.geog <- strsplit(BG.table$Geography, ",")
geog.df <- as.data.frame(t(sapply(split.geog,function(x) x[1:2])))
names(geog.df) <- c("BG","Tract")
geog.df$BG <- as.numeric(sub("Block Group ", "", geog.df$BG))
geog.df$Tract <- as.numeric(sub("Census Tract ", "", geog.df$Tract))
geog.df$ID <- paste0(geog.df$Tract,geog.df$BG)
BG.count <- as.data.frame(table(geog.df$Tract))
names(BG.count)[1] <- "Tract"

Estimation also depends on \(latex N\), the total number of block groups.

excluded <- c(1, 2, 3, 4, 6.01, 6.02, 6.03, 6.04, 6.05, 6.06, 6.07, 6.08, 6.11, 6.12, 6.13, 6.14, 13.03, 13.04, 12, 15, 17.33, 26, 38, 42, 57, 67, 77, 78, 79, 82, 83, 84, 85, 87, 88, 89, 90, 96, 97, 99, 104, 105, 106, 107, 108, 114, 115, 116, 120, 125, 126, 127, 129, 130, 131)
all.considered.BG <- geog.df$Tract[-which(geog.df$Tract %in% excluded)]
num.BG.considered <- length(all.considered.BG)

Finally, we merge the UNO dataset with the number of addresses per block group.

USPS <- merge(x=USPS,y=BG.count,by="Tract")
USPS$AVG.RES <- USPS$AMS_RES/USPS$Freq
blight.counts <- join(blight.counts,USPS,by="Tract")
save(num.BG.considered,blight.counts,USPS,file="BlightData.RData")