Data Cleaning

From the Introduction, our estimates are based on the following variables:

\(X\): the total number of no-stat addresses in 2010
\(Y\): the total number of no-stat addresses in subsequent years
\(x_i\): the number of sampled houses in block group \(i\) that can be defined as no-stats in 2010
\(y_i\): the number of sampled houses in block group \(i\) that can be defined as no-stats in subsequent years
\(m_i\): the total number of sampled houses in block group \(i\)

Before making our calculations, we first read and clean the UNO survey data to obtain \(x_i\) (no-stat addresses in 2010), \(y_i\) (no-stat addresses in subsequent years), and \(m_i\) (total sampled addresses in all years). Data requests and questions can be sent to pyaukey@uno.edu.

code.check <- function(vec,code){
    #Flags which entries of a vector match any of the given codes
    #vec: a vector of codes (converted to character before matching)
    #code: one or more codes to look for
    #value: a numeric vector as long as vec, holding 1 where the entry of vec
    #       is among code and 0 otherwise
    is.match <- is.element(as.character(vec), code)
    as.numeric(is.match)
}

substrRight <- function(x, n){
    #Returns the trailing n characters of each string in x
    #(adapted from http://stackoverflow.com/questions/7963898/extracting-the-last-n-characters-from-a-string-in-r)
    #x: a character vector
    #n: how many characters to keep, counted from the end of each string
    #value: a character vector holding the last n characters of each entry of x
    last <- nchar(x)
    first <- last - n + 1
    substr(x, start = first, stop = last)
}

#Point R at the folder that holds the raw survey file
setwd("G:\\Projects\\BlightStat\\BlightSurveyAnalysis\\Final")  #your working directory here
library(dplyr)
#Favor fixed over scientific notation when numbers are converted to text
#(presumably so the IDs pasted together below keep their plain digits)
options(scipen=999)

#Read the raw survey; empty and single-space cells are treated as missing
blight.survey <- read.csv("RebuildingDataRawForm.csv", na.strings = c(""," "),
    stringsAsFactors = FALSE, strip.white = TRUE, sep=",")
#Upper-case every character column so that codes compare consistently;
#stringsAsFactors = FALSE keeps those columns as character on R < 4.0 as well
blight.survey <- data.frame(lapply(blight.survey,
    function(v) {
        if (is.character(v)) toupper(v) else v
    }), stringsAsFactors = FALSE)
#Data cleaning and taking out block groups that were removed from the survey
#Keep only the first character of the 2014 code
blight.survey$X14renov <- substr(blight.survey$X14renov, 1, 1)
#ID is the track column immediately followed by the bl.grp column
blight.survey$ID <- paste0(blight.survey$track,blight.survey$bl.grp)
#Drop the removed block groups with a logical mask rather than -which():
#when no ID matches, -which() yields integer(0) and the subset would discard
#every row instead of keeping them all. The IDs are written as text because
#ID is a character column.
blight.survey <- blight.survey[!(blight.survey$ID %in%
    c("944", "93.011", "3041", "3053", "3083", "38.071")), ]

From the survey codes, we create four categories for each survey year: blighted structures (consisting of G-gutted, C-closed, and U-untouched), structures in the process of being rebuilt (P), vacant lots (D), and vacant lots that were not newly created since the previous survey (D in the year of interest and either D or no recorded code in the previous survey year). Finally, the resulting indicators are summed by block group.

#Add, for each survey year, 0/1 indicator columns: blighted (G, C or U),
#lot (D), lotNC (D this year and D or missing in the preceding survey year)
#and proc (P). Built inside local() so no helper variables are left behind.
blight.survey <- local({
    survey <- blight.survey
    blight.codes <- c("G", "C", "U")
    #names: survey year; values: the survey year it is compared against
    prior.year <- c("10" = "09", "12" = "10", "14" = "12")
    for (yr in names(prior.year)) {
        current <- survey[[paste0("X", yr, "renov")]]
        previous <- as.character(survey[[paste0("X", prior.year[[yr]], "renov")]])
        is.lot <- code.check(current, "D")
        survey[[paste0("blighted", yr)]] <- code.check(current, blight.codes)
        survey[[paste0("lot", yr)]] <- is.lot
        #NA == "D" is NA, but NA | TRUE is TRUE, so a missing prior code counts
        survey[[paste0("lotNC", yr)]] <-
            (is.lot == 1 & (previous == "D" | is.na(previous))) + 0
        survey[[paste0("proc", yr)]] <- code.check(current, "P")
    }
    survey
})

#Collapse the survey to one row per block group (ID): for each year, the number
#of non-missing codes plus the totals of the four indicator columns
blight.counts <- blight.survey %>%
    group_by(ID) %>%
    summarise(
        surveyed10 = sum(!is.na(as.character(X10renov))),
        blighted10 = sum(blighted10),
        lot10 = sum(lot10),
        lotNC10 = sum(lotNC10),
        proc10 = sum(proc10),
        surveyed12 = sum(!is.na(as.character(X12renov))),
        blighted12 = sum(blighted12),
        lot12 = sum(lot12),
        lotNC12 = sum(lotNC12),
        proc12 = sum(proc12),
        surveyed14 = sum(!is.na(as.character(X14renov))),
        blighted14 = sum(blighted14),
        lot14 = sum(lot14),
        lotNC14 = sum(lotNC14),
        proc14 = sum(proc14)
    ) %>%
    as.data.frame()
#Tract is the ID with its final character removed
blight.counts[["Tract"]] <- substr(blight.counts$ID, start = 1,
    stop = nchar(blight.counts$ID) - 1)

Finally, we can obtain \(X\) from USPS data. These data can be downloaded from: http://www.huduser.org/portal/usps/home.html (an account is needed). Go to “Download Quarterly Data: USPS Vacancy Data - 2000 Census Tract Summary Files” and select the “Quarter 3 ending September 30, 2010” file. The code below expects the file to be available as USPSData2010Q3.csv in the working directory.

#Load the 2010 Q3 USPS file, keeping only GEOID, AMS_RES and NOSTAT_RES
USPS <- read.csv("USPSData2010Q3.csv")[, c("GEOID", "AMS_RES", "NOSTAT_RES")]
#Store the block-group counts and the USPS table together in one file
save(list = c("blight.counts", "USPS"), file = "BlightData.RData")