I. POLICE SHOOTING DATA ———————————

A. Load Libraries

#library(Hmisc)
library(dplyr)
library(ggmap)
library(readr)

B. Functions

1. getStateName

This function is just a way to convert state names back and forth using base R functions, but includes washington dc and puerto rico.

getState <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
  ##automatically decide to convert from names to states or vice versa based on majority match
  if(sum(x %in% m.abb) > sum(x %in% m.name)){
    #convert abbreviations to names
    y <- states.name[match(x,m.abb)]
  } else {
    y <- states.abb[match(x,m.name)]
  }
   return(y)
}
##returns abbreviations only
getState.abb <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
    x[which(x %in% m.abb)] <- states.abb[match(x[which(x %in% m.abb)] ,m.abb)]
    x[which(x %in% m.name)] <- states.abb[match(x[which(x %in% m.name)],m.name)]
  return(x)
}
##returns Names only
getState.names <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
  x[which(x %in% m.abb)] <- states.name[match(x[which(x %in% m.abb)] ,m.abb)]
  x[which(x %in% m.name)] <- states.name[match(x[which(x %in% m.name)],m.name)]
  
  return(x)
}

2. Get FIPS

getFIPS <- function(df, cityVar="city", stateVar="State", countyVar="county", addressVar = "address"){
  require(tibble)
  require(readr)
  require(dplyr)
  require(ggmap)
  require(noncensus)
  setwd("createData")
#PART I - LOAD 3 DATA SETS    
  data(counties)
  counties <- sapply(counties, as.character) %>% tibble::as_data_frame()
  counties$FIPS <- paste0("`", as.character(counties$state_fips), as.character(counties$county_fips))
  counties$county_lower <- counties$county_name
  counties$county_lower  = tolower(trimws(gsub("[[:punct:]]", "", counties$county_lower)))
  counties$county_lower= gsub(" county| parish| borough", "", counties$county_lower, fixed = TRUE)
geoCode <- readr::read_tsv("35158-0001-Data.tsv") %>% 
  dplyr::mutate(
    State =  getState.abb(STATENAME),
    city_lower = ADDRESS_CITY,
    city_lower = gsub("st. ", "saint ", trimws(tolower(city_lower)), fixed = TRUE),
    city_lower = gsub("mt. ", "mount ", trimws(tolower(city_lower)), fixed = TRUE),
    city_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", city_lower)))),
    county_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", COUNTYNAME)))),
    FIPS = paste("`", FIPS, sep="")
  )
charNums <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
geo_city <- readr::read_delim(file="POP_PLACES_20170601.txt", delim="|") %>% 
  dplyr::mutate(
    FIPS = paste0("`", STATE_NUMERIC, COUNTY_NUMERIC),
    city_lower1 = tolower(as.character(trimws(gsub("[[:punct:]]", "", FEATURE_NAME)))),
    city_lower2 = tolower(as.character(trimws(gsub("[[:punct:]]", "", MAP_NAME  )))),
    ##trim last 3 characters, consisting of a letter-number, e.g. D-3
    city_lower2 = ifelse(substring(city_lower2, nchar(city_lower2), nchar(city_lower2)) %in% charNums,
                         trimws(substr(city_lower2, 1, nchar(city_lower2)-3)),
                         city_lower2),
    county_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", COUNTY_NAME)))),
    county_lower =  ifelse(substring(county_lower, nchar(county_lower)-2, nchar(county_lower)) == " ca",
                           trimws(substr(county_lower, 1, nchar(county_lower)-3)),
                           county_lower)
  )
#PART II.  Load Data
if(cityVar %in% names(df) == FALSE){df[,cityVar] <- NA}
if(stateVar %in% names(df) == FALSE){df[,stateVar] <- NA}
if(addressVar %in% names(df) == FALSE){df[,addressVar] <- NA}
if(countyVar %in% names(df) == FALSE){df[,countyVar] <- NA}
if("FIPS" %in% names(df) == FALSE){df$FIPS <- NA}
#PART III.  Search Using Existing Datasets
scan4fips <- function(df_a, cityVar_a=cityVar, stateVar_a=stateVar, countyVar_a=countyVar){
#step 1 - identify counties if they are already labelled using noncensus counties file
city_vector <- df_a[,cityVar_a] %>% unlist() %>% as.character() %>% trimws() 
county_vector <- df_a[,countyVar_a] %>% unlist() %>% as.character() %>% trimws() 
  
county_index <- which(grepl(" county| parish| borough", city_vector, 
                            ignore.case = TRUE, fixed = FALSE) & is.na(county_vector))
##If "city" has "county" etc., in its title, move it to county column  
df_a[county_index,countyVar_a] <- df_a[county_index,cityVar_a]
for(i in which(is.na(df_a$FIPS) & !is.na(df_a[,countyVar_a]))){
  st <- df_a[[i,stateVar_a]]
  cnty <- df_a[[i,countyVar_a]]
  fips <- unique(counties$FIPS[which(counties$state == st & counties$county_name == cnty)])
  if(length(fips) == 1 & !is.null(fips)){
  if(!is.na(fips) & fips != "NULL") {
    df_a$FIPS[[i]] <- fips
  }} else {
  county_lower = cnty
  county_lower = tolower(trimws(gsub("[[:punct:]]", "", cnty)))
  county_lower = gsub(" county| parish| borough", "", county_lower, fixed = TRUE)
  
  if(length(fips) == 1 & !is.null(fips)){
  if(!is.na(fips) & fips != "NULL") {df_a$FIPS[[i]] <- fips}} else {
      fips1 <- (geo_city$FIPS[which(geo_city$county_lower == county_lower & geo_city$STATE_ALPHA == st)])
      fips2 <- (geoCode$FIPS[which(geoCode$State ==st & geoCode$county_lower== county_lower)])
    new_fips <- names(sort(table(c(fips, fips1, fips2)), decreasing=TRUE)[1])
    if(length(new_fips) == 1 & !is.null(new_fips)){
    if(!is.na(new_fips) & new_fips != "NULL"){df_a$FIPS[[i]] <- new_fips}}
  }
  } 
}
  
#step 2 - Use city/location
for(i in which(is.na(df_a$FIPS) & !is.na(df_a[,cityVar_a]))){
  st <- df_a[[i,stateVar_a]]
  i_city <- df_a[[i,cityVar_a]]
  city_lower = tolower(trimws(gsub("[[:punct:]]", "", i_city)))
  new_fips <- unique(geo_city$FIPS[which(geo_city$city_lower1 == city_lower & geo_city$STATE_ALPHA == st)])
    df_a$FIPS[[i]] <- new_fips}} else {
      
  new_fips <-  unique(geoCode$FIPS[which(geoCode$State == st & geoCode$city_lower==city_lower)])
      df_a$FIPS[[i]] <- new_fips}} else {
        
    new_fips <-  unique(geo_city$FIPS[which(geo_city$city_lower2 == city_lower & geo_city$STATE_ALPHA == st)])
        if(!is.na(new_fips) & new_fips != "NULL"){
            df_a$FIPS[[i]] <- new_fips}} else {
  
  city_lower = gsub("Mt |mt. ", "mount ", trimws(i_city), fixed = TRUE)
  city_lower = gsub(" county| parish| charter township| township", "", city_lower, fixed = TRUE)
  city_lower = ifelse(city_lower %in% c("coney island", "brooklyn", "manhattan", 
  city_lower = ifelse(st == "DC", "washington", city_lower)
  fips1 <- unique(geo_city$FIPS[which(geo_city$city_lower1 == city_lower & geo_city$STATE_ALPHA == st)])
  fips3 <- unique(geo_city$FIPS[which(geo_city$county_lower == city_lower & geo_city$STATE_ALPHA == st)])
  new_fips <- names(sort(table(c(fips1, fips2, fips3, fips4, fips5)), decreasing=TRUE)[1])
  
  if(length(new_fips) == 1 & !is.null(new_fips)){
  return(df_a)
#PART IV. SEARCH USING GOOGLE MAPS
#After running scan, search for remaining missing cases using google
search4FIPS <- function(df_b, addressVar_b=addressVar, countyVar_b=countyVar){
  df_names <- names(df_b)
  fips.na <- df_b[index,]
  address_vector <- fips.na[,addressVar_b] %>% unlist() 
  #names(fips_geo)[a_n] <- "address_google"  #deleting duplicate variable name, causing errors
  df_b[index,countyVar_b] <- as.character(fips_geo$administrative_area_level_2)
missing_fips <- length(df$FIPS[which(is.na(df$FIPS))])
message(paste0("Initial Missing FIPS:  ", missing_fips))
missing_fips <- length(dfa$FIPS[which(is.na(dfa$FIPS))])
message(paste0("Missing FIPS after Step 1:  ", missing_fips))
dfc <- scan4fips(dfb)
missing_fips <- length(dfc$FIPS[which(is.na(dfc$FIPS))])
return(dfc)
}

C. Load Washington Post Data

wapo <- read_csv("https://cdn.rawgit.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv") %>% 
  mutate(address = paste(city, state, "USA", sep=", ")) %>% 
  dplyr::rename(Weapon=armed, Race=race, mental_illness=signs_of_mental_illness) %>% 
  dplyr::mutate(
    year=as.numeric(format(date,"%Y")),
    age_f= cut(age, breaks=c(0, 14, 19, 39, 200), 
                    right= FALSE, labels=c("AgeUnder15", "Age15_19", "Age20_39", "Age40Above")), 
         
    Race=replace(Race, which(Race=="W"), "White"),
         Race=replace(Race, which(Race=="B"), "Black"),
         Race=replace(Race, which(Race=="H"), "Hispanic"),
         Race=replace(Race, which(Race=="A"), "Asian"),
         Race=replace(Race, which(Race=="N"), "Native American"),
          Race=replace(Race, which(Race=="O"), "Other"),
         gender=replace(gender, which(gender=="M"), "Male"),
         gender=replace(gender, which(gender=="F"), "Female"),
         State =  getState.abb(state),     
         mental_illness=replace(mental_illness, 
                                which(mental_illness=="True"), 
                                "Mentally Ill"),
         mental_illness=replace(mental_illness, 
                                which(mental_illness=="False"), 
                                "Not Mentally Ill/Unknown"),
         body_camera=replace(body_camera, 
                             which(body_camera=="False"), "No BodyCam"),
         body_camera=replace(body_camera, 
                             which(body_camera=="True"), "BodyCam"),
    dataSource="Wapo") %>% 
  dplyr::mutate(
    date=as.character(date)
  )
  

D. Load Lott & Moody (2016)

#setwd("createData")
lott <- haven::read_stata("createData/police.shoot.dta") %>%  
  mutate_all( funs(replace(., is.nan(.), NA))) %>% 
  dplyr::mutate(
    OfficerRace = NA,
    OfficerRace = replace(OfficerRace, which(pw==1), "White"),
    OfficerRace = replace(OfficerRace, which(ph==1), "Hispanic"),
    OfficerRace = replace(OfficerRace, which(pb==1), "Black"),
    OfficerRace = replace(OfficerRace, which(pu==1), "Unknown"),
    OfficerRace = replace(OfficerRace, which(po==1), "Other"),
    Officer2Race = NA,
    Officer2Race = replace(Officer2Race, which(officer2race==1), "White"),
    Officer2Race = replace(Officer2Race, which(officer2race==3), "Hispanic"),
    Officer2Race = replace(Officer2Race, which(officer2race==2), "Black"),
    Officer2Race = replace(Officer2Race, which(officer2race==4), "Asian/Pacific Islander"),
    OfficerSex = NA,
    OfficerSex = replace(OfficerSex, which(officer1gender==1), "Male"),
    OfficerSex = replace(OfficerSex, which(officer1gender==0), "Female"),
    Officer2Sex = NA,
    Officer2Sex = replace(Officer2Sex, which(officer2gender==1), "Male"),
    Officer2Sex = replace(Officer2Sex, which(officer2gender==0), "Female"),
    Race = NA,
    Race = replace(Race, which(offender1race == 1), "White"),
    Race = replace(Race, which(offender1race == 2), "Black"),
    Race = replace(Race, which(offender1race == 3), "Hispanic"),
    Race = replace(Race, which(offender1race == 4), "Asian/Pacific Islander"),
    Race = replace(Race, which(offender1race == 5), "Native American"),
    Race = replace(Race, which(offender1race == 6), "Other")
  ) %>% 
                -pf, -pw, -ph, -pb, -pu, -po, -offender1race, 
                -officer1gender, -officer2gender, -officer2race) %>% 
dplyr::mutate(
  suicidal = replace(suicidal, which(suicidal == 0), "False"),
  suicidal = replace(suicidal, which(suicidal == 1), "True"),
  drugrelated = replace(drugrelated, which(drugrelated == 0), "False"),
  drugrelated = replace(drugrelated, which(drugrelated == 1), "True"),
  offender2 = replace(offender2, which(offender2 == 0), "False"),
  offender2 = replace(offender2, which(offender2 == 1), "True"),
  # polkilled = replace(polkilled, which(polkilled==0), "False"),
  # polkilled = replace(polkilled, which(polkilled==1), "True"),
  firearm = replace(firearm, which(firearm==0), "False"),
  firearm = replace(firearm, which(firearm==1), "True"),
  knife = replace(knife, which(knife==0), "False"),
  knife=replace(knife, which(knife==1), "True"),
  vehicle = replace(vehicle, which(vehicle==0), "False"),
  vehicle=replace(vehicle, which(vehicle==1), "True"),
  otherweapon=replace(otherweapon, which(otherweapon==0), "False"),
  otherweapon=replace(otherweapon, which(otherweapon==1), "True"),
  armed = replace(armed, which(armed==0), "False"),
  armed=replace(armed, which(armed==1), "True"),
  ipcr = replace(ipcr, which(ipcr==0), "False"),
  ipcr = replace(ipcr, which(ipcr==1), "True"),
    ivcr = replace(ivcr, which(ivcr==0), "False"),
    ivcr = replace(ivcr, which(ivcr==1), "True"),
    bodycam = replace(bodycam, which(bodycam==0), "False"),
  bodycam = replace(bodycam, which(bodycam==1), "True"),
    guncam = replace(guncam, which(guncam==0), "False"),
  guncam = replace(guncam, which(guncam==1), "True"),
    gunshots = replace(gunshots, which(gunshots==0), "False"),
  gunshots = replace(gunshots, which(gunshots==1), "True"),
    sameofficers = replace(sameofficers, which(sameofficers==0), "False"),
  sameofficers = replace(sameofficers, which(sameofficers==1), "True"),
    helicopters = replace(helicopters, which(helicopters==0), "False"),
  helicopters = replace(helicopters, which(helicopters==1), "True"),
    union = replace(union, which(union==0), "False"),
  union = replace(union, which(union==1), "True"),
    college = replace(college, which(college==0), "False"),
  college = replace(college, which(college==1), "True"),
    community = replace(community, which(community==0), "False"),
  community = replace(community, which(community==1), "True"),
  age_f= cut(sa, breaks=c(0, 14, 19, 39, 200), 
                    right= FALSE, labels=c("AgeUnder15", "Age15_19", "Age20_39", "Age40Above")),
  State =  getState.abb(stnm), 
  #state_lower = stateFromLower(stnm),
  dataSource="lott",
  address=paste(city, stnm, "USA", sep=", ")
) %>% 
  dplyr::rename(Age = sa, date=Date)
for(i in 1:ncol(lott)) {
  if(Hmisc::label(lott)[i]!=""){
    if(lab == "Date"){lab <- "date"} #keep consistent with other df's
    if(lab == "suspect arm**ed, percent"){lab <- "Armed"}
    lab <- gsub(", ", "", lab)
    lab <- gsub("percent ", "", lab)
    lab <- gsub("percent", "", lab)
    if(lab == "whether offender2 exists"){lab <- "Offender 2"}
    lab <- gsub("suspect ", "", lab)
    names(lott)[i] <- lab
  }
}

E. Combine Data

lott_nested <- 
 lott %>%
  tidyr::nest(-dataSource, -address, -State, -city, -year)
wapo_nested <- 
  wapo %>%
  tidyr::nest(-dataSource, -address, -State, -city, -year) 
killData <- dplyr::bind_rows(lott_nested, wapo_nested) %>% 
   dplyr::filter(State %in% c(state.abb, "DC"), year < 2017)
#excluding a case in Virgin Islands (lott), and onec case in puerto rico (lott)

F. Load Geo-Coordinates (FIPS) data

Two files are used to identify the counties in which police shootings occurred:
1. Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158)
https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158 2. “Populated Places” file “POP_PLACES_20170601.txt”, a subset of the larger file “NationalFile_20170601.txt” from USGS - Domestic and Antarctic Names - State and Topical Gazetteer Download Files Source: https://geonames.usgs.gov/domestic/download_data.htm

The remaining cases without a FIPS code are identified using the geocode() function from the ggmaps package.

kd <- getFIPS(df=killData)
Parsed with column specification:
cols(
  .default = col_integer(),
  FIPS_ST = col_character(),
  FIPS_COUNTY = col_character(),
  FIPS = col_character(),
  ORI9 = col_character(),
  ORI7 = col_character(),
  NAME = col_character(),
  STATENAME = col_character(),
  COUNTYNAME = col_character(),
  UANAME = col_character(),
  LG_NAME = col_character(),
  ADDRESS_NAME = col_character(),
  ADDRESS_STR1 = col_character(),
  ADDRESS_STR2 = col_character(),
  ADDRESS_CITY = col_character(),
  ADDRESS_STATE = col_character(),
  LEMAS_ID = col_character(),
  U_POPGRP = col_character(),
  COMMENT = col_character(),
  INTPTLAT = col_double(),
  INTPTLONG = col_double()
  # ... with 3 more columns
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_character(),
  `<U+FEFF>FEATURE_ID` = col_integer(),
  PRIM_LAT_DEC = col_double(),
  PRIM_LONG_DEC = col_double(),
  ELEV_IN_M = col_integer(),
  ELEV_IN_FT = col_integer()
)
See spec(...) for full column specifications.
Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Mangling the following names: <U+FEFF>FEATURE_ID -> <U+FEFF>FEATURE_ID. Use enc2native() to avoid the warning.Initial Missing FIPS:  3271
Step 1.  Scanning for FIPS using existing data sets
Missing FIPS after Step 1:  82
Step 2.  Scanning for FIPS using Google Maps
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Florence%2C%20AK%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=National%20Park%2C%20AZ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Prairie%20Grove%2C%20AZ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Boise%20National%20Forest%2C%20ID%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hollis%2C%20ME%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Norfolk-Quincy%2C%20MA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hamilton%20Township%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Henderson%2C%20NM%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Clintonville%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Franklin-Westerville%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Waverl%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Lakes%2C%20AK%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Clint%20Wells%2C%20AZ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=St.%20Johns%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=West%20Woodlawn%2C%20IL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Suitland-Silver%20Hill%2C%20MD%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Dartmouth%2C%20MA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Redford%20Charter%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Flint%20charter%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Northland%2C%20MO%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Boiling%20Springs%20Lakes%2C%20NC%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Huntingdon%2C%20WV%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Pinon%20Hills%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Overtown%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Opp%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Alabama%2C%20GA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Chavies%20community%2C%20KY%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Fort%20Meade%2C%20MD%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Spauldings%2C%20MD%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Bloomfield%20Twp%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hamilton%20Township%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Little%20Egg%20Harbor%20Township%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Cockeysville%2C%20TX%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Troutdale%2C%20VA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Mt%20Hope%2C%20WV%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Chapeno%2C%20TX%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Smyrna%2C%20ME%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Eaton%20Rapids%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Roxand%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Carollwood%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Fort%20Meade%2C%20MD%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Straban%20Township%2C%20PA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Ideal%20Beach%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Watsonsville%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Wood%20Lake%2C%20ND%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Mt.%20Auburn%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Bloomfield%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Brick%20Township%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=West%20Knox%2C%20TN%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Pinion%20Hills%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Cato%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Lakes%20Charles%2C%20LA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=West%20Goshen%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Norwood%2C%20OK%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=North%20Laredo%2C%20TX%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Canaan%20Township%2C%20PA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Geneva%2C%20WI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Penn%20Township%2C%20PA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hollywood%20Hills%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Perry%20Township%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=East%20Berstadt%2C%20KY%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Pueblo%20of%20Laguna%2C%20NM%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Milcreek%2C%20UT%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Jacksonsville%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Shawnee%20National%20Forest%2C%20IL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Ishpeming%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Columbia%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Laguna%20Pueblo%2C%20NM%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Lake%20Asbury%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Little%20Egg%20Harbor%20Township%2C%20NJ%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=McKinneyville%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=East%20Hollywood%2C%20CA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Sylvania%20Township%2C%20OH%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Weeki%20Wachi%2C%20FL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Big%20Bear%2C%20MO%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Logan%20Canyon%2C%20UT%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Killeen%2C%20AL%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Muckleshoot%20Indian%20Reservation%2C%20WA%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Holland%20Township%2C%20MI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Simpsonsville%2C%20KY%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Keaau%2C%20HI%2C%20USA
Source : https://maps.googleapis.com/maps/api/geocode/json?address=Forks%20Township%2C%20PA%2C%20USA
Missing FIPS after Step 2:  7

There are four remaining cases without a FIPS code. After examining these four cases, I made the following manual changes:

  1. “Florence, AK, USA” - Lott - 2013 - is identified as Florence, Alabama, Lauderdale County, FIPS `01077

  2. “Opp, FL, USA” - Lott - 2015 - is identified as Opp, Alabama, Covington County, FIPS `01039

  3. “Boise National Forest, ID, USA” - Lott - 2013 - is in/near Lowman, ID, Boise County, FIPS `16015

  4. “Cockeysville, TX, USA” could not be verified and is dropped.

In addition, the following changes are made:

  1. “Prairie Grove, AZ, USA” - Lott - 2013 - is Prairie Grove, Arkansas, Washington County, FIPS `05143

  2. “Alabama, GA, USA” - Lott - 2015 - DROPPED

  3. “National Park, AZ, USA” - Lott - 2013 - DROPPED

drop_addresses <- c("Alabama, GA, USA", "National Park, AZ, USA", "Cockeysville, TX, USA")
nrow(kd)
[1] 3271
kd <- subset(kd, !(address %in% drop_addresses))
nrow(kd)
[1] 3268
kd$FIPS[which(kd$address == "Prairie Grove, AZ, USA")] <- "`05143"
kd$State[which(kd$address == "Prairie Grove, AZ, USA")] <- "AR"
kd$FIPS[which(kd$address == "Florence, AK, USA")] <- "`01077"
kd$State[which(kd$address == "Florence, AK, USA")] <- "AL"
kd$State[which(kd$address == "Opp, FL, USA")] <- "AL"
kd$FIPS[which(kd$address == "Boise National Forest, ID, USA")] <- "`16015"

NYC Corrrection

(see section below in UCR crime data - using Manhattan FIPS for all NYC FIPS)

nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
kd <- kd %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
  dplyr::select(-county)

G. Save Data.

wapo <- kd %>% dplyr::filter(dataSource=="Wapo") %>% 
  tidyr::unnest()
fst::write.fst(wapo, path="df_wapo_fst.csv")
write.csv(wapo, file="df_wapo.csv")
lott <- kd %>% dplyr::filter(dataSource=="lott") %>% 
  tidyr::unnest()
fst::write.fst(lott, path="df_lott_fst.csv")
write.csv(lott, file="df_lott.csv")

II. POPULATION CENSUS DATA ———————————

1. COUNTY-LEVEL Population Data

CC-EST2015-ALLDATA-[ST-FIPS]: Annual County Resident Population Estimates by Age, Sex, Race, and Hispanic Origin: April 1, 2010 to July 1, 2015 File: 7/1/2015 County Characteristics Resident Population Estimates Source: U.S. Census Bureau, Population Division Release Date: June 2016

Load Data from Census Website

library(tidyr)
library(readr)
library(dplyr)
# I.  COUNTY-LEVEL POPULATION CENSUS DATA ---------------------------------
##Using dplyr/tidy-verse code
county_census <- read_csv(file="https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/counties/asrh/cc-est2015-alldata.csv")
Parsed with column specification:
cols(
  .default = col_integer(),
  SUMLEV = col_character(),
  STATE = col_character(),
  COUNTY = col_character(),
  STNAME = col_character(),
  CTYNAME = col_character()
)
See spec(...) for full column specifications.
                          
##https://www.census.gov/popest/data/counties/asrh/2015/files/CC-EST2015-ALLDATA.csv
##Race-Total, gender-Both, age-All
countyPop <- county_census %>% 
  dplyr::filter(YEAR %in% c(4,5,6,7,8)) %>% 
  dplyr::mutate(YEAR=replace(YEAR, which(YEAR==4), 2011),
         YEAR=replace(YEAR, which(YEAR==5), 2012),
         YEAR=replace(YEAR, which(YEAR==6), 2013),
         YEAR=replace(YEAR, which(YEAR==7), 2014),
         YEAR=replace(YEAR, which(YEAR==8), 2015),
         FIPS=paste("`", STATE, COUNTY, sep=""),
         
         
         age=NA,
         age=replace(age, which(AGEGRP==0), "All"),
         age=replace(age, which(AGEGRP %in% c(1,2,3)), "Under15"),
         age=replace(age, which(AGEGRP == 4), "15_19"),
         age=replace(age, which(AGEGRP == 5), "20_24"),
         age=replace(age, which(AGEGRP == 6), "25_29"),
         age=replace(age, which(AGEGRP == 7), "30_34"),
         age=replace(age, which(AGEGRP == 8), "35_39"),
         age=replace(age, which(AGEGRP == 9), "40_44"),
         age=replace(age, which(AGEGRP == 10), "45_49"),
         age=replace(age, which(AGEGRP == 11), "50_54"),
         age=replace(age, which(AGEGRP == 12), "55_59"),
         age=replace(age, which(AGEGRP > 12), "60Above")
  ) %>% 
  transmute(year=YEAR, 
            state=STNAME,
            county=CTYNAME,
            FIPS=FIPS, age=age,
          Total_Male=TOT_MALE, 
          Total_Female=TOT_FEMALE, 
          Total_Both = Total_Male+Total_Female,
          
          Black_Male=BA_MALE, Black_Female=BA_FEMALE,
          Black_Both=Black_Male+Black_Female,
          
          Amerindian_Male=IA_MALE, Amerindian_Female=IA_FEMALE,
          Amerindian_Both=Amerindian_Male+Amerindian_Female,
          
          Asian_Male=AA_MALE, Asian_Female=AA_FEMALE,
          Asian_Both=Asian_Male+Asian_Female,
          
          PacificIslander_Male=NA_MALE, PacificIslander_Female=NA_FEMALE,
          PacificIslander_Both=PacificIslander_Male+PacificIslander_Female,
          
          Hispanic_Male=HWA_MALE, #Hispanic, White alone male population 
          Hispanic_Female=HWA_FEMALE, #Hispanic, White alone female population 
          Hispanic_Both=Hispanic_Male+Hispanic_Female,
          
          White_Male=NHWA_MALE,  #Not Hispanic, White alone male population
          White_Female=NHWA_FEMALE, #Not Hispanic, White alone female population
          White_Both=White_Male+White_Female
    ) 
##Separate, then re-unite in long format
white_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, White_Male, White_Female, White_Both) %>% 
  mutate(Race="White") %>% 
  group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))
Total_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Total_Male, Total_Female, Total_Both) %>% 
  dplyr::rename(Male=Total_Male, Female=Total_Female, Both=Total_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Total") %>% 
  group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))
Black_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Black_Male, Black_Female, Black_Both) %>% 
  dplyr::rename(Male=Black_Male, Female=Black_Female, Both=Black_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Black") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
Asian_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Asian_Male, Asian_Female, Asian_Both) %>% 
  dplyr::rename(Male=Asian_Male, Female=Asian_Female, Both=Asian_Both) %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))
Amerindian_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Amerindian_Male, Amerindian_Female, Amerindian_Both) %>% 
  dplyr::rename(Male=Amerindian_Male, Female=Amerindian_Female, Both=Amerindian_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Native American") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
Hispanic_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Hispanic_Male, Hispanic_Female, Hispanic_Both) %>% 
  dplyr::rename(Male=Hispanic_Male, Female=Hispanic_Female, Both=Hispanic_Both) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))
PacificIslander_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, PacificIslander_Male, PacificIslander_Female, PacificIslander_Both) %>% 
  dplyr::rename(Male=PacificIslander_Male, Female=PacificIslander_Female, Both=PacificIslander_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))
##Recombine - can delete total,both, and all values
countyPop2 <- bind_rows(Total_df , white_df, Black_df, Hispanic_df,
                        Asian_df, Amerindian_df, PacificIslander_df
                        )
census_county <- countyPop2
rm(countyPop2)
rm(countyPop)

NYC Correction

nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
census_county <- census_county %>% ungroup() %>% 
  dplyr::mutate(FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")) %>% 
  dplyr::group_by(year, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T),
                   state=first(state),
                   county=first(county))

County Populations

All population variables are for year 2015, except variables appended with “5yr”, which are averaged across 2011 to 2015. Variables appended with “change” are the percent change from 2011 to 2015.

census_vars <- census_county %>% 
  dplyr::group_by(year, FIPS) %>% 
  dplyr::mutate(
    pop.c.total = population[which(Race=="Total"  & gender=="Both" & age=="All")],
    pop.c.white_nh = population[which(Race=="White"  & gender=="Both" & age=="All")],
    pop.c.black = population[which(Race == "Black" & gender=="Both" & age=="All")],
    pop.c.asian = population[which(Race=="Asian" & gender=="Both" & age=="All")],
    #Create Asian/PAcific Islander Category for some datasets that conflate the two
    pop.c.AP = mean(population[which(gender=="Both" & age=="All" & (Race == "Asian" | Race == "Pacific Islander"))], na.rm=T),
    ##Create White/Hispanic category to match UCR
    pop.c.WH = mean(population[which(gender=="Both" & age=="All" & (Race == "White" | Race == "Hispanic"))], na.rm=T),
    pop.c.amerindian = population[which(Race=="Native American"  & gender=="Both" & age=="All")],
    pop.c.hispanic = population[which(Race=="Hispanic"  & gender=="Both" & age=="All")],
    pop.male.15_29 = sum(population[which(Race=="Total" & gender=="Male" & age %in% c("15_19","20_24","25_29"))], na.rm=T),
  dplyr::group_by(FIPS) %>% 
  dplyr::mutate(
    ##Averaged aross 2011-2015    
    pop.c.total_5yr = mean(pop.c.total, na.rm=T),
         pop.c.white_nh_5yr = mean(pop.c.white_nh, na.rm=T),
          pop.c.WH_5yr = mean(pop.c.WH, na.rm=T),
         pop.c.black_5yr = mean(pop.c.black, na.rm=T),
         pop.c.asian_5yr = mean(pop.c.asian, na.rm=T),
         pop.c.hispanic_5yr = mean(pop.c.hispanic, na.rm=T),
         pop.c.amerindian_5yr = mean(pop.c.amerindian, na.rm=T),
         pop.male.15_29_5yr = mean(pop.male.15_29, na.rm=T),
  
    ##Change from 2011 - 2015 
  pop.c.total_change = mean((pop.c.total[which(year==2015)]-pop.c.total[which(year==2011)])/pop.c.total[which(year==2011)], 
                            na.rm=T),
  pop.c.white_nh_change = mean((pop.c.white_nh[which(year==2015)]-pop.c.white_nh[which(year==2011)])/pop.c.white_nh[which(year==2011)], 
                               na.rm=T),
  
  
  pop.c.black_change = mean((pop.c.black[which(year==2015)]-pop.c.black[which(year==2011)])/pop.c.black[which(year==2011)], na.rm=T),
  
  ##A few counties had zero blacks, which resulted in infinite values (division by zero), therefore adding + 1
  pop.c.black_change = ifelse(is.infinite(pop.c.black_change),  
        #If infinite, change                    
        mean(((pop.c.black[which(year==2015)]+1)-(pop.c.black[which(year==2011)]+1))/(pop.c.black[which(year==2011)]+1), na.rm=T),  
        ##If not infinite, keep the same
        pop.c.black_change)
  
  ) %>% 
  dplyr::filter(year==2015, Race=="Total", age=="All", gender=="Both") %>% 
  dplyr::select(-Race, -gender, -age, -year)
  

2. STATE-LEVEL Population Data

State Characteristics Datasets: Annual State Resident Population Estimates for 5 Race Groups (5 Race Alone or in Combination Groups) by Age, Sex, and Hispanic Origin: April 1, 2010 to July 1, 2015 https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/state/asrh/sc-est2015-alldata5.csv

Load Data from Census website

census_state <- read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/state/asrh/sc-est2015-alldata5.csv")
  
##formerly, read_csv("https://www.census.gov/popest/data/state/asrh/2015/files/SC-EST2015-ALLDATA5.csv")

census_state2 <- census_state %>% 
  transmute(State=NAME, gender=SEX, Hisp=ORIGIN, Race=RACE, AGE=AGE,
            y2011=POPESTIMATE2011, y2012=POPESTIMATE2012,
            y2013=POPESTIMATE2013, y2014=POPESTIMATE2014,
            y2015=POPESTIMATE2015) %>% 
  dplyr::mutate(
    gender=replace(gender, which(gender==0), "Both"),
    gender=replace(gender, which(gender==1), "Male"),
    gender=replace(gender, which(gender==2), "Female"),
        age=NA,
         age=replace(age, which(AGE < 15), "Under15"),
         age=replace(age, which(AGE>=15 & AGE<=19), "15_19"),
         age=replace(age, which(AGE>=20 & AGE<=24), "20_24"),
         age=replace(age, which(AGE>=25 & AGE<=29), "25_29"),
         age=replace(age, which(AGE>=30 & AGE<=34), "30_34"),
         age=replace(age, which(AGE>=35 & AGE<=39), "35_39"),
         age=replace(age, which(AGE>=40 & AGE<=44), "40_44"),
         age=replace(age, which(AGE>=45 & AGE<=49), "45_49"),
         age=replace(age, which(AGE>=50 & AGE<=54), "50_54"),
         age=replace(age, which(AGE>=55 & AGE<=59), "55_59"),
         age=replace(age, which(AGE>=60), "60Above"),
    ##Need to add a row for all ages, or not
        #age=replace(age, which(age==0), "All"),
    
    Race=replace(Race, which(Race==1 & Hisp==1), "White"),
    Race=replace(Race, which(Race==1 & Hisp==2), "Hispanic"),
    Race=replace(Race, which(Race==2 & Hisp==0), "Black"),
    Race=replace(Race, which(Race==3 & Hisp==0), "Native American"),
    Race=replace(Race, which(Race==4 & Hisp==0), "Asian"),
    Race=replace(Race, which(Race==5 & Hisp==0), "Pacific Islander")
  ) %>% 
  dplyr::select(-c(Hisp, AGE)) %>% 
  dplyr::filter(Race %in% c("White", "Hispanic", "Black","Asian",
                     "Native American", "Pacific Islander")) %>% 
  dplyr::group_by(State, Race, gender, age) %>% 
  dplyr::summarise(y2011=sum(y2011, na.rm=T),
            y2012=sum(y2012, na.rm=T),
            y2013=sum(y2013, na.rm=T),
            y2014=sum(y2014, na.rm=T),
            y2015=sum(y2015, na.rm=T)
            
            ) %>% 
  ###File size prior to tidying:  631 KB
  ###File size after gathering years into one column:  2544 KB.   
  tidyr::gather(year, population, 5:9) %>% 
  dplyr::mutate(
    year=replace(year, which(year=="y2011"), 2011),
    year=replace(year, which(year=="y2012"), 2012),
    year=replace(year, which(year=="y2013"), 2013),
    year=replace(year, which(year=="y2014"), 2014),
    year=replace(year, which(year=="y2015"), 2015)
  )

##Adding a summary categories:
##State = United States
##gender = 'Both' (already created)
##age = 'All'
##Race = 'Total'

##age -'All'
age_df <- census_state2 %>% 
  dplyr::group_by(year, State, Race, gender) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(age="All")

census_state2 <- dplyr::bind_rows(census_state2, age_df)

##Race - Total
race_df <- census_state2 %>% 
  dplyr::group_by(year, State, gender, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(Race="Total")

census_state2 <- dplyr::bind_rows(census_state2, race_df)

##State=US

us_df <- census_state2 %>% 
  dplyr::group_by(year, gender, Race, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(State="United States")

census_state <- dplyr::bind_rows(census_state2, us_df)
##file size after adding rows:  3366 KB
##write.csv(census_state, file="census_state.csv")

State Populations

All population variables are for year 2015, except variables appended with “5yr”, which are averaged across 2011 to 2015. Variables appended with “change” are the percent change from 2011 to 2015.

census_state <- census_state %>% 
   dplyr::group_by(year, State) %>% 
  dplyr::mutate(
    pop.total = population[which(Race=="Total"  & gender=="Both" & age=="All")],
    pop.white_nh = population[which(Race=="White"  & gender=="Both" & age=="All")],
    pop.black = population[which(Race == "Black" & gender=="Both" & age=="All")],
    pop.asian = population[which(Race=="Asian" & gender=="Both" & age=="All")],
    #Create Asian/PAcific Islander Category for some datasets that conflate the two
    pop.AP = mean(population[which(gender=="Both" & age=="All" & (Race == "Asian" | Race == "Pacific Islander"))], na.rm=T),
    ##Create White/Hispanic category to match UCR
    pop.WH = mean(population[which(gender=="Both" & age=="All" & (Race == "White" | Race == "Hispanic"))], na.rm=T),
    pop.amerindian = population[which(Race=="Native American"  & gender=="Both" & age=="All")],
    pop.hispanic = population[which(Race=="Hispanic"  & gender=="Both" & age=="All")],
    pop.male.15_29 = sum(population[which(Race=="Total" & gender=="Male" & age %in% c("15_19","20_24","25_29"))], na.rm=T),
    pop.black.male15_29 = sum(population[which(Race=="Black"  & gender=="Male" & age %in% c("15_19","20_24","25_29"))])
  ) %>% 
  dplyr::group_by(State) %>% 
  dplyr::mutate(
    ##Averaged aross 2011-2015    
    pop.total_5yr = mean(pop.total, na.rm=T),
         pop.white_nh_5yr = mean(pop.white_nh, na.rm=T),
          pop.WH_5yr = mean(pop.WH, na.rm=T),
         pop.black_5yr = mean(pop.black, na.rm=T),
         pop.asian_5yr = mean(pop.asian, na.rm=T),
         pop.hispanic_5yr = mean(pop.hispanic, na.rm=T),
         pop.amerindian_5yr = mean(pop.amerindian, na.rm=T),
         pop.male.15_29_5yr = mean(pop.male.15_29, na.rm=T),
  
    ##Change from 2011 - 2015 
  pop.total_change = mean((pop.total[which(year==2015)]-pop.total[which(year==2011)])/pop.total[which(year==2011)], 
                            na.rm=T),
  pop.white_nh_change = mean((pop.white_nh[which(year==2015)]-pop.white_nh[which(year==2011)])/pop.white_nh[which(year==2011)], 
                               na.rm=T),
  
  
  pop.black_change = mean((pop.black[which(year==2015)]-pop.black[which(year==2011)])/pop.black[which(year==2011)], na.rm=T),
  
  ##A few counties had zero blacks, which resulted in infinite values (division by zero), therefore adding + 1
  pop.black_change = ifelse(is.infinite(pop.black_change),  
        #If infinite, change                    
        mean(((pop.black[which(year==2015)]+1)-(pop.black[which(year==2011)]+1))/(pop.black[which(year==2011)]+1), na.rm=T),  
        ##If not infinite, keep the same
        pop.black_change)
  
  ) %>% 
  dplyr::filter(year==2015, Race=="Total", age=="All", gender=="Both") %>% 
  dplyr::select(-Race, -gender, -age, -year)

III. ARREST DATA (Panel 3) ———————————

1. Load Data

2015 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, 2015 (ICPSR 36794) https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36794 ICPSR_36794

Note that the offense codes changed in the 2015 edition. For example, Murder is now “01A” instead of “011”.
File 36794-0001-Data.tsv I’ve converted this to ‘fst’ format for faster loading.

# ucr_15 <- readr::read_tsv("createData/36794-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_15, path="createData/ucr_15.csv")
ucr_15 <- fst::read.fst(path="createData/ucr_15.csv")
  

2014 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2014 (ICPSR 36400). http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36400

File 36400-0001-Data.tsv

##File = 36400-0001-Data.tsv
# ucr_14 <- read_tsv("createData/36400-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_14, path="createData/ucr_14.csv")
ucr_14 <- fst::read.fst(path="createData/ucr_14.csv")

2013 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2013 (ICPSR 36116) http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36116 File 36116-0001-Data.tsv

##File = 36116-0001-Data.tsv
# ucr_13 <- read_tsv("createData/36116-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_13, path="createData/ucr_13.csv")
ucr_13 <- fst::read.fst(path="createData/ucr_13.csv")

2012 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2012 (ICPSR 35018) http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/35018

File 35018-0001-Data.tsv

##File = 35018-0001-Data.tsv
# ucr_12 <- read_tsv("createData/35018-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_12, path="createData/ucr_12.csv")
ucr_12 <- fst::read.fst(path="createData/ucr_12.csv")

2011 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2011 (ICPSR 34581) http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/34581 File 34581-0001-Data.tsv

## File = 34581-0001-Data.tsv
# ucr_11 <- read_tsv("createData/34581-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_11, path="createData/ucr_11.csv")
ucr_11 <- fst::read.fst(path="createData/ucr_11.csv")

2. Combine and Filter

Note that violent crime is composed of four offenses: murder and nonnegligent manslaughter, forcible rape, robbery, and aggravated assault.

offenses_2011_2014 <- c("011", "020", "030", "040", "050", 
              "060", "070", "130", "140", "180", 
              "185", "210")
keepVars <- c("YEAR","STATE","STNAME","COUNTY", "ORI", "AGENCY","OFFENSE","M0_9", "M10_12", "M13_14","M15", "M16","M17","M18","M19",
              "M20","M21","M22","M23","M24","M25_29", "M30_34", "M35_39", "M40_44", "M45_49", "M50_54","M55_59","M60_64","M65","F0_9",
              "F10_12","F13_14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25_29","F30_34","F35_39","F40_44",
              "F45_49",  "F50_54",  "F55_59","F60_64","F65","JW", "JB","JI","JA","AW","AB", "AI","AA")
ucr_15 <- ucr_15[,keepVars] 
 ucr_15 <- ucr_15 %>%  dplyr::mutate(crime=NA,
                crime=replace(crime, which(OFFENSE == "01A"), "murder"),
                ##Murder and non-negligent manslaughter
                crime=replace(crime, which(OFFENSE == "02"), "rape"),
                ##Forcible rape
                crime=replace(crime, which(OFFENSE == "03"), "robbery"),
                ##robbery
                crime=replace(crime, which(OFFENSE == "04"), "assault" ),
                ##Aggravated assault
                crime=replace(crime, which(OFFENSE == "05"), "burglary" ),
                ##Burglary-breaking or entering
                crime=replace(crime, 
                              which(OFFENSE %in% c("06", "07", "13")), 
                              "theft" ),
                ## 060 = Larceny-theft (not motor vehicle)
                ## 070 = Motor vehicle theft
                ## 130 = Stolen property-buy, receive, poss.
                crime=replace(crime, which(OFFENSE == "140"), "vandalism" ),
                ##Vandalism
                crime=replace(crime, which(OFFENSE =="180"), "drug_sale"),
                ##Sale/manufacture (subtotal) - drugs
                crime=replace(crime, which(OFFENSE =="185"), "drug_poss"),
                ##Possession (subtotal)
                crime=replace(crime, which(OFFENSE =="21"), "dui")
                ##Driving under the influence
  ) %>% ##eliminating data for other crimes
  dplyr::filter(!is.na(crime))
ucr <- dplyr::bind_rows(ucr_11, ucr_12, ucr_13, ucr_14) 
ucr <- ucr[,keepVars] 
  ##dplyr::filter(OFFENSE %in% offenses_2011_2014) %>% 
  ##dplyr::select(c(7, 3, 18, 13, 4, 17, 22:70, 73:76)) %>% 
ucr <- ucr %>% dplyr::mutate(crime=NA,
         crime=replace(crime, which(OFFENSE == "011"), "murder"),
         ##Murder and non-negligent manslaughter
         crime=replace(crime, which(OFFENSE == "020"), "rape"),
         ##Forcible rape
         crime=replace(crime, which(OFFENSE == "030"), "robbery"),
         ##robbery
         crime=replace(crime, which(OFFENSE == "040"), "assault" ),
         ##Aggravated assault
         crime=replace(crime, which(OFFENSE == "050"), "burglary" ),
         ##Burglary-breaking or entering
         crime=replace(crime, 
                       which(OFFENSE %in% c("060", "070", "130")), 
                       "theft" ),
         ## 060 = Larceny-theft (not motor vehicle)
         ## 070 = Motor vehicle theft
         crime=replace(crime, which(OFFENSE == "140"), "vandalism" ),
         ##Vandalism
         crime=replace(crime, which(OFFENSE =="180"), "drug_sale"),
         ##Sale/manufacture (subtotal) - drugs
         crime=replace(crime, which(OFFENSE =="185"), "drug_poss"),
         ##Possession (subtotal)
         crime=replace(crime, which(OFFENSE =="210"), "dui")
         ##Driving under the influence
         ) %>% ##eliminating data for other crimes
  dplyr::filter(!is.na(crime)) %>% 
  dplyr::bind_rows(ucr_15)

Finding data entry errors

Delphi, Indiana in 2013 mistakenly reported crimes with values of 10000 appear in multiple columns and for multiple rows (i.e. offenses) for the same agency and which also exceed population size. In addition, 2 police stations report negative values for arrests: * FLOYD COUNTY POLICE DEPT, GA 2013 rape * LEE, GA 2015 assault

ucr$totals <- NA
ucr$totals <- apply(ucr[,8:59], 1, sum, na.rm=T)
error_rows <- as.numeric()
  #otherc <- setdiff(8:59, i) ##other variables/columns
  #ucr$totals <- apply(ucr[,otherc], 1, sum, na.rm=T)
  ##if one arrest column exceeds the sum of all the others - impossible, therefore flag.
  er = which(ucr[,i] == 10000 | ucr[,i] == 10001 | ucr[,i] == 20000)
  
  if(length(er)>0){
    message(paste0("variable = ", names(ucr)[i]))
    message(paste(ucr$AGENCY[er], sep=" \n"))
    error_rows <- unique(c(error_rows, er))
  }
}
variable = M13_14
DELPHI
variable = M17
DELPHI
variable = M18
DELPHI
variable = M20
DELPHI
variable = M24
DELPHIDELPHI
variable = M30_34
DELPHIDELPHI
variable = M40_44
DELPHI
variable = M45_49
DELPHIDELPHIDELPHI
variable = F13_14
DELPHI
variable = F17
DELPHI
variable = AW
DELPHIDELPHI
total_Zero <- which(ucr$totals < 0)
error_rows <- unique(c(error_rows, total_Zero))
ucr_error <- ucr[error_rows,]

Deleting data entry errors

Deleting these cases.

ucr <- ucr[-which(ucr$AGENCY=="DELPHI" & ucr$STATE==13 & ucr$YEAR==2013 ),] ## removed 6 cases
ucr <- ucr[-which(ucr$AGENCY=="FLOYD COUNTY POLICE DEPT" & ucr$STATE==10 & ucr$YEAR==2013 & ucr$crime == "rape"),] # one case
ucr <- ucr[-which(ucr$AGENCY=="LEE" & ucr$STATE==10 & ucr$YEAR==2015 & ucr$crime == "assault"),] # one case

3. Attach FIPS codes to UCR Data, aggregate by county

1.– Some identical FIPS codes have separate county names - e.g. California, FIPS code 06065 = both ‘Riverside’ & ‘Riverside County’. Therefore grouping by FIPS rather than by county. 2.– There is also an FIPS coding error for Philadelphia. The ORI code PAPEP00 is read as Indiana County, PA, FIPS = 42063, which should instead be 42101.

Load Geographic Codes - County FIPS

Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158) https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158

#setwd("createData")
ucr_geo <- read_tsv("createData/35158-0001-Data.tsv") %>% 
  dplyr::select(ORI7, FIPS, COUNTYNAME, STATENAME ) %>% 
  dplyr::rename(ORI=ORI7, county=COUNTYNAME, State=STATENAME)
Parsed with column specification:
cols(
  .default = col_integer(),
  FIPS_ST = col_character(),
  FIPS_COUNTY = col_character(),
  FIPS = col_character(),
  ORI9 = col_character(),
  ORI7 = col_character(),
  NAME = col_character(),
  STATENAME = col_character(),
  COUNTYNAME = col_character(),
  UANAME = col_character(),
  LG_NAME = col_character(),
  ADDRESS_NAME = col_character(),
  ADDRESS_STR1 = col_character(),
  ADDRESS_STR2 = col_character(),
  ADDRESS_CITY = col_character(),
  ADDRESS_STATE = col_character(),
  LEMAS_ID = col_character(),
  U_POPGRP = col_character(),
  COMMENT = col_character(),
  INTPTLAT = col_double(),
  INTPTLONG = col_double()
  # ... with 3 more columns
)
See spec(...) for full column specifications.

|==========                                                                     |  12%    1                                                                     |  13%    1 MB
|===========                                                                    |  13%    1 MB
|===========                                                                    |  14%    1 MB
|============                                                                   |  15%    1 MB
|============                                                                   |  15%    1 MB
|=============                                                                  |  16%    1 MB
|=============                                                                  |  16%    1 MB
|==============                                                                 |  17%    1 MB
|==============                                                                 |  18%    1 MB
|==============                                                                 |  18%    2 MB
|===============                                                                |  19%    2 MB
|===============                                                                |  19%    2 MB
|================                                                               |  20%    2 MB
|================                                                               |  21%    2 MB
|=================                                                              |  21%    2 MB
|=================                                                              |  22%    2 MB
|==================                                                             |  22%    2 MB
|==================                                                             |  23%    2 MB
|===================                                                            |  24%    2 MB
|===================                                                            |  24%    2 MB
|====================                                                           |  25%    2 MB
|====================                                                           |  25%    2 MB
|=====================                                                          |  26%    2 MB
|=====================                                                          |  26%    2 MB
|=====================                                                          |  27%    3 MB
|======================                                                         |  28%    3 MB
|======================                                                         |  28%    3 MB
|=======================                                                        |  29%    3 MB
|=======================                                                        |  29%    3 MB
|========================                                                       |  30%    3 MB
|========================                                                       |  30%    3 MB
|=========================                                                      |  31%    3 MB
|=========================                                                      |  32%    3 MB
|==========================                                                     |  32%    3 MB
|==========================                                                     |  33%    3 MB
|===========================                                                    |  33%    3 MB
|===========================                                                    |  34%    3 MB
|============================                                                   |  35%    3 MB
|============================                                                   |  35%    3 MB
|=============================                                                  |  36%    4 MB
|=============================                                                  |  36%    4 MB
|==============================                                                 |  37%    4 MB
|==============================                                                 |  38%    4 MB
|===============================                                                |  38%    4 MB
|===============================                                                |  39%    4 MB
|===============================                                                |  39%    4 MB
|================================                                               |  40%    4 MB
|================================                                               |  41%    4 MB
|=================================                                              |  41%    4 MB
|=================================                                              |  42%    4 MB
|==================================                                             |  42%    4 MB
|==================================                                             |  43%    4 MB
|===================================                                            |  44%    4 MB
|===================================                                            |  44%    4 MB
|====================================                                           |  45%    4 MB
|====================================                                           |  45%    5 MB
|=====================================                                          |  46%    5 MB
|=====================================                                          |  46%    5 MB
|======================================                                         |  47%    5 MB
|======================================                                         |  48%    5 MB
|======================================                                         |  48%    5 MB
|=======================================                                        |  49%    5 MB
|=======================================                                        |  49%    5 MB
|========================================                                       |  50%    5 MB
|========================================                                       |  51%    5 MB
|=========================================                                      |  51%    5 MB
|=========================================                                      |  52%    5 MB
|==========================================                                     |  52%    5 MB
|==========================================                                     |  53%    5 MB
|===========================================                                    |  54%    5 MB
|===========================================                                    |  54%    6 MB
|============================================                                   |  55%    6 MB
|============================================                                   |  55%    6 MB
|=============================================                                  |  56%    6 MB
|=============================================                                  |  57%    6 MB
|==============================================                                 |  57%    6 MB
|==============================================                                 |  58%    6 MB
|===============================================                                |  58%    6 MB
|===============================================                                |  59%    6 MB
|================================================                               |  60%    6 MB
|================================================                               |  60%    6 MB
|=================================================                              |  61%    6 MB
|=================================================                              |  61%    6 MB
|==================================================                             |  62%    6 MB
|==================================================                             |  63%    6 MB
|==================================================                             |  63%    7 MB
|===================================================                            |  64%    7 MB
|===================================================                            |  64%    7 MB
|====================================================                           |  65%    7 MB
|====================================================                           |  66%    7 MB
|=====================================================                          |  66%    7 MB
|=====================================================                          |  67%    7 MB
|======================================================                         |  67%    7 MB
|======================================================                         |  68%    7 MB
|======================================================                         |  68%    7 MB
|=======================================================                        |  69%    7 MB
|=======================================================                        |  69%    7 MB
|========================================================                       |  70%    7 MB
|========================================================                           7 MB
|=========================================================                      |  71%    7 MB
|=========================================================                      |  72%    7 MB
|==========================================================                     |  73%    8 MB
|==========================================================                     |  73%    8 MB
|===========================================================                    |  74%    8 MB
|===========================================================                    |  74%    8 MB
|============================================================                   |  75%    8 MB
|============================================================                   |  76%    8 MB
|=============================================================                  |  76%    8 MB
|=============================================================                  |  77%    8 MB
|==============================================================                 |  78%    8 MB
|==============================================================                 |  78%    8 MB
|===============================================================                |  79%    8 MB
|===============================================================                |  79%    8 MB
|================================================================               |  80%    8 MB
|================================================================               |  80%    8 MB
|=================================================================              |  81%    9 MB
|=================================================================              |  82%    9 MB
|==================================================================             |  82%    9 MB
|==================================================================             |  83%    9 MB
|===================================================================            |  83%    9 MB
|===================================================================            |  84%    9 MB
|====================================================================           |  85%    9 MB
|====================================================================           |  85%    9 MB
|=====================================================================          |  86%    9 MB
|=====================================================================          |  86%    9 MB
|=====================================================================          |  87%    9 MB
|======================================================================         |  88%    9 MB
|======================================================================         |  88%    9 MB
|=======================================================================        |  89%    9 MB
|=======================================================================        |  89%    9 MB
|========================================================================       |  90%    9 MB
|========================================================================       |  91%   10 MB
|=========================================================================      |  91%   10 MB
|=========================================================================      |  92%   10 MB
|==========================================================================     |  92%   10 MB
|==========================================================================     |  93%   10 MB
|===========================================================================    |  94%   10 MB
|===========================================================================    |  94%   10 MB
|============================================================================   |  95%   10 MB
|============================================================================   |  95%   10 MB
|=============================================================================  |  96%   10 MB
|=============================================================================  |  97%   10 MB
|============================================================================== |  97%   10 MB
|============================================================================== |  98%   10 MB
|===============================================================================|  98%   10 MB
|===============================================================================|  99%   10 MB
|================================================================================| 100%   11 MB

Aggregate by County

ucr2 <- left_join(ucr, ucr_geo)
Joining, by = "ORI"
ucr2 <- ucr2 %>% arrange(YEAR, State, county, FIPS, OFFENSE) 
##Fix Philly.
ucr2 <- ucr2 %>% 
  dplyr::mutate(
  FIPS=replace(FIPS, which(ORI == "PAPEP00"), 42101)
) %>% 
  dplyr::rename(year=YEAR) %>% ungroup() %>% 
  dplyr::select(-c(STATE, STNAME, COUNTY, county, AGENCY, ORI, OFFENSE)) %>% 
  dplyr::filter(!is.na(FIPS))
ucr2 <- ucr2 %>% 
  dplyr::group_by(year, State, FIPS, crime) %>%
  dplyr::summarise_each_(funs(sum(., na.rm = TRUE)),
                  names(ucr2)[c(2:53)])
`summarise_each()` is deprecated.
Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
To map `funs` over a selection of variables, use `summarise_at()`

4. Arrests by Sex & Age by County

{Note: I’ve decided to keep these two files separate. When writing to .csv files, combined they are approx. 37KB. When combining these data frames using dplyr::full_join, for instance, the total file size exceeds 1GB.}

ucr_age_sex <- ucr2 %>% 
  dplyr::select(-c(AW, JW, AB, JB, AA, JA, AI, JI)) %>% 
  dplyr::group_by(year, State, FIPS, crime) %>% 
  ## Convention:  first letter = gender (M,F,B), 
  ##then age group, matching census age groups
  ## B = Both Genders
  ## All = all ages
  dplyr::mutate_all(as.numeric) %>% 
  dplyr::summarise(
    MAll=sum( c(M0_9, M10_12, M13_14, M15, M16, M17, M18, M19,     
                M20, M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,
                M45_49,M50_54,M55_59,M60_64,M65), na.rm=T), 
    ##Census: AGEGRP=0
    FAll=sum( c( F0_9,  F10_12,  F13_14,  F15,  F16,  F17,  F18,  F19,     
                   F20,  F21, F22, F23, F24, F25_29, F30_34, F35_39, F40_44,
                   F45_49, F50_54, F55_59, F60_64, F65), na.rm=T), 
    ##Census: AGEGRP=0
    
    BAll=MAll+FAll,
    
    ##Males
    MUnder15=sum(c(M0_9, M10_12, M13_14), na.rm=T), ##Census: AGEGRP=1,2,3
    M15_19=sum(c( M15, M16, M17, M18, M19), na.rm=T), ##Census: AGEGRP=4
    M20_24=sum(c(M20, M21,M22,M23,M24), na.rm=T), ##Census: AGEGRP=5
    M25_29=M25_29, ##Census: AGEGRP=6
    M30_34=M30_34, ##Census: AGEGRP=7
    M35_39=M35_39, ##Census: AGEGRP=8
    M40_44=M40_44, ##Census: AGEGRP=9
    M45_49=M45_49, ##Census: AGEGRP=10
    M50_54=M50_54, ##Census: AGEGRP=11
    M55_59=M55_59, ##Census: AGEGRP=12
    M60Above=sum(c(M60_64,M65), na.rm=T), ##Census: AGEGRP=13-18
    
    FUnder15=sum(c(F0_9, F10_12, F13_14), na.rm=T), ##Census: AGEGRP=1,2,3
    F15_19=sum(c( F15, F16, F17, F18, F19), na.rm=T), ##Census: AGEGRP=4
    F20_24=sum(c(F20, F21,F22,F23,F24), na.rm=T), ##Census: AGEGRP=5
    F25_29=F25_29, ##Census: AGEGRP=6
    F30_34=F30_34, ##Census: AGEGRP=7
    F35_39=F35_39, ##Census: AGEGRP=8
    F40_44=F40_44, ##Census: AGEGRP=9
    F45_49=F45_49, ##Census: AGEGRP=10
    F50_54=F50_54, ##Census: AGEGRP=11
    F55_59=F55_59, ##Census: AGEGRP=12
    F60Above=sum(c(F60_64,F65), na.rm=T), ##Census: AGEGRP=13-18
    
    BUnder15=FUnder15+MUnder15,  ##Census: AGEGRP=1,2,3
    B15_19=F15_19+M15_19, ##Census: AGEGRP=4
    B20_24=F20_24+M20_24, ##Census: AGEGRP=5
    B25_29=F25_29+M25_29, ##Census: AGEGRP=6
    B30_34=F30_34+M30_34, ##Census: AGEGRP=7
    B35_39=F35_39+M35_39, ##Census: AGEGRP=8
    B40_44=F40_44+M40_44, ##Census: AGEGRP=9
    B45_49=F45_49+M45_49, ##Census: AGEGRP=10
    B50_54=F50_54+M50_54, ##Census: AGEGRP=11
    B55_59=F55_59+F55_59, ##Census: AGEGRP=12
    B60Above=M60Above+F60Above ##Census: AGEGRP=13-18
    ) %>% 
  tidyr::gather("sex_age", "freq", 5:40) %>% 
  tidyr::separate(sex_age, into=c("gender", "age"), sep=1) %>% 
  dplyr::mutate(gender=replace(gender, which(gender=="M"), "Male"),
         gender=replace(gender, which(gender=="F"), "Female"),
         gender=replace(gender, which(gender=="B"), "Both")
         ) %>% 
 tidyr::spread(crime, freq) %>%
 dplyr::mutate(
 #for Counties that reported to UCR but not specific crimes such as murder,
 #treating these as zeros for aggregating Violent Crime Rates
 murder=replace(murder, which(is.na(murder)), 0),
 rape=replace(rape, which(is.na(rape)), 0),
 robbery=replace(robbery, which(is.na(robbery)), 0),
 assault=replace(assault, which(is.na(assault)), 0),
 ViolentCrime = murder+rape+robbery+assault) %>%
  arrange(year, State, FIPS, gender, age)

5. Arrests by Race by County

##UCR Race
ucr_race <- ucr2 %>% 
  dplyr::group_by(year, State, FIPS, crime) %>% 
  dplyr::select(year, State, FIPS, 
        crime, JW, JB, JI, JA, AW, AB, AI, AA) %>% 
  dplyr::mutate(
    White= AW + JW,
    Black=AB + JB,
    Asian=AA + JA,
    Amerindian= AI + JI) %>% 
dplyr::select(-c(AW, JW, AB, JB, AA, JA, AI, JI)) %>% 
tidyr::gather("Race", "freq", 5:8) %>% 
tidyr::spread(crime, freq) %>%
dplyr::mutate(
   Race=replace(Race, which(Race=="Amerindian"), "Native American"),
   murder=replace(murder, which(is.na(murder)), 0),
   rape=replace(rape, which(is.na(rape)), 0),
   robbery=replace(robbery, which(is.na(robbery)), 0),
   assault=replace(assault, which(is.na(assault)), 0),
   ViolentCrime = murder+rape+robbery+assault) %>%
dplyr::group_by(year, State, FIPS, Race) %>% 
dplyr::mutate(
    drugs = sum(c(drug_poss, drug_sale), na.rm=T)
  ) %>% 
dplyr::select(-drug_sale, -drug_poss) %>% 
  dplyr::arrange(year, State, FIPS, Race)

6. FLORIDA

Florida arrest data are not included in the UCR. Here I include Florida arrest data, taken from the excel file “Arrests by County, Age, Sex and Offense for Florida, 2004 - 2015” available on the Florida Department of Law Enforcement website: http://www.fdle.state.fl.us/cms/FSAC/Data-Statistics/UCR-Arrest-Data.aspx

Note that the Florida offense codes don’t exactly match the UCR. The UCR murder code is for “Murder and non-negligent manslaughter”, whereas the Florida records are separate for murder and for manslaughter. The UCR also distinguishes between “Manslaughter by negligence” and “Murder and non-negligent manslaughter”, whereas Florida records only indicate one category for manslaughter.

Florida also does not report separate categories for drug sales and drug possession arrests. I therefore conflate the categories in the rest of the UCR.

##Adding FIPS codes to Florida
#setwd("createData")
require(noncensus)
  data(counties)
  counties <- sapply(counties, as.character) %>% tibble::as_data_frame()
  cnty_df <- counties[which(counties$state=="FL"),c("county_name", "state", "FIPS")]
  names(cnty_df) <- c("county", "state", "FIPS")
  cnty_df$county  <- tolower(trimws(gsub("[[:punct:]]", "", cnty_df$county)))
  cnty_df$county <- gsub(" county| parish| borough", "", cnty_df$county, fixed = FALSE)
florida <- readr::read_csv("createData/floridaArrest_2011_2015.csv") %>% 
  tidyr::gather(Race, freq, 6:9) %>% 
  tidyr::spread(crime, freq) %>% 
  dplyr::mutate(
    murder = murder_a + manslaughter,
    theft = buy_receive_possess_stolen_property + larceny + motor_vehicle_theft,
    ViolentCrime = murder+rape+robbery+assault,
    county = tolower(trimws(gsub("[[:punct:]]", "", county)))
  ) %>% 
  dplyr::select(-murder_a, -manslaughter, -buy_receive_possess_stolen_property, -larceny, -motor_vehicle_theft) %>% 
  dplyr::left_join(cnty_df)
Parsed with column specification:
cols(
  State = col_character(),
  state = col_character(),
  year = col_integer(),
  county = col_character(),
  crime = col_character(),
  White = col_double(),
  Black = col_double(),
  `Native American` = col_double(),
  Asian = col_double()
)
Joining, by = c("state", "county")
unique(florida$county[which(is.na(florida$FIPS))])
[1] "osecola"

Fixing missing counties

Osecola county was not recognized. Entering it manually.

florida <- florida %>% 
  dplyr::mutate(
    FIPS=replace(FIPS, which(county=="osecola"), "`12097")
  )
unique(florida$county[which(is.na(florida$FIPS))])
character(0)

Adding Florida to the UCR_Race dataset.

uVars <- names(ucr_race)
florida <- florida[,which(names(florida) %in% uVars)]
ucr_race <- ucr_race %>% 
  dplyr::bind_rows(florida)

7. NYC Arrest Data

The UCR Data only contains information for Manhattan, NY - FIPS code 36061. Manhattan is also called ‘New York County.’ It does not include data from: Kings County (Brooklyn, 36047), Queens (36081), The Bronx (36005), Staten Island (36085)

The ORI codes for which arrest data are reported in the UCR for FIPS 36061 includes: - NY03083 (NYC METRO TRNSPRTN AUTH) - NY330SS (SP: NEW YORK COUNTY) - NY03031 (ST PRK: NEW YORK CITY RE)

Further, the only ORI7 code listed in the UCR files as Group “1A”, indicating a city larger than 1 Million, is NY03030, (in the CrossWalk identified as “NEW YORK CITY POLICE DEPARTMENT”), which contains only missing values, i.e. no arrests are reported. Right now, ZERO murders are reported for NYC for example, from 2013-2015. The Law Enforcement Management and Administrative Statistics (LEMAS) data also reports police officers for ORI7 district NY03030.

To resolve these problems I aggregate NYC into one county across all datasets using county code 36061 (Manhattan or New York County). Because arrest data are incomplete from the UCR, I use the New York City Police Department’s “Crime and Enforcement Activity in New York City” report, available from: http://www.nyc.gov/html/nypd/html/analysis_and_planning/crime_and_enforcement_activity.shtml

Specifically, I use the file “Enforcement Report Data Tables 2008-2015”, http://www.nyc.gov/html/nypd/downloads/zip/analysis_and_planning/crime_and_enforcement_report_data_tables_2008-2015_2016-02.zip

Unlike the UCR Data, the NYC arrest data distinguishes Hispanics and Non-Hispanic Whites. To make this data consistent with the UCR data, I collapse Hispanic and White into a single ‘White’ racial category.

#setwd("createData")
nyc <- read_csv("createData/nycArrests.csv") %>% 
  dplyr::filter(year<2015) %>% 
  dplyr::mutate(
    Race=replace(Race, which(Race=="Hispanic"), "White"),
    FIPS="36061",
    State="NEW YORK"
  ) %>% 
  dplyr::group_by(year, State, FIPS, Race) %>% 
  dplyr::summarise_all(sum, na.rm=T)
Parsed with column specification:
cols(
  year = col_integer(),
  Race = col_character(),
  murder = col_integer(),
  rape = col_integer(),
  robbery = col_integer(),
  assault = col_integer(),
  ViolentCrime = col_integer(),
  theft = col_integer()
)
nyFips <- c("36061", "36047", "36081", "36005", "36085")
ucr_race <- ucr_race %>% 
  dplyr::filter(!(FIPS %in% nyFips)) %>% 
  dplyr::bind_rows(nyc)

8. Create Race Crime Rate Variables

Attach county-level Census Population Data

# if(exists("census_county")==FALSE){
#   census_county <- fst::read.fst("createData/census_county.csv")
# }
census_race <- census_county %>% 
  dplyr::ungroup() %>% 
  dplyr::filter(gender == "Both", age == "All") %>% 
  dplyr::select(-gender, -age, -state, -county)
ucr_race2 <- ucr_race 
##Fixing fips that begin with 0
for(i in 1:nrow(ucr_race2)){
  if(nchar(ucr_race2$FIPS[i])==4){
    ucr_race2$FIPS[i] <- paste0("`0", as.character(ucr_race2$FIPS[i]))
  } else {
  if(nchar(ucr_race2$FIPS[i])==5){
      ucr_race2$FIPS[i] <- paste0("`", as.character(ucr_race2$FIPS[i]))
    } 
  }
}
  
ucr_race2 <- ucr_race2 %>% dplyr::left_join(census_race)
Joining, by = c("year", "FIPS", "Race")

Creating Crime Rate Variables by Race

crimeVars <- c("murder", "vc", "dui", "vandal", "theft", "drugs")
#florida_fips <- unique(ucr_race2$FIPS[which(ucr_race2$State=="FLORIDA")])
UCRData <-  ucr_race2 %>% 
  dplyr::group_by(year, FIPS) %>% 
    #mutate(State=stateFromLower(tolower(State), rev=T)) %>% 
    ##selecting, changing names, dropping all other vars
    dplyr::transmute(
      Race=Race, 
      murder=murder, 
      vc=ViolentCrime, 
      dui=dui, 
      vandal=vandalism, 
      theft=theft, 
      drugs=drugs,
      #drug_poss=drug_poss, 
      #drug_sale=drug_sale, 
      popUCR.c=population) %>% 
  dplyr::group_by(year, FIPS) %>% 
  mutate_at(crimeVars, funs(replace(., is.na(.), 0))) %>% ##Converting missing to zeros for counties reporting to UCR
  ##Frequencies
  mutate_at(crimeVars, funs("white" = .[which(Race=="White")])) %>% 
  mutate_at(crimeVars, funs("black" = .[which(Race=="Black")])) %>% 
  mutate_at(crimeVars, funs("asian" = .[which(Race=="Asian")])) %>% 
  mutate_at(crimeVars, funs("amerindian" = .[which(Race=="Native American")])) %>% 
  ##Rates 
  mutate_at(crimeVars, funs("white_r" = .[which(Race=="White")] / popUCR.c[which(Race=="White")])) %>% 
  mutate_at(crimeVars, funs("black_r" = .[which(Race=="Black")] / popUCR.c[which(Race=="Black")])) %>% 
  mutate_at(crimeVars, funs("asian_r" = .[which(Race=="Asian")] / popUCR.c[which(Race=="Asian")])) %>% 
  mutate_at(crimeVars, funs("amerindian_r" = .[which(Race=="Native American")] / popUCR.c[which(Race=="Native American")])) %>% 
  
  ##Now convert original crime vars to "Total" - all races. Will not be labelled
  mutate_at(crimeVars, funs(sum(as.numeric(.), na.rm=T))) %>% 
  mutate_at(crimeVars, funs("r" = sum(as.numeric(.), na.rm=T) / sum(as.numeric(popUCR.c), na.rm=T))) %>% ##Total 
 ##create average over years
dplyr::group_by(FIPS) %>% 
  ##Averaged aross 2011-2015  
  mutate_if(is.numeric, funs("5yr" = mean(., na.rm=T))) %>%
  mutate_if(is.numeric, funs(replace(., is.nan(.), NA))) %>% 
  mutate_if(is.numeric, funs(replace(., is.infinite(.), NA))) 
Adding missing grouping variables: `year`, `FIPS`
##Save File, so this code in above chunks can be skipped.
##readr::write_csv(UCRData, path="createData/UCRData.csv")
fst::write.fst(UCRData, path="createData/UCRData.csv")   
  

IV. SEDA Variables - ELA, Math, Segregation, Gini, SES composite

Sean F. Reardon, Demetra Kalogrides, Andrew Ho, Ben Shear, Kenneth Shores, Erin Fahle. (2016). Stanford Education Data Archive (Version 1.1 File Title). http://purl.stanford.edu/db586ns4974.

File - district covariates by year and grade (long file).csv

1. Load SES and Frequency of Students Variables - Averaging across years

Converted files to fst format for faster loading.

totenrl = Number of Students in Grade

nsch = Number of Schools in the District

NYC schools are already aggregated, but no county FIPS codes are provided. This has to be done manually to avoid losing all NYC cases. .* leaid = “3620580” .* leaname = “NEW YORK CITY PUBLIC SCHOOLS”

Notes on variables: .* ses_all: “computed as the first principal component factor score of the following measures: median income, percent with a bachelor’s degree or higher, poverty rate, SNAP rate, single mother headed household rate, and unemployment rate”; does not vary by grade or year; original data source is ACS.

.* “theil_whiteBlack” is labelled “hswhtblk” in the SEDA dataset: Information index between schools: White/Black. “the information theory index is computed as the average deviation of each student’s school racial diversity from the district-wide racial diversity. Values of 0 indicate no segregation while values of 1 indicate complete segregation. See Theil (1972) for more informaiton”

.* gini_all “computed using counts of people in each of 16 income categories; using the rpme – Robust Pareto midpoint estimator– implemented in Stata by von Hippel and Powers”

##  "district covariates by year and grade (long file).csv"
#seda_ses <- readr::read_csv(choose.files())
seda_ses <- fst::read.fst("createData/seda_ses_fst.csv") 
seda_ses_covars <- seda_ses %>%
dplyr::mutate(countyid = replace(countyid, which(leaname == "NEW YORK CITY PUBLIC SCHOOLS" | leaid == "3620580"), "36061")) %>% 
dplyr::group_by(leaid)  %>% 
dplyr::summarise(
  FIPS = first(countyid),
  ##all students, across all grades - used for weighted averaging ses vars
  totalStudents_white = sum(wht, na.rm=T),
  totalStudents_black = sum(blk, na.rm=T),
  totalStudents_asian = sum(asn, na.rm=T),
  totalStudents_hisp = sum(hsp, na.rm=T),
  totalStudents_amer = sum(ind, na.rm=T),
  totalStudents_all = sum(totenrl, na.rm=T),
  seda_6thGraders_total = sum(totenrl[which(grade == 6)], na.rm=T),
  seda_7thGraders_total = sum(totenrl[which(grade == 7)], na.rm=T),
  seda_8thGraders_total = sum(totenrl[which(grade == 8)], na.rm=T),
  
  ses_all = weighted.mean(sesall[which(!is.na(totalStudents_all))], 
                          w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  theil_whiteBlack = weighted.mean(hswhtblk[which(!is.na(totalStudents_all))], 
                                   w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  gini_all = weighted.mean(giniall[which(!is.na(totalStudents_all))], 
                           w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T)
  ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 

2. 6th & 8th grade ELA and Math scores

SEDA File: “district means national-referenced by year grade subject (long file)_v1_1.csv" File description: “This file contains district level means in NAEP-referenced units. Estimates are comparable between states. There are multiple observations per district; one for each year, grade and subject.” File: MeanG_V1.1

ELA scores for 6th and 8th graders, averaged across district and then county across years 2009-2013.

Weighted mean function returns missing values if values for weights (grade-specific populations) are missing, so filtering to cases where number of students in respective grade is reported.

#setwd("createData")
seda_scores <- read_csv("createData/district means national-referenced by year grade subject (long file)_v1_1.csv") %>% 
  dplyr::filter(grade %in% c("08", "06")) %>% 
  dplyr::mutate(
    leaid = ifelse(nchar(leaid) == 6, paste0("0", as.character(leaid)), as.character(leaid))
  ) %>% 
  dplyr::group_by(leaid) %>% 
  dplyr::summarise(
    ELA_8thgrade = mean(mean_link_ela[which(grade=="08")], na.rm=T),
    MATH_8thgrade = mean(mean_link_math[which(grade=="08")], na.rm=T),
    ##6th grade has a lot fewer missing values - e.g. Los Angeles districts didn't report 8th grade data
    ELA_6thgrade = mean(mean_link_ela[which(grade=="06")], na.rm=T),
    MATH_6thgrade = mean(mean_link_math[which(grade=="06")], na.rm=T)
    ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 

3. 6th and 7th grade, race gaps in ELA and math

Variables:

.* gapblk_ela = white-black gap in ela .* gapblk_math = white-black gap in math

Number of non-missing cases for ELA gap: .* 8th grade - 10099 .* 7th grade - 10359 .* 6th grade - 10172

Number of non-missing caes for math gap: .* 8th grade - 9280 .* 7th grade - 9350 .* 6th grade - 10309

Using 7th grade ELA and 6th grade math gap estimates.

seda_gaps <- read_csv("createData/district gaps by year grade subject (long file)_v1_1.csv") %>% 
  dplyr::filter(grade %in% c("07","06")) %>% 
  dplyr::mutate(
    leaid = ifelse(nchar(leaid) == 6, paste0("0", as.character(leaid)), as.character(leaid))
  ) %>% 
  dplyr::group_by(leaid) %>% 
  dplyr::summarise(
    ELA_7thgrade_gapblk = mean(gapblk_ela[which(grade=="07")], na.rm=T),
    MATH_6thgrade_gapblk = mean(gapblk_math[which(grade=="06")], na.rm=T)
    ELA_7thgrade_gapblk = replace(ELA_7thgrade_gapblk, is.nan(ELA_7thgrade_gapblk), NA),
    MATH_6thgrade_gapblk = replace(MATH_6thgrade_gapblk, is.nan(MATH_6thgrade_gapblk), NA)
  ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 
Parsed with column specification:
cols(
  leaid = col_integer(),
  leaname = col_character(),
  fips = col_integer(),
  stateabb = col_character(),
  year = col_integer(),
  grade = col_character(),
  gapblk_ela = col_double(),
  gapseblk_ela = col_double(),
  gaphsp_ela = col_double(),
  gapsehsp_ela = col_double(),
  gapblk_math = col_double(),
  gapseblk_math = col_double(),
  gaphsp_math = col_double(),
  gapsehsp_math = col_double(),
  racepercent_ela = col_double(),
  racepercent_math = col_double(),
  fipst = col_integer()
)

4. Combine and Aggregate by County

Below the school districts are aggregated by taking their weighted means, weighting by the number of students in each district for each respective grade.

seda <- dplyr::left_join(seda_ses_covars, seda_scores, by=c("leaid")) %>% 
  dplyr::left_join(seda_gaps, by=c("leaid")) %>% 
  dplyr::group_by(leaid) %>% 
  ##Correction for NYC - aggregating
  dplyr::mutate(
    FIPS = paste0("`", FIPS)
  ) %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::summarise(
    MATH_6thgrade_gap = weighted.mean(MATH_6thgrade_gapblk[which(!is.na(seda_6thGraders_total))], 
                                         w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    ELA_7thgrade_gap = weighted.mean(ELA_7thgrade_gapblk[which(!is.na(seda_7thGraders_total))], 
                                        w=seda_7thGraders_total[which(!is.na(seda_7thGraders_total))], na.rm=T),
    
    ELA_8thgrade = weighted.mean(ELA_8thgrade[which(!is.na(seda_8thGraders_total))], 
                                 w=seda_8thGraders_total[which(!is.na(seda_8thGraders_total))], na.rm=T),
    
    ELA_6thgrade = weighted.mean(ELA_6thgrade[which(!is.na(seda_6thGraders_total))], 
                                 w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    MATH_8thgrade = weighted.mean(MATH_8thgrade[which(!is.na(seda_8thGraders_total))], 
                                 w=seda_8thGraders_total[which(!is.na(seda_8thGraders_total))], na.rm=T),
    
    MATH_6thgrade = weighted.mean(MATH_6thgrade[which(!is.na(seda_6thGraders_total))], 
                                 w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    
  ses_all = weighted.mean(ses_all[which(!is.na(totalStudents_all))], 
                          w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  theil_whiteBlack = weighted.mean(theil_whiteBlack[which(!is.na(totalStudents_all))], 
                                   w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  gini_all = weighted.mean(gini_all[which(!is.na(totalStudents_all))], 
                           w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T)
  

V. ACS (American Community Survey)

1. Median Income by Race ACS (American Community Survey)

Further income data, disaggregated by race, are taken from the American Community Survey, “2011-2015 American Community Survey 5-Year Estimates”, available online at https://factfinder.census.gov/faces/affhelp/jsf/pages/metadata.xhtml?lang=en&type=dataset&id=dataset.en.ACS_15_5YR

Options:

  • Median Earnings, or
  • Median Household Income, or
  • Median Family Income, or
  • per capita income

Race-specific median family incomes by county were retrieved from the following files:

  1. [White] B19113A - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (WHITE ALONE HOUSEHOLDER).

  2. [Black] B19113B - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (BLACK OR AFRICAN AMERICAN ALONE HOUSEHOLDER)

  3. [White Non-Hispanic] B19113H - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (WHITE ALONE, NOT HISPANIC OR LATINO HOUSEHOLDER)

  4. [Hispanic] B19113I - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (HISPANIC OR LATINO HOUSEHOLDER)

  5. [All] B19113 - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS)

Aggregating the NYC burroughs using the median incomes of median incomes for each burrough.

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
inc_all <- read_csv("createData/ACS_medianIncome.csv") %>% 
  dplyr::mutate(
    Race = "Total"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  medianIncome = col_double()
)
inc_white <- read_csv("createData/ACS_medianIncome_white.csv") %>% 
  dplyr::mutate(
    Race = "White"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",") 
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  medianIncome = col_integer()
)
inc_black <- read_csv("createData/ACS_medianIncome_black.csv") %>% 
    dplyr::mutate(
    Race = "Black"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  medianIncome = col_number()
)
inc_white_nh <- read_csv("createData/ACS_medianIncome_white_NH.csv") %>% 
    dplyr::mutate(
    Race = "White_NH"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  medianIncome = col_integer()
)
inc_hisp <- read_csv("createData/ACS_medianIncome_hispanic.csv") %>% 
    dplyr::mutate(
    Race = "Hispanic"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",") 
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  medianIncome = col_number()
)
inc_df <- dplyr::bind_rows(inc_all, inc_white, inc_white_nh, inc_black, inc_hisp) %>% 
  tidyr::spread(Race, medianIncome) %>% 
    dplyr::rename(
    medianIncome = Total,
    medianIncome_white = White,
    medianIncome_black = Black,
    medianIncome_white_nh = White_NH,
    medianIncome_hisp = Hispanic
  ) %>% 
dplyr::select(-county, -stateName)
##Fixing fips that begin with 0
for(i in 1:nrow(inc_df)){
  if(nchar(inc_df$fips[i])==4){
    inc_df$fips[i] <- paste0("`0", as.character(inc_df$fips[i]))
  } 
  if(nchar(inc_df$fips[i])==5){
    inc_df$fips[i] <- paste0("`", as.character(inc_df$fips[i]))
  } 
  
}
inc_df <- inc_df %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    dplyr::group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

2. Poverty. ACS (American Community Survey)

Poverty [Total and by Race] S1702 - POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILIES

– HC02_EST_VC01 - All families - Percent below poverty level; Estimate; Families

– HC02_EST_VC09 - All families - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is White alone"

– HC02_EST_VC10 - All families - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is Black or African American alone

– HC02_EST_VC11 - All families - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is American Indian and Alaska Native alone

–HC02_EST_VC12 - All families - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is Asian alone

– HC02_EST_VC13 - All families - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is Native Hawaiian and Other Pacific Islander alone

– HC02_EST_VC17 - All families - Percent below poverty level; Estimate; Hispanic or Latino origin (of any race)

– HC02_EST_VC18 - All families - Percent below poverty level; Estimate; White alone, not Hispanic or Latino

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
pov <- read_csv("createData/ACS_poverty.csv") %>% 
  dplyr::select(-county)
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  poverty = col_double(),
  poverty_white = col_double(),
  poverty_black = col_double(),
  poverty_amerindian = col_double(),
  poverty_asian = col_double(),
  poverty_hawaiian = col_double(),
  poverty_hispanic = col_double(),
  poverty_white_nh = col_double()
)
##Fixing fips that begin with 0
for(i in 1:nrow(pov)){
  if(nchar(pov$fips[i])==4){
    pov$fips[i] <- paste0("`0", as.character(pov$fips[i]))
  } 
  if(nchar(pov$fips[i])==5){
    pov$fips[i] <- paste0("`", as.character(pov$fips[i]))
  } 
  
}
pov <- pov %>% 
   dplyr::rename(FIPS=fips) %>% 
    dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

3. Gini Index ACS (American Community Survey)

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
gini <- read_csv("createData/ACS_gini_5yr.csv") %>% 
  dplyr::select(-county)
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  gini = col_double()
)
##Fixing fips that begin with 0
for(i in 1:nrow(gini)){
  } 
  if(nchar(gini$fips[i])==5){
    gini$fips[i] <- paste0("`", as.character(gini$fips[i]))
  } 
}
gini <- gini %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

4. Single-Parent-Headed Households % ACS (American Community Survey)

– B11001 - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE)”

– B11001A - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (WHITE ALONE)”

– B11001B - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (BLACK OR AFRICAN AMERICAN ALONE)”

– B11001C - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (AMERICAN INDIAN AND ALASKA NATIVE ALONE)”

– B11001D - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (ASIAN ALONE)”

– B11001E - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)”

– B11001H - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (WHITE ALONE, NOT HISPANIC OR LATINO)”

– B11001I - “HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (HISPANIC OR LATINO)”

From each dataset two variables are extracted:

  1. “Estimate; Family households” [HD01_VD02]

  2. “Estimate; Family households: - Other family: - Female householder, no husband present” [HD01_VD06]

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
sp_total <- read_csv("createData/ACS_singleParent_total.csv") %>% 
  dplyr::mutate(singleParent_prcnt = singleParent/totalFamilies)
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  totalFamilies = col_integer(),
  singleParent = col_integer()
)
sp_white <- read_csv("createData/ACS_singleParent_white.csv") %>% 
    dplyr::mutate(
      singleParent_prcnt_white = ifelse(families_white>0,
                                        singleParent_white/families_white,0)
    )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_white = col_integer(),
  singleParent_white = col_integer()
)
sp_white_nh <- read_csv("createData/ACS_singleParent_white_nh.csv") %>% ##nonhispanic white
      dplyr::mutate(
        singleParent_prcnt_white_nh = ifelse(families_white_nh > 0,
                                             singleParent_white_nh/families_white_nh, 0)
      )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_white_nh = col_integer(),
  singleParent_white_nh = col_integer()
)
sp_black <- read_csv("createData/ACS_singleParent_black.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_black = ifelse(families_black > 0,
                                          singleParent_black/families_black,  0)
)
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_black = col_integer(),
  singleParent_black = col_integer()
)
      dplyr::mutate(
        singleParent_prcnt_amerindian = ifelse(families_amerindian>0,
                                               singleParent_amerindian/families_amerindian, 0)
      )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_amerindian = col_integer(),
  singleParent_amerindian = col_integer()
)
sp_asian <- read_csv("createData/ACS_singleParent_asian.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_asian = ifelse(families_asian>0,
                                          singleParent_asian/families_asian, 0)
      )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_asian = col_integer(),
  singleParent_asian = col_integer()
)
sp_hawaiian <- read_csv("createData/ACS_singleParent_hawaiian.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_hawaiian = ifelse(families_hawaiian>0,
                                             singleParent_hawaiian/families_hawaiian,  0)
      )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_hawaiian = col_integer(),
  singleParent_hawaiian = col_integer()
)
        singleParent_prcnt_hispanic = ifelse(families_hispanic>0,
                                             singleParent_hispanic/families_hispanic, 0)
      )
Parsed with column specification:
cols(
  fips = col_integer(),
  county = col_character(),
  families_hispanic = col_integer(),
  singleParent_hispanic = col_integer()
)
singleParent <- dplyr::left_join(sp_total, sp_white) %>% 
  dplyr::left_join(sp_white_nh) %>% 
  dplyr::left_join(sp_black) %>% 
  dplyr::left_join(sp_amerindian) %>% 
  dplyr::left_join(sp_asian) %>% 
  dplyr::left_join(sp_hawaiian) %>% 
  dplyr::left_join(sp_hispanic) %>% 
  dplyr::select(-county)
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
Joining, by = c("fips", "county")
  
##Fixing fips that begin with 0
for(i in 1:nrow(singleParent)){
  if(nchar(singleParent$fips[i])==4){
    singleParent$fips[i] <- paste0("`0", as.character(singleParent$fips[i]))
  } 
  } 
}
singleParent <- singleParent %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

5. Educational Attainment (less than high school %) ACS (American Community Survey)

EDUCATIONAL ATTAINMENT [Total and by Race] S1501 - EDUCATIONAL ATTAINMENT

HC02_EST_VC03 - “Percent; Estimate; Population 18 to 24 years - Less than high school graduate”

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
hs <- read_csv("createData/ACS_highschool.csv")
Parsed with column specification:
cols(
  fips = col_integer(),
  highschoolDropOut = col_double()
)
  if(nchar(hs$fips[i])==4){
    hs$fips[i] <- paste0("`0", as.character(hs$fips[i]))
  } 
  if(nchar(hs$fips[i])==5){
    hs$fips[i] <- paste0("`", as.character(hs$fips[i]))
  } 
}
hs <- hs %>% 
dplyr::rename(FIPS=fips) %>% 
dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

6. Crowing (ACS) (Occupants per room) (ACS)

[All Races, by Owner or Renter] B25014 - TENURE BY OCCUPANTS PER ROOM To calculate “crowding” we esimate the percentage of households with 1.01 or more occupants per room. We include owners and renters. We use the following variables:

  • HD01_VD01 - “Estimate; Total:”

  • HD01_VD05 - “Estimate; Owner occupied: - 1.01 to 1.50 occupants per room”

  • HD01_VD06 - “Estimate; Owner occupied: - 1.51 to 2.00 occupants per room”

  • HD01_VD07 - “Estimate; Owner occupied: - 2.01 or more occupants per room”

  • HD01_VD11 - “Estimate; Renter occupied: - 1.01 to 1.50 occupants per room”

  • HD01_VD12 - “Estimate; Renter occupied: - 1.51 to 2.00 occupants per room”

  • HD01_VD13 - “Estimate; Renter occupied: - 2.01 or more occupants per room”

#setwd("createData")
crowd <- read_csv("createData/ACS_crowding.csv")
Parsed with column specification:
cols(
  fips = col_integer(),
  crowding_prcnt = col_double()
)
for(i in 1:nrow(crowd)){
  if(nchar(crowd$fips[i])==4){
    crowd$fips[i] <- paste0("`0", as.character(crowd$fips[i]))
  } 
  if(nchar(crowd$fips[i])==5){
    crowd$fips[i] <- paste0("`", as.character(crowd$fips[i]))
  } 
  
crowd <- crowd %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  dplyr::summarise_all(median, na.rm=T)

7. Residential Segregation - CountyHealthRankings.org from ACS 2016-2017

Averaged across 2016 and 2017 countyhealthrankings.org, “Residential segregation—black/white, 2011-2015, source=ACS”

Description: http://www.countyhealthrankings.org/measure/residential-segregation-blackwhite

“Racial/ethnic residential segregation refers to the degree to which two or more groups live separately from one another in a geographic area. The index of dissimilarity is a demographic measure of the evenness with which two groups (black and white residents, in this case) are distributed across the component geographic areas (census tracts, in this case) that make up a larger area (counties, in this case). The index score can be interpreted as the percentage of either black or white residents that would have to move to different geographic areas in order to produce a distribution that matches that of the larger area.”

#setwd("createData")
seg <- readr::read_csv("createData/segregation_countyhealth.csv") %>% 
  group_by(fips) %>% 
  dplyr::select(-State, -County, -year, -X1) %>% 
  dplyr::summarise_all(mean, na.rm=T) %>% 
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_integer(),
  fips = col_character(),
  State = col_character(),
  County = col_character(),
  residential_segregation_black_white = col_integer(),
  residential_segregation_nonwhite_white = col_integer(),
  year = col_integer()
)

VI. BLS Data - Employment (2011-2015 average)

Employment data are derived from the Bureau of Labor Statistics, “Labor Force Data by County, 2015 Annual Averages”. The Local Area Unemployment Statistics (LAUS) are provided at: averages https://www.bls.gov/lau/#tables Specifically, I downloaded the aggregated the following excel files:

  1. Labor force data by county, 2015 annual averages https://www.bls.gov/lau/laucnty15.xlsx

  2. Labor force data by county, 2014 annual averages https://www.bls.gov/lau/laucnty14.xlsx

  3. Labor force data by county, 2013 annual averages https://www.bls.gov/lau/laucnty13.xlsx

  4. Labor force data by county, 2012 annual averages https://www.bls.gov/lau/laucnty12.xlsx

  5. Labor force data by county, 2011 annual averages https://www.bls.gov/lau/laucnty11.xlsx

#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
blsVars <- c("laborForce", "employed", "unemployed", "unemploymentRate")
bls <- read_csv("createData/blsData_2011_2015.csv") %>% 
  dplyr::mutate(
    stateCode = paste0("`", stateCode),
    fips = paste0(stateCode, countyCode),
    fips = replace(fips, which(fips %in% nyFips), "`36061")
  ) %>% 
  dplyr::group_by(fips, year) %>% 
dplyr::group_by(year, fips) %>% 
  mutate_all(sum, na.rm=T) %>%  ##aggregating NYC
  mutate(unemploymentRate = unemployed/laborForce) %>% 
  mutate_at(blsVars, funs("5yr" = mean(., na.rm=T))) %>% 
  mutate_at(blsVars,funs("change" = mean((.[which(year==2015)] - .[which(year==2011)]) / .[which(year==2011)], na.rm=T))) %>% 
  dplyr::filter(year==2015) %>% 
  dplyr::select(-year) %>% 
Parsed with column specification:
cols(
  stateCode = col_character(),
  countyCode = col_character(),
  countyName = col_character(),
  year = col_integer(),
  laborForce = col_integer(),
  employed = col_integer(),
  unemployed = col_integer()
)

VII. POLICE DATA (LEMAS)

Law Enforcement Management and Administrative Statistics (LEMAS) Series from ICPSR (2013). https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/36164

# Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158)
# https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158
nyFips <- c("36061", "36047", "36081", "36005", "36085")
#setwd("createData")
ucr_geo <-  read_tsv("createData/35158-0001-Data.tsv") %>% 
dplyr::select(ORI7, FIPS )
Parsed with column specification:
cols(
  .default = col_integer(),
  FIPS_ST = col_character(),
  FIPS_COUNTY = col_character(),
  FIPS = col_character(),
  ORI9 = col_character(),
  ORI7 = col_character(),
  NAME = col_character(),
  STATENAME = col_character(),
  COUNTYNAME = col_character(),
  UANAME = col_character(),
  LG_NAME = col_character(),
  ADDRESS_NAME = col_character(),
  ADDRESS_STR1 = col_character(),
  ADDRESS_STR2 = col_character(),
  ADDRESS_CITY = col_character(),
  ADDRESS_STATE = col_character(),
  LEMAS_ID = col_character(),
  U_POPGRP = col_character(),
  COMMENT = col_character(),
  INTPTLAT = col_double(),
  INTPTLONG = col_double()
  # ... with 3 more columns
)
See spec(...) for full column specifications.
police <- haven::read_stata("createData/36164-0001-Data.dta") %>% 
  dplyr::select(
                ORI7, 
                PERS_RESP_PATRL, #primary duty - patrol
                PERS_RESP_INVST, #primary duty - investigative
                FTSWORN,  #TOTAL NUMBER OF SWORN PERSONNEL - fulltime
                PERS_FTS_BLK, #TOTAL NUMBER OF SWORN PERSONNEL - fulltime - Black
                COM_MIS, #COMMUNITY POLICING COMPONENT IN MISSION STATEMENT
                ## 1 = No written mission statement
                ## 2 = Written statement, no community policing component
                #FINALWT #FINAL WEIGHT FOR EACH STRATUM
                ) %>% 
    policeFullTime = FTSWORN,
    policeFemale = PERS_PDSW_FFT,
    policeBlack = PERS_FTS_BLK,
    policePatrol=PERS_RESP_PATRL, 
    gunShotDetection=TECH_TYP_GUN) %>% 
  dplyr::left_join(ucr_geo) %>% 
  mutate_all(funs(replace(., is.nan(.), NA))) %>% 
    communityPolice=replace(communityPolice, which(communityPolice==3), TRUE),
    communityPolice=replace(communityPolice, which(communityPolice != 3), FALSE),
    gunShotDetection=replace(gunShotDetection, which(gunShotDetection == 1), TRUE),
  group_by(State, FIPS) %>% 
  dplyr::summarise(
    communityPolice=mean(communityPolice, na.rm=T),
    gunShotDetection=mean(gunShotDetection, na.rm=T),
    policeFullTime=sum(policeFullTime, na.rm=T),
  ) %>% 
  dplyr::mutate(
    policeFullTime_log=log(policeFullTime),
    policePrcntBlack = 100*policeBlack/policeFullTime,
    policePatrol_log = log(1+policePatrol),
    policeInvestigate_log = log(1+policeInvestigate)
  ) %>% 
  ungroup() %>% 
  dplyr::filter(!is.na(FIPS)) %>% 
  dplyr::mutate(FIPS=paste("`", as.character(FIPS), sep="")) %>% 
  dplyr::select(-State)
Joining, by = "ORI7"
Column `ORI7` has different attributes on LHS and RHS of join
  
#fst::write.fst(police, path="police.csv")

VIII. Land Area (Census)

Land Area estimates from: https://www.census.gov/support/USACdataDownloads.html

‘Land Area’ file “LND01.xls”.
Variable: LND110210D, “Land area in square miles 2010”

nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
library(dplyr)
area <- readr::read_csv("createData/LND01.csv") %>% 
  dplyr::select(Areaname, STCOU, LND110210D) %>% 
  dplyr::mutate(
    stateCode = substr(STCOU, start=3, stop=5)
  ) %>% 
  dplyr::filter(stateCode != "000") %>%   ##getting rid of state totals
  dplyr::transmute(
    FIPS = paste0("`", STCOU),
    landArea = LND110210D
  ) %>% 
  dplyr::filter(landArea > 0) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::summarise(
    landArea = sum(landArea, na.rm=TRUE)
  )
##eliminating 3 cases that aren't in the other data files with size 0
#Clifton Forge, VA, `51560 %>% 
#South Boston, VA, `51780

IX. Combine all Variables

if(exists("UCRData")==FALSE){
  UCRData <- fst::read.fst("createData/UCRData.csv")
}
countyVars <- census_vars %>% 
  dplyr::left_join(seda) %>%
  dplyr::left_join(police) %>% 
  dplyr::left_join(area) %>% 
  dplyr::left_join(bls) %>% 
  dplyr::left_join(inc_df) %>% 
  dplyr::left_join(pov) %>% 
  dplyr::left_join(gini) %>% 
  dplyr::left_join(singleParent) %>% 
  dplyr::left_join(hs) %>% 
  dplyr::left_join(crowd) %>% 
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
Joining, by = "FIPS"
  
fst::write.fst(countyVars, path="countyVars_f.csv")
write.csv(countyVars, file="countyVars.csv")
  

X. Principle Components

1. CRIME

if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 
##vars <- fst::read.fst("countyVars.csv")
totalCrime <- countyVars[,c("FIPS", "murder_r_5yr", "dui_r_5yr", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr")]
totalCrime <- totalCrime[complete.cases(totalCrime),] 
pc_crime <-  prcomp(~ murder_r_5yr + vc_r_5yr + dui_r_5yr  + vandal_r_5yr + theft_r_5yr + drugs_r_5yr, 
                            data=totalCrime,
                            scale. = T, center=TRUE,
                            na.action=na.omit)
totalCrime$pc1Crime <- pc_crime$x[,1]
totalCrime$pc2Crime <- pc_crime$x[,2]
totalCrime <- totalCrime[,c("FIPS", "pc1Crime", "pc2Crime")]
countyVars <- dplyr::left_join(countyVars, totalCrime)
Joining, by = "FIPS"
##print PCA analysis from FactoMineR
pcFit <- FactoMineR::PCA(countyVars[,c("murder_r_5yr",  "dui_r_5yr", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr")])
Missing values are imputed by the mean of the variable: you should use the imputePCA function of the missMDA package

summary(pcFit)

Call:
FactoMineR::PCA(X = countyVars[, c("murder_r_5yr", "dui_r_5yr",  
     "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr")]) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
Variance               3.178   0.921   0.714   0.496   0.390   0.302
% of var.             52.963  15.346  11.895   8.264   6.497   5.035
Cumulative % of var.  52.963  68.310  80.204  88.468  94.965 100.000

Individuals (the 10 first)
                 Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
1            |  1.224 | -0.232  0.001  0.036 | -0.387  0.005  0.100 |  0.526  0.012  0.185 |
2            |  1.088 | -0.634  0.004  0.340 | -0.280  0.003  0.066 |  0.576  0.015  0.280 |
3            |  1.396 | -0.211  0.000  0.023 | -0.545  0.010  0.152 |  1.050  0.049  0.565 |
4            |  2.017 |  0.035  0.000  0.000 | -0.468  0.008  0.054 |  1.899  0.161  0.886 |
5            |  2.898 | -0.247  0.001  0.007 | -0.851  0.025  0.086 |  2.592  0.300  0.800 |
6            |  1.807 | -0.819  0.007  0.205 | -0.447  0.007  0.061 |  1.498  0.100  0.687 |
7            |  4.847 |  1.910  0.037  0.155 | -1.611  0.090  0.110 |  3.889  0.675  0.644 |
8            |  1.183 | -0.511  0.003  0.187 | -0.456  0.007  0.148 |  0.669  0.020  0.319 |
9            |  3.767 |  0.965  0.009  0.066 | -1.275  0.056  0.115 |  3.304  0.487  0.769 |
10           |  1.422 |  0.245  0.001  0.030 | -0.097  0.000  0.005 |  0.871  0.034  0.375 |

Variables
                Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
murder_r_5yr |  0.647 13.173  0.419 | -0.230  0.053 |  0.682 65.266  0.466 |
dui_r_5yr    |  0.684 14.718  0.468 |  0.499 26.997  0.249 | -0.061  0.522  0.004 |
vc_r_5yr     |  0.816 20.972  0.666 | -0.201  4.377  0.040 |  0.122  2.079  0.015 |
theft_r_5yr  |  0.805 20.410  0.649 | -0.280  8.509  0.078 | -0.295 12.169  0.087 |
vandal_r_5yr |  0.782 19.233  0.611 | -0.257  7.184  0.066 | -0.374 19.607  0.140 |
drugs_r_5yr  |  0.604 11.494  0.365 |  0.659 47.180  0.434 |  0.050  0.357  0.003 |

2. Crime - Violent Crime, Theft, Vandalism only

if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 
vCrime <- countyVars[,c("FIPS", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr")]
vCrime <- vCrime[complete.cases(vCrime),] 
pc_crime <-  prcomp(~  vc_r_5yr +  vandal_r_5yr + theft_r_5yr , 
                            data=vCrime,
                            scale. = T, center=TRUE,
                            na.action=na.omit)
vCrime$pc2Crime_3 <- pc_crime$x[,2]
vCrime <- vCrime[,c("FIPS", "pc1Crime_3", "pc2Crime_3")]
countyVars <- dplyr::left_join(countyVars, vCrime)
Joining, by = "FIPS"
##print PCA analysis from FactoMineR

summary(pcFit)

Call:
FactoMineR::PCA(X = countyVars[, c("vc_r_5yr", "theft_r_5yr",  
     "vandal_r_5yr")]) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3
Variance               2.232   0.457   0.311
% of var.             74.413  15.233  10.354
Cumulative % of var.  74.413  89.646 100.000

Individuals (the 10 first)
                 Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
1            |  1.031 | -0.180  0.000  0.031 |  0.331  0.008  0.103 | -0.959  0.094  0.866 |
2            |  0.862 | -0.547  0.004  0.403 |  0.388  0.011  0.203 | -0.541 |  0.914 | -0.245  0.001  0.072 |  0.697  0.034  0.582 | -0.538  0.030  0.347 |
4            |  0.883 | -0.566  0.005  0.412 |  0.598  0.025  0.459 | -0.317  0.010  0.129 |
5            |  1.023 | -0.951  0.013  0.864 |  0.307  0.007  0.090 | -0.221  0.005  0.047 |
6            |  1.111 | -1.061  0.016  0.912 |  0.312  0.007  0.079 | -0.106  0.001  0.009 |
7            |  1.512 |  0.525  0.004  0.121 |  0.963  0.065  0.405 | -1.041  0.111  0.474 |
8            |  0.825 | -0.380  0.002  0.212 |  0.483  0.016  0.343 | -0.551  0.031  0.446 |
9            |  1.076 | -0.032  0.000  0.001 |  0.952  0.063  0.783 | -0.500  0.026  0.216 |
10           |  1.064 | -0.057  0.000  0.003 |  0.455  0.014  0.183 | -0.960  0.095  0.814 |

Variables
                Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
vc_r_5yr     |  0.832 31.013  0.692 |  0.540 63.798  0.292 |  0.127  5.189  0.016 |
theft_r_5yr  |  0.891 35.586  0.794 | -0.131  3.756  0.017 | -0.434 60.658  0.188 |
vandal_r_5yr |  0.864 33.401  0.746 | -0.385 32.446  0.148 |  0.326 34.153  0.106 |

3. SES

if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 
totalSES <- countyVars[,c("FIPS", "poverty", "singleParent", "medianIncome", "unemploymentRate_5yr")]
totalSES <- totalSES[complete.cases(totalSES),] 
pc_ses <-  prcomp(~ poverty + singleParent + medianIncome + unemploymentRate_5yr, 
                            data=totalSES,
                            scale. = T, center=TRUE,
                            na.action=na.omit)
totalSES$pc1SES <- pc_ses$x[,1]
totalSES$pc2SES <- pc_ses$x[,2]
countyVars <- dplyr::left_join(countyVars, totalSES)
Joining, by = "FIPS"
##print PCA analysis from FactoMineR
pcFit <- FactoMineR::PCA(countyVars[,c("poverty", "singleParent", "medianIncome", "unemploymentRate_5yr")])
Missing values are imputed by the mean of the variable: you should use the imputePCA function of the missMDA package

summary(pcFit)

Call:
FactoMineR::PCA(X = countyVars[, c("poverty", "singleParent",  
     "medianIncome", "unemploymentRate_5yr")]) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3   Dim.4
Variance               2.234   1.033   0.504   0.229
% of var.             55.853  25.835  12.593   5.719
Cumulative % of var.  55.853  81.688  94.281 100.000

Individuals (the 10 first)
                         Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
1                    |  0.787 | -0.739  0.008  0.881 | -0.163  0.001  0.043 |  0.215  0.003  0.074 |
2                    |  0.581 | -0.508  0.004  0.765 |  0.173  0.001  0.088 |  0.166  0.002  0.082 |
3                    |  2.186 |  2.167  0.067  0.982 |  0.040  0.000  0.000 |  0.285  0.005  0.017 |
4                    |  1.032 |  0.650 | -0.264  0.002  0.065 | -0.069  0.000  0.004 |
5                    |  0.302 |  0.025  0.000  0.007 | -0.218  0.001  0.520 | -0.201  0.003  0.443 |
6                    |  2.369 |  2.309  0.076  0.950 | -0.044  0.000  0.000 | -0.183  0.002  0.006 |
7                    |  2.263 |  2.261  0.073  0.998 | -0.024  0.000  0.000 |  0.014  0.000  0.000 |
8                    |  1.040 |  0.983  0.014  0.894 |  0.285  0.002  0.075 |  0.037  0.000  0.001 |
9                    |  1.350 |  1.337  0.025  0.981 | -0.087  0.000  0.004 | -0.166  0.002  0.015 |
10                   |  1.042 |  0.838  0.010  0.647 | -0.294  0.003  0.079 | -0.515  0.017  0.244 |

Variables
                        Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2  
poverty              |  0.910 37.059  0.828 |  0.084  0.685  0.007 | -0.203  8.203  0.041 |
singleParent         | -0.105  0.489  0.011 |  0.981 93.038  0.961 | -0.153  4.657  0.023 |
medianIncome         | -0.880 34.681  0.775 |  0.153  2.260  0.023 |  0.325 20.943  0.105 |
unemploymentRate_5yr |  0.788 27.771  0.620 |  0.204  4.016  0.042 |  0.577 66.197  0.333 |
fst::write.fst(countyVars, path="countyVars_f.csv")
write.csv(countyVars, file="countyVars.csv")

XI. Add to Police Killing Variables

1. Reload Police Killing Data (saved above)

2. Washington Post

3. Lott & Moody (2016)

4. Combining Lott & Moody (2013-2015) with 2016 data from Washington Post

5. Save Data

XII. AGGREGATION

Below I aggregate counties into geographical units of having a minimum specified population size. For example, if the threshold is 10,000, then I aggregate counties with less than 10,000 residents into a larger unit having at least 10,000 residents. The aggregation problem has many possible solutions. The algorithm I employ aggregates adjacent counties with smallest population sizes first, i.e. counties with the smallest populations in the US are selected first, then aggregated with their smallest size neighbors.

https://www.census.gov/geo/reference/county-adjacency.html

A. FUNCTIONS

1. Aggregate Counties

This is to create the index that identifies which larger regions counties belong to. There are 5 counties (or census areas) that are not included in the Census adjacency file. They in Alaska, Hawaii, and South Dakota. Their FIPS codes are: 02158, 15001, 15003, 15007, and 46102. I merge these with the smallest group within the state.

2. Aggregate.County.Data

This is to summarise all variables in the datasets used below. Because drug possession and drug sales arrest data are not provided by Florida, I’m not including these two variable sets in the aggregated data sets.

I use the adjacency table ‘countyAdjacent.csv’ I generated from the file ‘createCountyAdjacent.R’.

https://www.census.gov/geo/reference/county-adjacency.html

B. 10k Regions

C. 100k Regions

D. 1 MILLION Region

E. STATES

---
title: "Grab Data"
output: html_notebook
---


# I. POLICE SHOOTING DATA  ---------------------------------
##A.  Load Libraries
```{r, message=FALSE}
#library(Hmisc)
library(dplyr)
library(ggmap)
library(readr)


```

##B. Functions 

###1.  getStateName
This function is just a way to convert state names back and forth using base R functions, but includes washington dc and puerto rico.

```{r}


getState <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
  ##automatically decide to convert from names to states or vice versa based on majority match
  if(sum(x %in% m.abb) > sum(x %in% m.name)){
    #convert abbreviations to names
    y <- states.name[match(x,m.abb)]
  } else {
    y <- states.abb[match(x,m.name)]
  }
   return(y)
}

##returns abbreviations only
getState.abb <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
    x[which(x %in% m.abb)] <- states.abb[match(x[which(x %in% m.abb)] ,m.abb)]
    x[which(x %in% m.name)] <- states.abb[match(x[which(x %in% m.name)],m.name)]

  return(x)
}


##returns Names only
getState.names <- function(x){
  ##remove punctuation and twim white space, convert to lower case
  x <- tolower(trimws(gsub("[[:punct:]]", "", x)))
  x[which(x == "washington dc")] <- "district of columbia"
  
  states.abb <- c(state.abb, "DC", "DC", "PR")
  states.name <- c(state.name, "District of Columbia", "District of Columbia", "Puerto Rico")
  
  m.abb <- tolower(states.abb)
  m.name <- tolower(gsub("[[:punct:]]", "", states.name))
  
  x[which(x %in% m.abb)] <- states.name[match(x[which(x %in% m.abb)] ,m.abb)]
  x[which(x %in% m.name)] <- states.name[match(x[which(x %in% m.name)],m.name)]
  
  return(x)
}
```

###2. Get FIPS
```{r}
getFIPS <- function(df, cityVar="city", stateVar="State", countyVar="county", addressVar = "address"){
  require(tibble)
  require(readr)
  require(dplyr)
  require(ggmap)
  require(noncensus)
  setwd("createData")
#PART I - LOAD 3 DATA SETS    
  data(counties)
  counties <- sapply(counties, as.character) %>% tibble::as_data_frame()
  counties$FIPS <- paste0("`", as.character(counties$state_fips), as.character(counties$county_fips))
  counties$county_lower <- counties$county_name
  counties$county_lower  = tolower(trimws(gsub("[[:punct:]]", "", counties$county_lower)))
  counties$county_lower= gsub(" county| parish| borough", "", counties$county_lower, fixed = TRUE)

geoCode <- readr::read_tsv("35158-0001-Data.tsv") %>% 
  dplyr::mutate(
    State =  getState.abb(STATENAME),
    city_lower = ADDRESS_CITY,
    city_lower = gsub("st. ", "saint ", trimws(tolower(city_lower)), fixed = TRUE),
    city_lower = gsub("mt. ", "mount ", trimws(tolower(city_lower)), fixed = TRUE),
    city_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", city_lower)))),
    county_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", COUNTYNAME)))),
    FIPS = paste("`", FIPS, sep="")
  )

charNums <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")

geo_city <- readr::read_delim(file="POP_PLACES_20170601.txt", delim="|") %>% 
  dplyr::mutate(
    FIPS = paste0("`", STATE_NUMERIC, COUNTY_NUMERIC),
    city_lower1 = tolower(as.character(trimws(gsub("[[:punct:]]", "", FEATURE_NAME)))),
    city_lower2 = tolower(as.character(trimws(gsub("[[:punct:]]", "", MAP_NAME  )))),
    ##trim last 3 characters, consisting of a letter-number, e.g. D-3
    city_lower2 = ifelse(substring(city_lower2, nchar(city_lower2), nchar(city_lower2)) %in% charNums,
                         trimws(substr(city_lower2, 1, nchar(city_lower2)-3)),
                         city_lower2),
    county_lower = tolower(as.character(trimws(gsub("[[:punct:]]", "", COUNTY_NAME)))),
    county_lower =  ifelse(substring(county_lower, nchar(county_lower)-2, nchar(county_lower)) == " ca",
                           trimws(substr(county_lower, 1, nchar(county_lower)-3)),
                           county_lower)
  )

#PART II.  Load Data
if(cityVar %in% names(df) == FALSE){df[,cityVar] <- NA}
if(stateVar %in% names(df) == FALSE){df[,stateVar] <- NA}
if(addressVar %in% names(df) == FALSE){df[,addressVar] <- NA}
if(countyVar %in% names(df) == FALSE){df[,countyVar] <- NA}
if("FIPS" %in% names(df) == FALSE){df$FIPS <- NA}

#PART III.  Search Using Existing Datasets
scan4fips <- function(df_a, cityVar_a=cityVar, stateVar_a=stateVar, countyVar_a=countyVar){

#step 1 - identify counties if they are already labelled using noncensus counties file
city_vector <- df_a[,cityVar_a] %>% unlist() %>% as.character() %>% trimws() 
county_vector <- df_a[,countyVar_a] %>% unlist() %>% as.character() %>% trimws() 
  
county_index <- which(grepl(" county| parish| borough", city_vector, 
                            ignore.case = TRUE, fixed = FALSE) & is.na(county_vector))
##If "city" has "county" etc., in its title, move it to county column  
df_a[county_index,countyVar_a] <- df_a[county_index,cityVar_a]

for(i in which(is.na(df_a$FIPS) & !is.na(df_a[,countyVar_a]))){
  st <- df_a[[i,stateVar_a]]
  cnty <- df_a[[i,countyVar_a]]
  fips <- unique(counties$FIPS[which(counties$state == st & counties$county_name == cnty)])
  if(length(fips) == 1 & !is.null(fips)){
  if(!is.na(fips) & fips != "NULL") {
    df_a$FIPS[[i]] <- fips
  }} else {
  county_lower = cnty
  county_lower = tolower(trimws(gsub("[[:punct:]]", "", cnty)))
  county_lower = gsub(" county| parish| borough", "", county_lower, fixed = TRUE)
  fips <- unique(counties$FIPS[which(counties$state == st & counties$county_lower == county_lower)])
  
  if(length(fips) == 1 & !is.null(fips)){
  if(!is.na(fips) & fips != "NULL") {df_a$FIPS[[i]] <- fips}} else {
      fips1 <- (geo_city$FIPS[which(geo_city$county_lower == county_lower & geo_city$STATE_ALPHA == st)])
      fips2 <- (geoCode$FIPS[which(geoCode$State ==st & geoCode$county_lower== county_lower)])
    new_fips <- names(sort(table(c(fips, fips1, fips2)), decreasing=TRUE)[1])
    if(length(new_fips) == 1 & !is.null(new_fips)){
    if(!is.na(new_fips) & new_fips != "NULL"){df_a$FIPS[[i]] <- new_fips}}
  }
  } 
}
  
#step 2 - Use city/location
for(i in which(is.na(df_a$FIPS) & !is.na(df_a[,cityVar_a]))){
  st <- df_a[[i,stateVar_a]]
  i_city <- df_a[[i,cityVar_a]]
  city_lower = tolower(trimws(gsub("[[:punct:]]", "", i_city)))
  new_fips <- unique(geo_city$FIPS[which(geo_city$city_lower1 == city_lower & geo_city$STATE_ALPHA == st)])

  if(length(new_fips) == 1 & !is.null(new_fips)){
  if(!is.na(new_fips) & new_fips != "NULL"){
    df_a$FIPS[[i]] <- new_fips}} else {
      
  new_fips <-  unique(geoCode$FIPS[which(geoCode$State == st & geoCode$city_lower==city_lower)])
  
    if(length(new_fips) == 1 & !is.null(new_fips)){
    if(!is.na(new_fips) & new_fips != "NULL"){
      df_a$FIPS[[i]] <- new_fips}} else {
        
    new_fips <-  unique(geo_city$FIPS[which(geo_city$city_lower2 == city_lower & geo_city$STATE_ALPHA == st)])
  
        if(length(new_fips) == 1 & !is.null(new_fips)){
        if(!is.na(new_fips) & new_fips != "NULL"){
            df_a$FIPS[[i]] <- new_fips}} else {
  
  city_lower = gsub("Mt |mt. ", "mount ", trimws(i_city), fixed = TRUE)
  city_lower = gsub("st. ", "saint ", trimws(tolower(city_lower)), fixed = TRUE)
  city_lower = tolower(trimws(gsub("[[:punct:]]", "", city_lower)))
  city_lower = gsub(" county| parish| charter township| township", "", city_lower, fixed = TRUE)
  city_lower = ifelse(city_lower %in% c("coney island", "brooklyn", "manhattan", 
        "bronx", "the bronx", "queens", "staten island", "new york city") & 
          st == "NY", "new york", city_lower)
  city_lower = ifelse(st == "DC", "washington", city_lower)
  fips1 <- unique(geo_city$FIPS[which(geo_city$city_lower1 == city_lower & geo_city$STATE_ALPHA == st)])
  fips2 <- unique(geo_city$FIPS[which(geo_city$city_lower2 == city_lower & geo_city$STATE_ALPHA == st)])
  fips3 <- unique(geo_city$FIPS[which(geo_city$county_lower == city_lower & geo_city$STATE_ALPHA == st)])
  fips4 <- unique(geoCode$FIPS[which(geoCode$State == st & geoCode$city_lower==city_lower)])
  fips5 <- unique(geoCode$FIPS[which(geoCode$State == st & geoCode$county_lower==city_lower)])
  new_fips <- names(sort(table(c(fips1, fips2, fips3, fips4, fips5)), decreasing=TRUE)[1])
  
  if(length(new_fips) == 1 & !is.null(new_fips)){
  if(!is.na(new_fips) & new_fips != "NULL"){df_a$FIPS[[i]] <- new_fips}}
  }}}}
  return(df_a)
}

#PART IV. SEARCH USING GOOGLE MAPS
#After running scan, search for remaining missing cases using google
search4FIPS <- function(df_b, addressVar_b=addressVar, countyVar_b=countyVar){
  df_names <- names(df_b)
  df_b[,addressVar_b] <- apply(df_b[,addressVar_b], 1, function(x) trimws(as.character(x))) %>% tibble::as_data_frame()
  
  index <- which((df_b$FIPS == "NULL" | is.na(df_b$FIPS)) & !is.na(df_b[,addressVar_b]) & df_b[,addressVar_b] != "")
  fips.na <- df_b[index,]
  address_vector <- fips.na[,addressVar_b] %>% unlist() 

  fips_geo <- dplyr::bind_cols(ggmap::geocode(address_vector, source="google", output="more"),fips.na)
  #a_n <- which(names(fips_geo)=="address")[1]
  #names(fips_geo)[a_n] <- "address_google"  #deleting duplicate variable name, causing errors
  df_b[index,countyVar_b] <- as.character(fips_geo$administrative_area_level_2)
  return(df_b)
}

missing_fips <- length(df$FIPS[which(is.na(df$FIPS))])
message(paste0("Initial Missing FIPS:  ", missing_fips))
message("Step 1.  Scanning for FIPS using existing data sets")
dfa <- scan4fips(df)
missing_fips <- length(dfa$FIPS[which(is.na(dfa$FIPS))])
message(paste0("Missing FIPS after Step 1:  ", missing_fips))
message("Step 2.  Scanning for FIPS using Google Maps")
dfb <- search4FIPS(dfa)
dfc <- scan4fips(dfb)
missing_fips <- length(dfc$FIPS[which(is.na(dfc$FIPS))])
message(paste0("Missing FIPS after Step 2:  ", missing_fips))


return(dfc)

}


```


## C.  Load Washington Post Data 

```{r, message=FALSE}

wapo <- read_csv("https://cdn.rawgit.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv") %>% 
  mutate(address = paste(city, state, "USA", sep=", ")) %>% 
  dplyr::rename(Weapon=armed, Race=race, mental_illness=signs_of_mental_illness) %>% 
  dplyr::mutate(
    year=as.numeric(format(date,"%Y")),
    age_f= cut(age, breaks=c(0, 14, 19, 39, 200), 
                    right= FALSE, labels=c("AgeUnder15", "Age15_19", "Age20_39", "Age40Above")), 
         
    Race=replace(Race, which(Race=="W"), "White"),
         Race=replace(Race, which(Race=="B"), "Black"),
         Race=replace(Race, which(Race=="H"), "Hispanic"),
         Race=replace(Race, which(Race=="A"), "Asian"),
         Race=replace(Race, which(Race=="N"), "Native American"),
          Race=replace(Race, which(Race=="O"), "Other"),
         gender=replace(gender, which(gender=="M"), "Male"),
         gender=replace(gender, which(gender=="F"), "Female"),
         State =  getState.abb(state),     
         mental_illness=replace(mental_illness, 
                                which(mental_illness=="True"), 
                                "Mentally Ill"),
         mental_illness=replace(mental_illness, 
                                which(mental_illness=="False"), 
                                "Not Mentally Ill/Unknown"),
         body_camera=replace(body_camera, 
                             which(body_camera=="False"), "No BodyCam"),
         body_camera=replace(body_camera, 
                             which(body_camera=="True"), "BodyCam"),
    dataSource="Wapo") %>% 
  dplyr::mutate(
    date=as.character(date)
  )
  

```


## D.  Load Lott & Moody (2016) 
```{r}
#setwd("createData")
lott <- haven::read_stata("createData/police.shoot.dta") %>%  
  mutate_all( funs(replace(., is.nan(.), NA))) %>% 
  dplyr::mutate(
    OfficerRace = NA,
    OfficerRace = replace(OfficerRace, which(pw==1), "White"),
    OfficerRace = replace(OfficerRace, which(ph==1), "Hispanic"),
    OfficerRace = replace(OfficerRace, which(pb==1), "Black"),
    OfficerRace = replace(OfficerRace, which(pu==1), "Unknown"),
    OfficerRace = replace(OfficerRace, which(po==1), "Other"),
    Officer2Race = NA,
    Officer2Race = replace(Officer2Race, which(officer2race==1), "White"),
    Officer2Race = replace(Officer2Race, which(officer2race==3), "Hispanic"),
    Officer2Race = replace(Officer2Race, which(officer2race==2), "Black"),
    Officer2Race = replace(Officer2Race, which(officer2race==4), "Asian/Pacific Islander"),
    Officer2Race = replace(Officer2Race, which(officer2race==5), "Native American"),
    Officer2Race = replace(Officer2Race, which(officer2race==6), "Other"),
    OfficerSex = NA,
    OfficerSex = replace(OfficerSex, which(officer1gender==1), "Male"),
    OfficerSex = replace(OfficerSex, which(officer1gender==0), "Female"),
    Officer2Sex = NA,
    Officer2Sex = replace(Officer2Sex, which(officer2gender==1), "Male"),
    Officer2Sex = replace(Officer2Sex, which(officer2gender==0), "Female"),
    Race = NA,
    Race = replace(Race, which(offender1race == 1), "White"),
    Race = replace(Race, which(offender1race == 2), "Black"),
    Race = replace(Race, which(offender1race == 3), "Hispanic"),
    Race = replace(Race, which(offender1race == 4), "Asian/Pacific Islander"),
    Race = replace(Race, which(offender1race == 5), "Native American"),
    Race = replace(Race, which(offender1race == 6), "Other")
  ) %>% 
  dplyr::select(-contains("dum")) %>%  ##deleting dummy variables
  dplyr::select(-state, -stnumber, -cityid, -citystate, -sw, -sb, -sh, 
                -pf, -pw, -ph, -pb, -pu, -po, -offender1race, 
                -officer1gender, -officer2gender, -officer2race) %>% 
dplyr::mutate(
  suicidal = replace(suicidal, which(suicidal == 0), "False"),
  suicidal = replace(suicidal, which(suicidal == 1), "True"),
  drugrelated = replace(drugrelated, which(drugrelated == 0), "False"),
  drugrelated = replace(drugrelated, which(drugrelated == 1), "True"),
  offender2 = replace(offender2, which(offender2 == 0), "False"),
  offender2 = replace(offender2, which(offender2 == 1), "True"),
  # polkilled = replace(polkilled, which(polkilled==0), "False"),
  # polkilled = replace(polkilled, which(polkilled==1), "True"),
  firearm = replace(firearm, which(firearm==0), "False"),
  firearm = replace(firearm, which(firearm==1), "True"),
  knife = replace(knife, which(knife==0), "False"),
  knife=replace(knife, which(knife==1), "True"),
  vehicle = replace(vehicle, which(vehicle==0), "False"),
  vehicle=replace(vehicle, which(vehicle==1), "True"),
  otherweapon=replace(otherweapon, which(otherweapon==0), "False"),
  otherweapon=replace(otherweapon, which(otherweapon==1), "True"),
  armed = replace(armed, which(armed==0), "False"),
  armed=replace(armed, which(armed==1), "True"),
  ipcr = replace(ipcr, which(ipcr==0), "False"),
  ipcr = replace(ipcr, which(ipcr==1), "True"),
    ivcr = replace(ivcr, which(ivcr==0), "False"),
    ivcr = replace(ivcr, which(ivcr==1), "True"),
    bodycam = replace(bodycam, which(bodycam==0), "False"),
  bodycam = replace(bodycam, which(bodycam==1), "True"),
    guncam = replace(guncam, which(guncam==0), "False"),
  guncam = replace(guncam, which(guncam==1), "True"),
    gunshots = replace(gunshots, which(gunshots==0), "False"),
  gunshots = replace(gunshots, which(gunshots==1), "True"),
    sameofficers = replace(sameofficers, which(sameofficers==0), "False"),
  sameofficers = replace(sameofficers, which(sameofficers==1), "True"),
    helicopters = replace(helicopters, which(helicopters==0), "False"),
  helicopters = replace(helicopters, which(helicopters==1), "True"),
    union = replace(union, which(union==0), "False"),
  union = replace(union, which(union==1), "True"),
    college = replace(college, which(college==0), "False"),
  college = replace(college, which(college==1), "True"),
    community = replace(community, which(community==0), "False"),
  community = replace(community, which(community==1), "True"),
  age_f= cut(sa, breaks=c(0, 14, 19, 39, 200), 
                    right= FALSE, labels=c("AgeUnder15", "Age15_19", "Age20_39", "Age40Above")),
  State =  getState.abb(stnm), 
  #state_lower = stateFromLower(stnm),
  dataSource="lott",
  address=paste(city, stnm, "USA", sep=", ")
) %>% 
  dplyr::rename(Age = sa, date=Date)

for(i in 1:ncol(lott)) {
  if(Hmisc::label(lott)[i]!=""){
    
    lab <- Hmisc::label(lott)[i]
    if(lab == "suspect's age"){lab <- "Age"}
    if(lab == "Date"){lab <- "date"} #keep consistent with other df's
    if(lab == "suspect arm**ed, percent"){lab <- "Armed"}
    lab <- gsub(", ", "", lab)
    lab <- gsub("percent ", "", lab)
    lab <- gsub("percent", "", lab)
    if(lab == "whether offender2 exists"){lab <- "Offender 2"}
    lab <- gsub("suspect ", "", lab)
    names(lott)[i] <- lab
  }
}
```  


##E.  Combine Data
```{r, warning=FALSE}

lott_nested <- 
 lott %>%
  tidyr::nest(-dataSource, -address, -State, -city, -year)

wapo_nested <- 
  wapo %>%
  tidyr::nest(-dataSource, -address, -State, -city, -year) 

killData <- dplyr::bind_rows(lott_nested, wapo_nested) %>% 
   dplyr::filter(State %in% c(state.abb, "DC"), year < 2017)
#excluding a case in Virgin Islands (lott), and onec case in puerto rico (lott)

```



##F.  Load Geo-Coordinates (FIPS) data 

Two files are used to identify the counties in which police shootings occurred:  
1.  Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158)  
https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158
2.  "Populated Places" file "POP_PLACES_20170601.txt", a subset of the larger file "NationalFile_20170601.txt" from USGS - Domestic and Antarctic Names - State and Topical Gazetteer Download Files
Source:  https://geonames.usgs.gov/domestic/download_data.htm

The remaining cases without a FIPS code are identified using the *geocode()* function from the ggmaps package.


```{r}
kd <- getFIPS(df=killData)
kd %>% dplyr::filter(is.na(FIPS)) %>% tibble::as_tibble()

```

There are four remaining cases without a FIPS code.  After examining these four cases, I made the following manual changes:

1. "Florence, AK, USA" - Lott - 2013 - is identified as Florence, Alabama, Lauderdale County, FIPS `01077

2. "Opp, FL, USA" - Lott - 2015 - is identified as Opp, Alabama, Covington County, FIPS  `01039

3. "Boise National Forest, ID, USA" - Lott - 2013 - is in/near Lowman, ID, Boise County, FIPS `16015

4.  "Cockeysville, TX, USA" could not be verified and is dropped.

In addition, the following changes are made:

1. "Prairie Grove, AZ, USA" - Lott - 2013 - is Prairie Grove, Arkansas, Washington County, FIPS `05143

2. "Alabama, GA, USA" - Lott - 2015 - DROPPED

3. "National Park, AZ, USA" - Lott - 2013 - DROPPED
 


```{r}
drop_addresses <- c("Alabama, GA, USA", "National Park, AZ, USA", "Cockeysville, TX, USA")
nrow(kd)
kd <- subset(kd, !(address %in% drop_addresses))
nrow(kd)
kd$FIPS[which(kd$address == "Prairie Grove, AZ, USA")] <- "`05143"
kd$State[which(kd$address == "Prairie Grove, AZ, USA")] <- "AR"
kd$FIPS[which(kd$address == "Florence, AK, USA")] <- "`01077"
kd$State[which(kd$address == "Florence, AK, USA")] <- "AL"
kd$FIPS[which(kd$address == "Opp, FL, USA")] <- "`01039"
kd$State[which(kd$address == "Opp, FL, USA")] <- "AL"
kd$FIPS[which(kd$address == "Boise National Forest, ID, USA")] <- "`16015"

```

###NYC Corrrection
(see section below in UCR crime data - using Manhattan FIPS for all NYC FIPS)
```{r}
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

kd <- kd %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
  dplyr::select(-county)

```



## G.   Save Data.
```{r}

wapo <- kd %>% dplyr::filter(dataSource=="Wapo") %>% 
  tidyr::unnest()
fst::write.fst(wapo, path="df_wapo_fst.csv")
write.csv(wapo, file="df_wapo.csv")

lott <- kd %>% dplyr::filter(dataSource=="lott") %>% 
  tidyr::unnest()
fst::write.fst(lott, path="df_lott_fst.csv")
write.csv(lott, file="df_lott.csv")



```



# II. POPULATION CENSUS DATA ---------------------------------


##1.  COUNTY-LEVEL Population Data
CC-EST2015-ALLDATA-[ST-FIPS]: Annual County Resident Population Estimates by Age, Sex, Race, and Hispanic Origin:
April 1, 2010 to July 1, 2015 File: 7/1/2015 County Characteristics Resident Population Estimates
Source: U.S. Census Bureau, Population Division Release Date: June 2016

###Load Data from Census Website
```{r}
library(tidyr)
library(readr)
library(dplyr)

# I.  COUNTY-LEVEL POPULATION CENSUS DATA ---------------------------------
##Using dplyr/tidy-verse code
county_census <- read_csv(file="https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/counties/asrh/cc-est2015-alldata.csv")
                          
##https://www.census.gov/popest/data/counties/asrh/2015/files/CC-EST2015-ALLDATA.csv

##Race-Total, gender-Both, age-All

countyPop <- county_census %>% 
  dplyr::filter(YEAR %in% c(4,5,6,7,8)) %>% 
  dplyr::mutate(YEAR=replace(YEAR, which(YEAR==4), 2011),
         YEAR=replace(YEAR, which(YEAR==5), 2012),
         YEAR=replace(YEAR, which(YEAR==6), 2013),
         YEAR=replace(YEAR, which(YEAR==7), 2014),
         YEAR=replace(YEAR, which(YEAR==8), 2015),
         FIPS=paste("`", STATE, COUNTY, sep=""),
         
         
         age=NA,
         age=replace(age, which(AGEGRP==0), "All"),
         age=replace(age, which(AGEGRP %in% c(1,2,3)), "Under15"),
         age=replace(age, which(AGEGRP == 4), "15_19"),
         age=replace(age, which(AGEGRP == 5), "20_24"),
         age=replace(age, which(AGEGRP == 6), "25_29"),
         age=replace(age, which(AGEGRP == 7), "30_34"),
         age=replace(age, which(AGEGRP == 8), "35_39"),
         age=replace(age, which(AGEGRP == 9), "40_44"),
         age=replace(age, which(AGEGRP == 10), "45_49"),
         age=replace(age, which(AGEGRP == 11), "50_54"),
         age=replace(age, which(AGEGRP == 12), "55_59"),
         age=replace(age, which(AGEGRP > 12), "60Above")
  ) %>% 

  transmute(year=YEAR, 
            state=STNAME,
            county=CTYNAME,
            FIPS=FIPS, age=age,

          Total_Male=TOT_MALE, 
          Total_Female=TOT_FEMALE, 
          Total_Both = Total_Male+Total_Female,
          
          Black_Male=BA_MALE, Black_Female=BA_FEMALE,
          Black_Both=Black_Male+Black_Female,
          
          Amerindian_Male=IA_MALE, Amerindian_Female=IA_FEMALE,
          Amerindian_Both=Amerindian_Male+Amerindian_Female,
          
          Asian_Male=AA_MALE, Asian_Female=AA_FEMALE,
          Asian_Both=Asian_Male+Asian_Female,
          
          PacificIslander_Male=NA_MALE, PacificIslander_Female=NA_FEMALE,
          PacificIslander_Both=PacificIslander_Male+PacificIslander_Female,
          
          Hispanic_Male=HWA_MALE, #Hispanic, White alone male population 
          Hispanic_Female=HWA_FEMALE, #Hispanic, White alone female population 
          Hispanic_Both=Hispanic_Male+Hispanic_Female,
          
          White_Male=NHWA_MALE,  #Not Hispanic, White alone male population
          White_Female=NHWA_FEMALE, #Not Hispanic, White alone female population
          White_Both=White_Male+White_Female
    ) 

##Separate, then re-unite in long format
white_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, White_Male, White_Female, White_Both) %>% 
  dplyr::rename(Male=White_Male, Female=White_Female, Both=White_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  mutate(Race="White") %>% 
  group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

Total_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Total_Male, Total_Female, Total_Both) %>% 
  dplyr::rename(Male=Total_Male, Female=Total_Female, Both=Total_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Total") %>% 
  group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

Black_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Black_Male, Black_Female, Black_Both) %>% 
  dplyr::rename(Male=Black_Male, Female=Black_Female, Both=Black_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Black") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

Asian_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Asian_Male, Asian_Female, Asian_Both) %>% 
  dplyr::rename(Male=Asian_Male, Female=Asian_Female, Both=Asian_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Asian") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

Amerindian_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Amerindian_Male, Amerindian_Female, Amerindian_Both) %>% 
  dplyr::rename(Male=Amerindian_Male, Female=Amerindian_Female, Both=Amerindian_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Native American") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

Hispanic_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, Hispanic_Male, Hispanic_Female, Hispanic_Both) %>% 
  dplyr::rename(Male=Hispanic_Male, Female=Hispanic_Female, Both=Hispanic_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Hispanic") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

PacificIslander_df <- countyPop %>% 
  dplyr::select(year, state, county, FIPS, age, PacificIslander_Male, PacificIslander_Female, PacificIslander_Both) %>% 
  dplyr::rename(Male=PacificIslander_Male, Female=PacificIslander_Female, Both=PacificIslander_Both) %>% 
  tidyr::gather(gender, population, 6:8) %>% 
  dplyr::mutate(Race="Pacific Islander") %>% 
  dplyr::group_by(year, state, county, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=(sum(as.numeric(population), na.rm = TRUE)))

##Recombine - can delete total,both, and all values
countyPop2 <- bind_rows(Total_df , white_df, Black_df, Hispanic_df,
                        Asian_df, Amerindian_df, PacificIslander_df
                        )

census_county <- countyPop2
rm(countyPop2)
rm(countyPop)
#fst::write.fst(countyPop2, path="census_county.csv")
#write.csv(countyPop2, file="census_county.csv")

```


###NYC Correction
```{r}
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

census_county <- census_county %>% ungroup() %>% 
  dplyr::mutate(FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")) %>% 
  dplyr::group_by(year, FIPS, Race, gender, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T),
                   state=first(state),
                   county=first(county))
```


### County Populations
All population variables are for year 2015, except variables appended with "5yr", which are averaged across 2011 to 2015.  Variables appended with "change" are the percent change from 2011 to 2015.

```{r}


census_vars <- census_county %>% 
  dplyr::group_by(year, FIPS) %>% 
  dplyr::mutate(
    pop.c.total = population[which(Race=="Total"  & gender=="Both" & age=="All")],
    pop.c.white_nh = population[which(Race=="White"  & gender=="Both" & age=="All")],
    pop.c.black = population[which(Race == "Black" & gender=="Both" & age=="All")],
    pop.c.asian = population[which(Race=="Asian" & gender=="Both" & age=="All")],
    #Create Asian/PAcific Islander Category for some datasets that conflate the two
    pop.c.AP = mean(population[which(gender=="Both" & age=="All" & (Race == "Asian" | Race == "Pacific Islander"))], na.rm=T),
    ##Create White/Hispanic category to match UCR
    pop.c.WH = mean(population[which(gender=="Both" & age=="All" & (Race == "White" | Race == "Hispanic"))], na.rm=T),
    pop.c.amerindian = population[which(Race=="Native American"  & gender=="Both" & age=="All")],
    pop.c.hispanic = population[which(Race=="Hispanic"  & gender=="Both" & age=="All")],
    pop.male.15_29 = sum(population[which(Race=="Total" & gender=="Male" & age %in% c("15_19","20_24","25_29"))], na.rm=T),
    pop.black.male15_29 = sum(population[which(Race=="Black"  & gender=="Male" & age %in% c("15_19","20_24","25_29"))])
  ) %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::mutate(
    ##Averaged aross 2011-2015    
    pop.c.total_5yr = mean(pop.c.total, na.rm=T),
         pop.c.white_nh_5yr = mean(pop.c.white_nh, na.rm=T),
          pop.c.WH_5yr = mean(pop.c.WH, na.rm=T),
         pop.c.black_5yr = mean(pop.c.black, na.rm=T),
         pop.c.asian_5yr = mean(pop.c.asian, na.rm=T),
         pop.c.hispanic_5yr = mean(pop.c.hispanic, na.rm=T),
         pop.c.amerindian_5yr = mean(pop.c.amerindian, na.rm=T),
         pop.male.15_29_5yr = mean(pop.male.15_29, na.rm=T),
  
    ##Change from 2011 - 2015 
  pop.c.total_change = mean((pop.c.total[which(year==2015)]-pop.c.total[which(year==2011)])/pop.c.total[which(year==2011)], 
                            na.rm=T),
  pop.c.white_nh_change = mean((pop.c.white_nh[which(year==2015)]-pop.c.white_nh[which(year==2011)])/pop.c.white_nh[which(year==2011)], 
                               na.rm=T),
  
  
  pop.c.black_change = mean((pop.c.black[which(year==2015)]-pop.c.black[which(year==2011)])/pop.c.black[which(year==2011)], na.rm=T),
  
  ##A few counties had zero blacks, which resulted in infinite values (division by zero), therefore adding + 1
  pop.c.black_change = ifelse(is.infinite(pop.c.black_change),  
        #If infinite, change                    
        mean(((pop.c.black[which(year==2015)]+1)-(pop.c.black[which(year==2011)]+1))/(pop.c.black[which(year==2011)]+1), na.rm=T),  
        ##If not infinite, keep the same
        pop.c.black_change)
  
  ) %>% 
  dplyr::filter(year==2015, Race=="Total", age=="All", gender=="Both") %>% 
  dplyr::select(-Race, -gender, -age, -year)
  




```


##2. STATE-LEVEL Population Data
State Characteristics Datasets: Annual State Resident Population Estimates for 5 Race Groups (5 Race Alone or in Combination Groups) by Age, Sex, and Hispanic Origin: April 1, 2010 to July 1, 2015
https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/state/asrh/sc-est2015-alldata5.csv

###Load Data from Census website

```{r, eval=FALSE}
census_state <- read_csv("https://www2.census.gov/programs-surveys/popest/datasets/2010-2015/state/asrh/sc-est2015-alldata5.csv")
  
##formerly, read_csv("https://www.census.gov/popest/data/state/asrh/2015/files/SC-EST2015-ALLDATA5.csv")

census_state2 <- census_state %>% 
  transmute(State=NAME, gender=SEX, Hisp=ORIGIN, Race=RACE, AGE=AGE,
            y2011=POPESTIMATE2011, y2012=POPESTIMATE2012,
            y2013=POPESTIMATE2013, y2014=POPESTIMATE2014,
            y2015=POPESTIMATE2015) %>% 
  dplyr::mutate(
    gender=replace(gender, which(gender==0), "Both"),
    gender=replace(gender, which(gender==1), "Male"),
    gender=replace(gender, which(gender==2), "Female"),
        age=NA,
         age=replace(age, which(AGE < 15), "Under15"),
         age=replace(age, which(AGE>=15 & AGE<=19), "15_19"),
         age=replace(age, which(AGE>=20 & AGE<=24), "20_24"),
         age=replace(age, which(AGE>=25 & AGE<=29), "25_29"),
         age=replace(age, which(AGE>=30 & AGE<=34), "30_34"),
         age=replace(age, which(AGE>=35 & AGE<=39), "35_39"),
         age=replace(age, which(AGE>=40 & AGE<=44), "40_44"),
         age=replace(age, which(AGE>=45 & AGE<=49), "45_49"),
         age=replace(age, which(AGE>=50 & AGE<=54), "50_54"),
         age=replace(age, which(AGE>=55 & AGE<=59), "55_59"),
         age=replace(age, which(AGE>=60), "60Above"),
    ##Need to add a row for all ages, or not
        #age=replace(age, which(age==0), "All"),
    
    Race=replace(Race, which(Race==1 & Hisp==1), "White"),
    Race=replace(Race, which(Race==1 & Hisp==2), "Hispanic"),
    Race=replace(Race, which(Race==2 & Hisp==0), "Black"),
    Race=replace(Race, which(Race==3 & Hisp==0), "Native American"),
    Race=replace(Race, which(Race==4 & Hisp==0), "Asian"),
    Race=replace(Race, which(Race==5 & Hisp==0), "Pacific Islander")
  ) %>% 
  dplyr::select(-c(Hisp, AGE)) %>% 
  dplyr::filter(Race %in% c("White", "Hispanic", "Black","Asian",
                     "Native American", "Pacific Islander")) %>% 
  dplyr::group_by(State, Race, gender, age) %>% 
  dplyr::summarise(y2011=sum(y2011, na.rm=T),
            y2012=sum(y2012, na.rm=T),
            y2013=sum(y2013, na.rm=T),
            y2014=sum(y2014, na.rm=T),
            y2015=sum(y2015, na.rm=T)
            
            ) %>% 
  ###File size prior to tidying:  631 KB
  ###File size after gathering years into one column:  2544 KB.   
  tidyr::gather(year, population, 5:9) %>% 
  dplyr::mutate(
    year=replace(year, which(year=="y2011"), 2011),
    year=replace(year, which(year=="y2012"), 2012),
    year=replace(year, which(year=="y2013"), 2013),
    year=replace(year, which(year=="y2014"), 2014),
    year=replace(year, which(year=="y2015"), 2015)
  )

##Adding a summary categories:
##State = United States
##gender = 'Both' (already created)
##age = 'All'
##Race = 'Total'

##age -'All'
age_df <- census_state2 %>% 
  dplyr::group_by(year, State, Race, gender) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(age="All")

census_state2 <- dplyr::bind_rows(census_state2, age_df)

##Race - Total
race_df <- census_state2 %>% 
  dplyr::group_by(year, State, gender, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(Race="Total")

census_state2 <- dplyr::bind_rows(census_state2, race_df)

##State=US

us_df <- census_state2 %>% 
  dplyr::group_by(year, gender, Race, age) %>% 
  dplyr::summarise(population=sum(population, na.rm=T)) %>% 
  dplyr::mutate(State="United States")

census_state <- dplyr::bind_rows(census_state2, us_df)
##file size after adding rows:  3366 KB
##write.csv(census_state, file="census_state.csv")
```

### State Populations
All population variables are for year 2015, except variables appended with "5yr", which are averaged across 2011 to 2015.  Variables appended with "change" are the percent change from 2011 to 2015.

```{r, eval=FALSE}

census_state <- census_state %>% 
   dplyr::group_by(year, State) %>% 
  dplyr::mutate(
    pop.total = population[which(Race=="Total"  & gender=="Both" & age=="All")],
    pop.white_nh = population[which(Race=="White"  & gender=="Both" & age=="All")],
    pop.black = population[which(Race == "Black" & gender=="Both" & age=="All")],
    pop.asian = population[which(Race=="Asian" & gender=="Both" & age=="All")],
    #Create Asian/PAcific Islander Category for some datasets that conflate the two
    pop.AP = mean(population[which(gender=="Both" & age=="All" & (Race == "Asian" | Race == "Pacific Islander"))], na.rm=T),
    ##Create White/Hispanic category to match UCR
    pop.WH = mean(population[which(gender=="Both" & age=="All" & (Race == "White" | Race == "Hispanic"))], na.rm=T),
    pop.amerindian = population[which(Race=="Native American"  & gender=="Both" & age=="All")],
    pop.hispanic = population[which(Race=="Hispanic"  & gender=="Both" & age=="All")],
    pop.male.15_29 = sum(population[which(Race=="Total" & gender=="Male" & age %in% c("15_19","20_24","25_29"))], na.rm=T),
    pop.black.male15_29 = sum(population[which(Race=="Black"  & gender=="Male" & age %in% c("15_19","20_24","25_29"))])
  ) %>% 
  dplyr::group_by(State) %>% 
  dplyr::mutate(
    ##Averaged aross 2011-2015    
    pop.total_5yr = mean(pop.total, na.rm=T),
         pop.white_nh_5yr = mean(pop.white_nh, na.rm=T),
          pop.WH_5yr = mean(pop.WH, na.rm=T),
         pop.black_5yr = mean(pop.black, na.rm=T),
         pop.asian_5yr = mean(pop.asian, na.rm=T),
         pop.hispanic_5yr = mean(pop.hispanic, na.rm=T),
         pop.amerindian_5yr = mean(pop.amerindian, na.rm=T),
         pop.male.15_29_5yr = mean(pop.male.15_29, na.rm=T),
  
    ##Change from 2011 - 2015 
  pop.total_change = mean((pop.total[which(year==2015)]-pop.total[which(year==2011)])/pop.total[which(year==2011)], 
                            na.rm=T),
  pop.white_nh_change = mean((pop.white_nh[which(year==2015)]-pop.white_nh[which(year==2011)])/pop.white_nh[which(year==2011)], 
                               na.rm=T),
  
  
  pop.black_change = mean((pop.black[which(year==2015)]-pop.black[which(year==2011)])/pop.black[which(year==2011)], na.rm=T),
  
  ##A few counties had zero blacks, which resulted in infinite values (division by zero), therefore adding + 1
  pop.black_change = ifelse(is.infinite(pop.black_change),  
        #If infinite, change                    
        mean(((pop.black[which(year==2015)]+1)-(pop.black[which(year==2011)]+1))/(pop.black[which(year==2011)]+1), na.rm=T),  
        ##If not infinite, keep the same
        pop.black_change)
  
  ) %>% 
  dplyr::filter(year==2015, Race=="Total", age=="All", gender=="Both") %>% 
  dplyr::select(-Race, -gender, -age, -year)
  


```





# III. ARREST DATA (Panel 3) ---------------------------------

##1.  Load Data
###2015 Data

Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, 2015 (ICPSR 36794)
https://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36794
ICPSR_36794

Note that the offense codes changed in the 2015 edition.  For example, Murder is now "01A" instead of "011".  
File `36794-0001-Data.tsv`
I've converted this to 'fst' format for faster loading.  

```{r}
# ucr_15 <- readr::read_tsv("createData/36794-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_15, path="createData/ucr_15.csv")

ucr_15 <- fst::read.fst(path="createData/ucr_15.csv")
  
```


  


###2014 Data
Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2014 (ICPSR 36400).
http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36400

File `36400-0001-Data.tsv`

```{r}
##File = 36400-0001-Data.tsv
# ucr_14 <- read_tsv("createData/36400-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_14, path="createData/ucr_14.csv")

ucr_14 <- fst::read.fst(path="createData/ucr_14.csv")

```

###2013 Data
Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2013 (ICPSR 36116)
http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/36116
File `36116-0001-Data.tsv`

```{r}
##File = 36116-0001-Data.tsv
# ucr_13 <- read_tsv("createData/36116-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_13, path="createData/ucr_13.csv")

ucr_13 <- fst::read.fst(path="createData/ucr_13.csv")

```


###2012 Data
Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2012 (ICPSR 35018)
http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/35018

File `35018-0001-Data.tsv`

```{r}
##File = 35018-0001-Data.tsv
# ucr_12 <- read_tsv("createData/35018-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_12, path="createData/ucr_12.csv")

ucr_12 <- fst::read.fst(path="createData/ucr_12.csv")

```

###2011 Data
Uniform Crime Reporting Program Data: Arrests by Age, Sex, and Race, Summarized Yearly, 2011 (ICPSR 34581)
http://www.icpsr.umich.edu/icpsrweb/NACJD/studies/34581
File `34581-0001-Data.tsv`

```{r}
## File = 34581-0001-Data.tsv
# ucr_11 <- read_tsv("createData/34581-0001-Data.tsv", na = c("", "NA", 999998, 99998, 99999, 998))
# fst::write.fst(ucr_11, path="createData/ucr_11.csv")

ucr_11 <- fst::read.fst(path="createData/ucr_11.csv")

```

##2. Combine and Filter
Note that violent crime is composed of four offenses: murder and nonnegligent manslaughter, forcible rape, robbery, and aggravated assault.


```{r}
offenses_2011_2014 <- c("011", "020", "030", "040", "050", 
              "060", "070", "130", "140", "180", 
              "185", "210")

keepVars <- c("YEAR","STATE","STNAME","COUNTY", "ORI", "AGENCY","OFFENSE","M0_9", "M10_12", "M13_14","M15", "M16","M17","M18","M19",
              "M20","M21","M22","M23","M24","M25_29", "M30_34", "M35_39", "M40_44", "M45_49", "M50_54","M55_59","M60_64","M65","F0_9",
              "F10_12","F13_14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25_29","F30_34","F35_39","F40_44",
              "F45_49",  "F50_54",  "F55_59","F60_64","F65","JW", "JB","JI","JA","AW","AB", "AI","AA")

ucr_15 <- ucr_15[,keepVars] 
 ucr_15 <- ucr_15 %>%  dplyr::mutate(crime=NA,
                crime=replace(crime, which(OFFENSE == "01A"), "murder"),
                ##Murder and non-negligent manslaughter
                crime=replace(crime, which(OFFENSE == "02"), "rape"),
                ##Forcible rape
                crime=replace(crime, which(OFFENSE == "03"), "robbery"),
                ##robbery
                crime=replace(crime, which(OFFENSE == "04"), "assault" ),
                ##Aggravated assault
                crime=replace(crime, which(OFFENSE == "05"), "burglary" ),
                ##Burglary-breaking or entering
                crime=replace(crime, 
                              which(OFFENSE %in% c("06", "07", "13")), 
                              "theft" ),
                ## 060 = Larceny-theft (not motor vehicle)
                ## 070 = Motor vehicle theft
                ## 130 = Stolen property-buy, receive, poss.
                crime=replace(crime, which(OFFENSE == "140"), "vandalism" ),
                ##Vandalism
                crime=replace(crime, which(OFFENSE =="180"), "drug_sale"),
                ##Sale/manufacture (subtotal) - drugs
                crime=replace(crime, which(OFFENSE =="185"), "drug_poss"),
                ##Possession (subtotal)
                crime=replace(crime, which(OFFENSE =="21"), "dui")
                ##Driving under the influence
  ) %>% ##eliminating data for other crimes
  dplyr::filter(!is.na(crime))

ucr <- dplyr::bind_rows(ucr_11, ucr_12, ucr_13, ucr_14) 
ucr <- ucr[,keepVars] 

  ##dplyr::filter(OFFENSE %in% offenses_2011_2014) %>% 
  ##dplyr::select(c(7, 3, 18, 13, 4, 17, 22:70, 73:76)) %>% 
ucr <- ucr %>% dplyr::mutate(crime=NA,
         crime=replace(crime, which(OFFENSE == "011"), "murder"),
         ##Murder and non-negligent manslaughter
         crime=replace(crime, which(OFFENSE == "020"), "rape"),
         ##Forcible rape
         crime=replace(crime, which(OFFENSE == "030"), "robbery"),
         ##robbery
         crime=replace(crime, which(OFFENSE == "040"), "assault" ),
         ##Aggravated assault
         crime=replace(crime, which(OFFENSE == "050"), "burglary" ),
         ##Burglary-breaking or entering
         crime=replace(crime, 
                       which(OFFENSE %in% c("060", "070", "130")), 
                       "theft" ),
         ## 060 = Larceny-theft (not motor vehicle)
         ## 070 = Motor vehicle theft
         ## 130 = Stolen property-buy, receive, poss.
         crime=replace(crime, which(OFFENSE == "140"), "vandalism" ),
         ##Vandalism
         crime=replace(crime, which(OFFENSE =="180"), "drug_sale"),
         ##Sale/manufacture (subtotal) - drugs
         crime=replace(crime, which(OFFENSE =="185"), "drug_poss"),
         ##Possession (subtotal)
         crime=replace(crime, which(OFFENSE =="210"), "dui")
         ##Driving under the influence
         ) %>% ##eliminating data for other crimes
  dplyr::filter(!is.na(crime)) %>% 
  dplyr::bind_rows(ucr_15)
```

###Finding data entry errors 
Delphi, Indiana in 2013 mistakenly reported crimes with values of 10000 appear in multiple columns and for multiple rows (i.e. offenses) for the same agency and which also exceed population size. In addition, 2 police stations report negative values for arrests:
  * FLOYD COUNTY POLICE DEPT, GA 2013 rape
  * LEE, GA 2015 assault

```{r}
ucr$totals <- NA
ucr$totals <- apply(ucr[,8:59], 1, sum, na.rm=T)

error_rows <- as.numeric()

for(i in c(8:59)){
  #otherc <- setdiff(8:59, i) ##other variables/columns
  #ucr$totals <- apply(ucr[,otherc], 1, sum, na.rm=T)
  ##if one arrest column exceeds the sum of all the others - impossible, therefore flag.
  er = which(ucr[,i] == 10000 | ucr[,i] == 10001 | ucr[,i] == 20000)
  
  if(length(er)>0){
    message(paste0("variable = ", names(ucr)[i]))
    message(paste(ucr$AGENCY[er], sep=" \n"))
    error_rows <- unique(c(error_rows, er))
  }
}
total_Zero <- which(ucr$totals < 0)
error_rows <- unique(c(error_rows, total_Zero))
ucr_error <- ucr[error_rows,]

```

###Deleting data entry errors
 Deleting these cases.

```{r}
ucr <- ucr[-which(ucr$AGENCY=="DELPHI" & ucr$STATE==13 & ucr$YEAR==2013 ),] ## removed 6 cases
ucr <- ucr[-which(ucr$AGENCY=="FLOYD COUNTY POLICE DEPT" & ucr$STATE==10 & ucr$YEAR==2013 & ucr$crime == "rape"),] # one case
ucr <- ucr[-which(ucr$AGENCY=="LEE" & ucr$STATE==10 & ucr$YEAR==2015 & ucr$crime == "assault"),] # one case
```


##3. Attach FIPS codes to UCR Data, aggregate by county

1.-- Some identical FIPS codes have separate county names - e.g. California, FIPS code 06065 = both 'Riverside' & 'Riverside County'.  Therefore grouping by FIPS rather than by county.
2.-- There is also an FIPS coding error for Philadelphia.  The ORI code `PAPEP00` is read as Indiana County, PA, FIPS = 42063, which should instead be 42101. 

###Load Geographic Codes - County FIPS 
Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158)
https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158

```{r}
#setwd("createData")
ucr_geo <- read_tsv("createData/35158-0001-Data.tsv") %>% 
  dplyr::select(ORI7, FIPS, COUNTYNAME, STATENAME ) %>% 
  dplyr::rename(ORI=ORI7, county=COUNTYNAME, State=STATENAME)

```

###Aggregate by County
 
```{r}
ucr2 <- left_join(ucr, ucr_geo)
ucr2 <- ucr2 %>% arrange(YEAR, State, county, FIPS, OFFENSE) 

##Fix Philly.
ucr2 <- ucr2 %>% 
  dplyr::mutate(
  FIPS=replace(FIPS, which(ORI == "PAPEP00"), 42101)
) %>% 
  dplyr::rename(year=YEAR) %>% ungroup() %>% 
  dplyr::select(-c(STATE, STNAME, COUNTY, county, AGENCY, ORI, OFFENSE)) %>% 
  dplyr::filter(!is.na(FIPS))

ucr2 <- ucr2 %>% 
  dplyr::group_by(year, State, FIPS, crime) %>%
  dplyr::summarise_each_(funs(sum(., na.rm = TRUE)),
                  names(ucr2)[c(2:53)])

```


##4. Arrests by Sex & Age by County
{Note:  I've decided to keep these two files separate.  When writing to .csv files, combined they are approx. 37KB.  When combining these data frames using dplyr::full_join, for instance, the total file size exceeds 1GB.}

```{r, eval=FALSE}
ucr_age_sex <- ucr2 %>% 
  dplyr::select(-c(AW, JW, AB, JB, AA, JA, AI, JI)) %>% 
  dplyr::group_by(year, State, FIPS, crime) %>% 
  ## Convention:  first letter = gender (M,F,B), 
  ##then age group, matching census age groups
  ## B = Both Genders
  ## All = all ages
  dplyr::mutate_all(as.numeric) %>% 
  dplyr::summarise(
    MAll=sum( c(M0_9, M10_12, M13_14, M15, M16, M17, M18, M19,     
                M20, M21,M22,M23,M24,M25_29,M30_34,M35_39,M40_44,
                M45_49,M50_54,M55_59,M60_64,M65), na.rm=T), 
    ##Census: AGEGRP=0
    FAll=sum( c( F0_9,  F10_12,  F13_14,  F15,  F16,  F17,  F18,  F19,     
                   F20,  F21, F22, F23, F24, F25_29, F30_34, F35_39, F40_44,
                   F45_49, F50_54, F55_59, F60_64, F65), na.rm=T), 
    ##Census: AGEGRP=0
    
    BAll=MAll+FAll,
    
    ##Males
    MUnder15=sum(c(M0_9, M10_12, M13_14), na.rm=T), ##Census: AGEGRP=1,2,3
    M15_19=sum(c( M15, M16, M17, M18, M19), na.rm=T), ##Census: AGEGRP=4
    M20_24=sum(c(M20, M21,M22,M23,M24), na.rm=T), ##Census: AGEGRP=5
    M25_29=M25_29, ##Census: AGEGRP=6
    M30_34=M30_34, ##Census: AGEGRP=7
    M35_39=M35_39, ##Census: AGEGRP=8
    M40_44=M40_44, ##Census: AGEGRP=9
    M45_49=M45_49, ##Census: AGEGRP=10
    M50_54=M50_54, ##Census: AGEGRP=11
    M55_59=M55_59, ##Census: AGEGRP=12
    M60Above=sum(c(M60_64,M65), na.rm=T), ##Census: AGEGRP=13-18
    
    FUnder15=sum(c(F0_9, F10_12, F13_14), na.rm=T), ##Census: AGEGRP=1,2,3
    F15_19=sum(c( F15, F16, F17, F18, F19), na.rm=T), ##Census: AGEGRP=4
    F20_24=sum(c(F20, F21,F22,F23,F24), na.rm=T), ##Census: AGEGRP=5
    F25_29=F25_29, ##Census: AGEGRP=6
    F30_34=F30_34, ##Census: AGEGRP=7
    F35_39=F35_39, ##Census: AGEGRP=8
    F40_44=F40_44, ##Census: AGEGRP=9
    F45_49=F45_49, ##Census: AGEGRP=10
    F50_54=F50_54, ##Census: AGEGRP=11
    F55_59=F55_59, ##Census: AGEGRP=12
    F60Above=sum(c(F60_64,F65), na.rm=T), ##Census: AGEGRP=13-18
    
    BUnder15=FUnder15+MUnder15,  ##Census: AGEGRP=1,2,3
    B15_19=F15_19+M15_19, ##Census: AGEGRP=4
    B20_24=F20_24+M20_24, ##Census: AGEGRP=5
    B25_29=F25_29+M25_29, ##Census: AGEGRP=6
    B30_34=F30_34+M30_34, ##Census: AGEGRP=7
    B35_39=F35_39+M35_39, ##Census: AGEGRP=8
    B40_44=F40_44+M40_44, ##Census: AGEGRP=9
    B45_49=F45_49+M45_49, ##Census: AGEGRP=10
    B50_54=F50_54+M50_54, ##Census: AGEGRP=11
    B55_59=F55_59+F55_59, ##Census: AGEGRP=12
    B60Above=M60Above+F60Above ##Census: AGEGRP=13-18
    ) %>% 
  tidyr::gather("sex_age", "freq", 5:40) %>% 
  tidyr::separate(sex_age, into=c("gender", "age"), sep=1) %>% 
  dplyr::mutate(gender=replace(gender, which(gender=="M"), "Male"),
         gender=replace(gender, which(gender=="F"), "Female"),
         gender=replace(gender, which(gender=="B"), "Both")
         ) %>% 
 tidyr::spread(crime, freq) %>%
 dplyr::mutate(
 #for Counties that reported to UCR but not specific crimes such as murder,
 #treating these as zeros for aggregating Violent Crime Rates
 murder=replace(murder, which(is.na(murder)), 0),
 rape=replace(rape, which(is.na(rape)), 0),
 robbery=replace(robbery, which(is.na(robbery)), 0),
 assault=replace(assault, which(is.na(assault)), 0),
 ViolentCrime = murder+rape+robbery+assault) %>%
  arrange(year, State, FIPS, gender, age)

```


##5. Arrests by Race by County
```{r}
##UCR Race
ucr_race <- ucr2 %>% 
  dplyr::group_by(year, State, FIPS, crime) %>% 
  dplyr::select(year, State, FIPS, 
        crime, JW, JB, JI, JA, AW, AB, AI, AA) %>% 
  dplyr::mutate(
    White= AW + JW,
    Black=AB + JB,
    Asian=AA + JA,
    Amerindian= AI + JI) %>% 
dplyr::select(-c(AW, JW, AB, JB, AA, JA, AI, JI)) %>% 
tidyr::gather("Race", "freq", 5:8) %>% 
tidyr::spread(crime, freq) %>%
dplyr::mutate(
   Race=replace(Race, which(Race=="Amerindian"), "Native American"),
   murder=replace(murder, which(is.na(murder)), 0),
   rape=replace(rape, which(is.na(rape)), 0),
   robbery=replace(robbery, which(is.na(robbery)), 0),
   assault=replace(assault, which(is.na(assault)), 0),
   ViolentCrime = murder+rape+robbery+assault) %>%
dplyr::group_by(year, State, FIPS, Race) %>% 
dplyr::mutate(
    drugs = sum(c(drug_poss, drug_sale), na.rm=T)
  ) %>% 
dplyr::select(-drug_sale, -drug_poss) %>% 
  dplyr::arrange(year, State, FIPS, Race)
```


##6. FLORIDA 
Florida arrest data are not included in the UCR.  Here I include Florida arrest data, taken from the excel file "Arrests by County, Age, Sex and Offense for Florida, 2004 - 2015" available on the Florida Department of Law Enforcement website: 
http://www.fdle.state.fl.us/cms/FSAC/Data-Statistics/UCR-Arrest-Data.aspx

Note that the Florida offense codes don't exactly match the UCR.  The UCR murder code is for "Murder and non-negligent manslaughter", whereas the Florida records are separate for murder and for manslaughter.  The UCR also distinguishes between "Manslaughter by negligence" and "Murder and non-negligent manslaughter", whereas Florida records only indicate one category for manslaughter.  

Florida also does not report separate categories for drug sales and drug possession arrests.  I therefore conflate the categories in the rest of the UCR.

```{r}
##Adding FIPS codes to Florida
#setwd("createData")
require(noncensus)
  data(counties)
  counties <- sapply(counties, as.character) %>% tibble::as_data_frame()
  counties$FIPS <- paste0("`", as.character(counties$state_fips), as.character(counties$county_fips))
  cnty_df <- counties[which(counties$state=="FL"),c("county_name", "state", "FIPS")]
  names(cnty_df) <- c("county", "state", "FIPS")
  cnty_df$county  <- tolower(trimws(gsub("[[:punct:]]", "", cnty_df$county)))
  cnty_df$county <- gsub(" county| parish| borough", "", cnty_df$county, fixed = FALSE)

florida <- readr::read_csv("createData/floridaArrest_2011_2015.csv") %>% 
  tidyr::gather(Race, freq, 6:9) %>% 
  tidyr::spread(crime, freq) %>% 
  dplyr::mutate(
    murder = murder_a + manslaughter,
    theft = buy_receive_possess_stolen_property + larceny + motor_vehicle_theft,
    ViolentCrime = murder+rape+robbery+assault,
    county = tolower(trimws(gsub("[[:punct:]]", "", county)))
  ) %>% 
  dplyr::select(-murder_a, -manslaughter, -buy_receive_possess_stolen_property, -larceny, -motor_vehicle_theft) %>% 
  dplyr::left_join(cnty_df)


unique(florida$county[which(is.na(florida$FIPS))])

```

###Fixing missing counties
Osecola county was not recognized. Entering it manually.

```{r}

florida <- florida %>% 
  dplyr::mutate(
    FIPS=replace(FIPS, which(county=="osecola"), "`12097")
  )

unique(florida$county[which(is.na(florida$FIPS))])

```

###Adding Florida to the UCR_Race dataset.
```{r}

uVars <- names(ucr_race)
florida <- florida[,which(names(florida) %in% uVars)]

ucr_race <- ucr_race %>% 
  dplyr::bind_rows(florida)

```


##7. NYC Arrest Data
The UCR Data only contains information for Manhattan, NY - FIPS code 36061.  Manhattan is also called 'New York County.'  It does not include data from:  Kings County (Brooklyn, 36047), Queens (36081), The Bronx (36005), Staten Island (36085)

The ORI codes for which arrest data are reported in the UCR for FIPS 36061 includes:
- NY03083 (NYC METRO TRNSPRTN AUTH)
- NY330SS (SP: NEW YORK COUNTY)
- NY03031 (ST PRK: NEW YORK CITY RE)

Further, the only ORI7 code listed in the UCR files as Group "1A", indicating a city larger than 1 Million, is NY03030, (in the CrossWalk identified as "NEW YORK CITY POLICE DEPARTMENT"), which contains only missing values, i.e. no arrests are reported.  Right now, ZERO murders are reported for NYC for example, from 2013-2015.  The Law Enforcement Management and Administrative Statistics (LEMAS) data also reports police officers for ORI7 district NY03030.  

To resolve these problems I aggregate NYC into one county across all datasets using county code 36061 (Manhattan or New York County).  Because arrest data are incomplete from the UCR, I use the New York City Police Department's "Crime and Enforcement Activity in New York City" report, available from:
http://www.nyc.gov/html/nypd/html/analysis_and_planning/crime_and_enforcement_activity.shtml

Specifically, I use the file "Enforcement Report Data Tables 2008-2015", 
http://www.nyc.gov/html/nypd/downloads/zip/analysis_and_planning/crime_and_enforcement_report_data_tables_2008-2015_2016-02.zip

Unlike the UCR Data, the NYC arrest data distinguishes Hispanics and Non-Hispanic Whites.  To make this data consistent with the UCR data, I collapse Hispanic and White into a single 'White' racial category.

```{r}
#setwd("createData")
nyc <- read_csv("createData/nycArrests.csv") %>% 
  dplyr::filter(year<2015) %>% 
  dplyr::mutate(
    Race=replace(Race, which(Race=="Hispanic"), "White"),
    FIPS="36061",
    State="NEW YORK"
  ) %>% 
  dplyr::group_by(year, State, FIPS, Race) %>% 
  dplyr::summarise_all(sum, na.rm=T)


nyFips <- c("36061", "36047", "36081", "36005", "36085")

ucr_race <- ucr_race %>% 
  dplyr::filter(!(FIPS %in% nyFips)) %>% 
  dplyr::bind_rows(nyc)


```


##8.  Create Race Crime Rate Variables
###Attach county-level Census Population Data
```{r}
# if(exists("census_county")==FALSE){
#   census_county <- fst::read.fst("createData/census_county.csv")
# }

census_race <- census_county %>% 
  dplyr::ungroup() %>% 
  dplyr::filter(gender == "Both", age == "All") %>% 
  dplyr::select(-gender, -age, -state, -county)

ucr_race2 <- ucr_race 

##Fixing fips that begin with 0
for(i in 1:nrow(ucr_race2)){
  if(nchar(ucr_race2$FIPS[i])==4){
    ucr_race2$FIPS[i] <- paste0("`0", as.character(ucr_race2$FIPS[i]))
  } else {
  if(nchar(ucr_race2$FIPS[i])==5){
      ucr_race2$FIPS[i] <- paste0("`", as.character(ucr_race2$FIPS[i]))
    } 
  }
}
  
ucr_race2 <- ucr_race2 %>% dplyr::left_join(census_race)
```

###Creating Crime Rate Variables by Race

```{r}

crimeVars <- c("murder", "vc", "dui", "vandal", "theft", "drugs")
#florida_fips <- unique(ucr_race2$FIPS[which(ucr_race2$State=="FLORIDA")])

UCRData <-  ucr_race2 %>% 
  dplyr::group_by(year, FIPS) %>% 
    #mutate(State=stateFromLower(tolower(State), rev=T)) %>% 
    ##selecting, changing names, dropping all other vars
    dplyr::transmute(
      Race=Race, 
      murder=murder, 
      vc=ViolentCrime, 
      dui=dui, 
      vandal=vandalism, 
      theft=theft, 
      drugs=drugs,
      #drug_poss=drug_poss, 
      #drug_sale=drug_sale, 
      popUCR.c=population) %>% 
  dplyr::group_by(year, FIPS) %>% 
  mutate_at(crimeVars, funs(replace(., is.na(.), 0))) %>% ##Converting missing to zeros for counties reporting to UCR
  ##Frequencies
  mutate_at(crimeVars, funs("white" = .[which(Race=="White")])) %>% 
  mutate_at(crimeVars, funs("black" = .[which(Race=="Black")])) %>% 
  mutate_at(crimeVars, funs("asian" = .[which(Race=="Asian")])) %>% 
  mutate_at(crimeVars, funs("amerindian" = .[which(Race=="Native American")])) %>% 
  ##Rates 
  mutate_at(crimeVars, funs("white_r" = .[which(Race=="White")] / popUCR.c[which(Race=="White")])) %>% 
  mutate_at(crimeVars, funs("black_r" = .[which(Race=="Black")] / popUCR.c[which(Race=="Black")])) %>% 
  mutate_at(crimeVars, funs("asian_r" = .[which(Race=="Asian")] / popUCR.c[which(Race=="Asian")])) %>% 
  mutate_at(crimeVars, funs("amerindian_r" = .[which(Race=="Native American")] / popUCR.c[which(Race=="Native American")])) %>% 
  
  ##Now convert original crime vars to "Total" - all races. Will not be labelled
  mutate_at(crimeVars, funs(sum(as.numeric(.), na.rm=T))) %>% 
  mutate_at(crimeVars, funs("r" = sum(as.numeric(.), na.rm=T) / sum(as.numeric(popUCR.c), na.rm=T))) %>% ##Total 
 ##create average over years
dplyr::group_by(FIPS) %>% 
  ##Averaged aross 2011-2015  
  mutate_if(is.numeric, funs("5yr" = mean(., na.rm=T))) %>%
  dplyr::filter(year==2015, Race=="White") %>% 
  dplyr::select(-year, -Race) %>% 
  mutate_if(is.numeric, funs(replace(., is.nan(.), NA))) %>% 
  mutate_if(is.numeric, funs(replace(., is.infinite(.), NA))) 


##Save File, so this code in above chunks can be skipped.
##readr::write_csv(UCRData, path="createData/UCRData.csv")
fst::write.fst(UCRData, path="createData/UCRData.csv")   
  
```




#IV.  SEDA Variables - ELA, Math, Segregation, Gini, SES composite

`Sean F. Reardon, Demetra Kalogrides, Andrew Ho, Ben Shear, Kenneth Shores, Erin Fahle. (2016).` 
`  Stanford Education Data Archive (Version 1.1 File Title).`
http://purl.stanford.edu/db586ns4974.

File - `district covariates by year and grade (long file).csv`


##1. Load SES and Frequency of Students Variables - Averaging across years 

Converted files to fst format for faster loading.

`totenrl` = Number of Students in Grade

`nsch` =	Number of Schools in the District


NYC schools are already aggregated, but no county FIPS codes are provided.  This has to be done manually to avoid losing all NYC cases.
 .* leaid = "3620580"
 .* leaname = "NEW YORK CITY PUBLIC SCHOOLS"

Notes on variables:
.*  ses_all:  "computed as the first principal component factor score of the following measures: median income, percent with a bachelor's degree or higher, poverty rate, SNAP rate, single mother headed household rate, and unemployment rate"; does not vary by grade or year; original data source is ACS.

.*  "theil_whiteBlack" is labelled "hswhtblk" in the SEDA dataset:   Information index between schools: White/Black.   "the information theory index is computed as the average deviation of each student's school racial diversity from the district-wide racial diversity. Values of 0 indicate no segregation while values of 1 indicate complete segregation. See Theil (1972) for more informaiton"

.*  gini_all "computed using counts of people in each of 16 income categories; using the rpme -- Robust Pareto midpoint estimator-- implemented in Stata by von Hippel and Powers"

```{r}

##  "district covariates by year and grade (long file).csv"
#seda_ses <- readr::read_csv(choose.files())
seda_ses <- fst::read.fst("createData/seda_ses_fst.csv") 

seda_ses_covars <- seda_ses %>%
dplyr::mutate(countyid = replace(countyid, which(leaname == "NEW YORK CITY PUBLIC SCHOOLS" | leaid == "3620580"), "36061")) %>% 
dplyr::group_by(leaid)  %>% 
dplyr::summarise(
  FIPS = first(countyid),

  ##all students, across all grades - used for weighted averaging ses vars
  totalStudents_white = sum(wht, na.rm=T),
  totalStudents_black = sum(blk, na.rm=T),
  totalStudents_asian = sum(asn, na.rm=T),
  totalStudents_hisp = sum(hsp, na.rm=T),
  totalStudents_amer = sum(ind, na.rm=T),
  totalStudents_all = sum(totenrl, na.rm=T),
  seda_6thGraders_total = sum(totenrl[which(grade == 6)], na.rm=T),
  seda_7thGraders_total = sum(totenrl[which(grade == 7)], na.rm=T),
  seda_8thGraders_total = sum(totenrl[which(grade == 8)], na.rm=T),
  
  ses_all = weighted.mean(sesall[which(!is.na(totalStudents_all))], 
                          w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  theil_whiteBlack = weighted.mean(hswhtblk[which(!is.na(totalStudents_all))], 
                                   w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  gini_all = weighted.mean(giniall[which(!is.na(totalStudents_all))], 
                           w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T)
  ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 

```


##2.  6th & 8th grade ELA and Math scores

SEDA File: "district means national-referenced by year grade subject (long file)_v1_1.csv"
File description: "This file contains district level means in NAEP-referenced units. Estimates are comparable between states. There are multiple observations per district; one for each year, grade and subject."
File:  MeanG_V1.1

ELA scores for 6th and 8th graders, averaged across district and then county across years 2009-2013.

Weighted mean function returns missing values if values for weights (grade-specific populations) are missing, so filtering to cases where number of students in respective grade is reported.

```{r}
#setwd("createData")


seda_scores <- read_csv("createData/district means national-referenced by year grade subject (long file)_v1_1.csv") %>% 
  dplyr::filter(grade %in% c("08", "06")) %>% 
  dplyr::mutate(
    leaid = ifelse(nchar(leaid) == 6, paste0("0", as.character(leaid)), as.character(leaid))
  ) %>% 
  dplyr::group_by(leaid) %>% 
  dplyr::summarise(
    ELA_8thgrade = mean(mean_link_ela[which(grade=="08")], na.rm=T),
    MATH_8thgrade = mean(mean_link_math[which(grade=="08")], na.rm=T),
    ##6th grade has a lot fewer missing values - e.g. Los Angeles districts didn't report 8th grade data
    ELA_6thgrade = mean(mean_link_ela[which(grade=="06")], na.rm=T),
    MATH_6thgrade = mean(mean_link_math[which(grade=="06")], na.rm=T)
    ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 


```


##3.  6th and 7th grade, race gaps in ELA and math
Variables:

.* gapblk_ela = white-black gap in ela
.* gapblk_math = white-black gap in math

Number of non-missing cases for ELA gap:
.* 8th grade - 10099
.* 7th grade - 10359
.* 6th grade - 10172

Number of non-missing caes for math gap:
.* 8th grade - 9280
.* 7th grade - 9350
.* 6th grade - 10309

Using 7th grade ELA and 6th grade math gap estimates.

```{r}


seda_gaps <- read_csv("createData/district gaps by year grade subject (long file)_v1_1.csv") %>% 
  dplyr::filter(grade %in% c("07","06")) %>% 
  dplyr::mutate(
    leaid = ifelse(nchar(leaid) == 6, paste0("0", as.character(leaid)), as.character(leaid))
  ) %>% 
  dplyr::group_by(leaid) %>% 
  dplyr::summarise(
    ELA_7thgrade_gapblk = mean(gapblk_ela[which(grade=="07")], na.rm=T),
    MATH_6thgrade_gapblk = mean(gapblk_math[which(grade=="06")], na.rm=T)
    ) %>% 
  dplyr::mutate(
    ELA_7thgrade_gapblk = replace(ELA_7thgrade_gapblk, is.nan(ELA_7thgrade_gapblk), NA),
    MATH_6thgrade_gapblk = replace(MATH_6thgrade_gapblk, is.nan(MATH_6thgrade_gapblk), NA)
  ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 


```



##4.  Combine and Aggregate by County 

Below the school districts are aggregated by taking their weighted means, weighting by the number of students in each district for each respective grade.

```{r}

seda <- dplyr::left_join(seda_ses_covars, seda_scores, by=c("leaid")) %>% 
  dplyr::left_join(seda_gaps, by=c("leaid")) %>% 
  dplyr::group_by(leaid) %>% 
  ##Correction for NYC - aggregating
  dplyr::mutate(
    FIPS = as.character(FIPS),
    FIPS = replace(FIPS, which(nchar(FIPS)==4), paste0("0", FIPS)),
    FIPS = paste0("`", FIPS)
  ) %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::summarise(
    MATH_6thgrade_gap = weighted.mean(MATH_6thgrade_gapblk[which(!is.na(seda_6thGraders_total))], 
                                         w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    ELA_7thgrade_gap = weighted.mean(ELA_7thgrade_gapblk[which(!is.na(seda_7thGraders_total))], 
                                        w=seda_7thGraders_total[which(!is.na(seda_7thGraders_total))], na.rm=T),
    
    ELA_8thgrade = weighted.mean(ELA_8thgrade[which(!is.na(seda_8thGraders_total))], 
                                 w=seda_8thGraders_total[which(!is.na(seda_8thGraders_total))], na.rm=T),
    
    ELA_6thgrade = weighted.mean(ELA_6thgrade[which(!is.na(seda_6thGraders_total))], 
                                 w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    MATH_8thgrade = weighted.mean(MATH_8thgrade[which(!is.na(seda_8thGraders_total))], 
                                 w=seda_8thGraders_total[which(!is.na(seda_8thGraders_total))], na.rm=T),
    
    MATH_6thgrade = weighted.mean(MATH_6thgrade[which(!is.na(seda_6thGraders_total))], 
                                 w=seda_6thGraders_total[which(!is.na(seda_6thGraders_total))], na.rm=T),
    
    
  ses_all = weighted.mean(ses_all[which(!is.na(totalStudents_all))], 
                          w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  theil_whiteBlack = weighted.mean(theil_whiteBlack[which(!is.na(totalStudents_all))], 
                                   w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T),
  gini_all = weighted.mean(gini_all[which(!is.na(totalStudents_all))], 
                           w=totalStudents_all[which(!is.na(totalStudents_all))], na.rm=T)
  ) %>% 
  dplyr::mutate_all(funs(replace(., is.nan(.), NA))) 
  
```






#V.  ACS (American Community Survey)

##1. Median Income by Race ACS (American Community Survey)
Further income data, disaggregated by race, are taken from the American Community Survey, "2011-2015 American Community Survey 5-Year Estimates", available online at
https://factfinder.census.gov/faces/affhelp/jsf/pages/metadata.xhtml?lang=en&type=dataset&id=dataset.en.ACS_15_5YR

*Options*:  

- Median Earnings, or 
- Median Household Income, or 
- Median Family Income, or
- per capita income

**Race-specific median family incomes by county were retrieved from the following files:**

1. [White] B19113A - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (WHITE ALONE HOUSEHOLDER).

2. [Black] B19113B - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (BLACK OR AFRICAN AMERICAN ALONE HOUSEHOLDER)

3.  [White Non-Hispanic] B19113H - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (WHITE ALONE, NOT HISPANIC OR LATINO HOUSEHOLDER)

4. [Hispanic] B19113I - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS) (HISPANIC OR LATINO HOUSEHOLDER)

5. [All] B19113 - MEDIAN FAMILY INCOME IN THE PAST 12 MONTHS (IN 2015 INFLATION-ADJUSTED DOLLARS)

Aggregating the NYC burroughs using the median incomes of median incomes for each burrough.
```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

inc_all <- read_csv("createData/ACS_medianIncome.csv") %>% 
  dplyr::mutate(
    Race = "Total"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")

inc_white <- read_csv("createData/ACS_medianIncome_white.csv") %>% 
  dplyr::mutate(
    Race = "White"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",") 

inc_black <- read_csv("createData/ACS_medianIncome_black.csv") %>% 
    dplyr::mutate(
    Race = "Black"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")

inc_white_nh <- read_csv("createData/ACS_medianIncome_white_NH.csv") %>% 
    dplyr::mutate(
    Race = "White_NH"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",")

inc_hisp <- read_csv("createData/ACS_medianIncome_hispanic.csv") %>% 
    dplyr::mutate(
    Race = "Hispanic"
  ) %>% 
  tidyr::separate(county, into=c("county", "stateName"), sep=",") 



inc_df <- dplyr::bind_rows(inc_all, inc_white, inc_white_nh, inc_black, inc_hisp) %>% 
  tidyr::spread(Race, medianIncome) %>% 
    dplyr::rename(
    medianIncome = Total,
    medianIncome_white = White,
    medianIncome_black = Black,
    medianIncome_white_nh = White_NH,
    medianIncome_hisp = Hispanic
  ) %>% 
dplyr::select(-county, -stateName)


##Fixing fips that begin with 0
for(i in 1:nrow(inc_df)){
  if(nchar(inc_df$fips[i])==4){
    inc_df$fips[i] <- paste0("`0", as.character(inc_df$fips[i]))
  } 
  if(nchar(inc_df$fips[i])==5){
    inc_df$fips[i] <- paste0("`", as.character(inc_df$fips[i]))
  } 
  
}


inc_df <- inc_df %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    dplyr::group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)


```


##2.  Poverty. ACS (American Community Survey)
**Poverty**
[Total and by Race] S1702 - POVERTY STATUS IN THE PAST 12 MONTHS OF FAMILIES

-- HC02_EST_VC01 - All families  - Percent below poverty level; Estimate; Families

-- HC02_EST_VC09 - All families  - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is *White alone*"

-- HC02_EST_VC10 - All families  - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is *Black or African American alone*

-- HC02_EST_VC11 - All families  - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is *American Indian and Alaska Native alone*

--HC02_EST_VC12 - All families  - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is *Asian alone*

-- HC02_EST_VC13 - All families  - Percent below poverty level; Estimate; RACE AND HISPANIC OR LATINO ORIGIN - Families with a householder who is *Native Hawaiian and Other Pacific Islander alone*

-- HC02_EST_VC17 - All families  - Percent below poverty level; Estimate; Hispanic or Latino origin (of any race)

-- HC02_EST_VC18 - All families  - Percent below poverty level; Estimate; White alone, not Hispanic or Latino

```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

pov <- read_csv("createData/ACS_poverty.csv") %>% 
  dplyr::select(-county)

##Fixing fips that begin with 0
for(i in 1:nrow(pov)){
  if(nchar(pov$fips[i])==4){
    pov$fips[i] <- paste0("`0", as.character(pov$fips[i]))
  } 
  if(nchar(pov$fips[i])==5){
    pov$fips[i] <- paste0("`", as.character(pov$fips[i]))
  } 
  
}


pov <- pov %>% 
   dplyr::rename(FIPS=fips) %>% 
    dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```

##3. Gini Index ACS (American Community Survey)

- B19083 - 	GINI INDEX OF INCOME INEQUALITY (5 year averages)
https://factfinder.census.gov/faces/affhelp/jsf/pages/metadata.xhtml?lang=en&type=table&id=table.en.ACS_15_5YR_B19083

```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

gini <- read_csv("createData/ACS_gini_5yr.csv") %>% 
  dplyr::select(-county)


##Fixing fips that begin with 0
for(i in 1:nrow(gini)){
  if(nchar(gini$fips[i])==4){
    gini$fips[i] <- paste0("`0", as.character(gini$fips[i]))
  } 
  if(nchar(gini$fips[i])==5){
    gini$fips[i] <- paste0("`", as.character(gini$fips[i]))
  } 
}

gini <- gini %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```


##4. Single-Parent-Headed Households %  ACS (American Community Survey)

-- B11001 - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE)"

-- B11001A - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (WHITE ALONE)"

-- B11001B - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (BLACK OR AFRICAN AMERICAN ALONE)"

-- B11001C - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (AMERICAN INDIAN AND ALASKA NATIVE ALONE)"

-- B11001D - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (ASIAN ALONE)"

-- B11001E - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)"

-- B11001H - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (WHITE ALONE, NOT HISPANIC OR LATINO)"

-- B11001I - "HOUSEHOLD TYPE (INCLUDING LIVING ALONE) (HISPANIC OR LATINO)"

From each dataset two variables are extracted:  

1.  "Estimate; Family households" [HD01_VD02]  

2. "Estimate; Family households: - Other family: - Female householder, no husband present" [HD01_VD06]

```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

sp_total <- read_csv("createData/ACS_singleParent_total.csv") %>% 
  dplyr::mutate(singleParent_prcnt = singleParent/totalFamilies)


sp_white <- read_csv("createData/ACS_singleParent_white.csv") %>% 
    dplyr::mutate(
      singleParent_prcnt_white = ifelse(families_white>0,
                                        singleParent_white/families_white,0)
    )

sp_white_nh <- read_csv("createData/ACS_singleParent_white_nh.csv") %>% ##nonhispanic white
      dplyr::mutate(
        singleParent_prcnt_white_nh = ifelse(families_white_nh > 0,
                                             singleParent_white_nh/families_white_nh, 0)
      )

sp_black <- read_csv("createData/ACS_singleParent_black.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_black = ifelse(families_black > 0,
                                          singleParent_black/families_black,  0)
)

sp_amerindian <- read_csv("createData/ACS_singleParent_amerindian.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_amerindian = ifelse(families_amerindian>0,
                                               singleParent_amerindian/families_amerindian, 0)
      )

sp_asian <- read_csv("createData/ACS_singleParent_asian.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_asian = ifelse(families_asian>0,
                                          singleParent_asian/families_asian, 0)
      )

sp_hawaiian <- read_csv("createData/ACS_singleParent_hawaiian.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_hawaiian = ifelse(families_hawaiian>0,
                                             singleParent_hawaiian/families_hawaiian,  0)
      )

sp_hispanic <- read_csv("createData/ACS_singleParent_hispanic.csv") %>% 
      dplyr::mutate(
        singleParent_prcnt_hispanic = ifelse(families_hispanic>0,
                                             singleParent_hispanic/families_hispanic, 0)
      )

singleParent <- dplyr::left_join(sp_total, sp_white) %>% 
  dplyr::left_join(sp_white_nh) %>% 
  dplyr::left_join(sp_black) %>% 
  dplyr::left_join(sp_amerindian) %>% 
  dplyr::left_join(sp_asian) %>% 
  dplyr::left_join(sp_hawaiian) %>% 
  dplyr::left_join(sp_hispanic) %>% 
  dplyr::select(-county)
  

##Fixing fips that begin with 0
for(i in 1:nrow(singleParent)){
  if(nchar(singleParent$fips[i])==4){
    singleParent$fips[i] <- paste0("`0", as.character(singleParent$fips[i]))
  } 
  if(nchar(singleParent$fips[i])==5){
    singleParent$fips[i] <- paste0("`", as.character(singleParent$fips[i]))
  } 
}

singleParent <- singleParent %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```

##5.  Educational Attainment (less than high school %)  ACS (American Community Survey)
**EDUCATIONAL ATTAINMENT**
[Total and by Race] S1501 - EDUCATIONAL ATTAINMENT

HC02_EST_VC03 - "Percent; Estimate; Population 18 to 24 years - Less than high school graduate"

```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

hs <- read_csv("createData/ACS_highschool.csv")

##Fixing fips that begin with 0
for(i in 1:nrow(hs)){
  if(nchar(hs$fips[i])==4){
    hs$fips[i] <- paste0("`0", as.character(hs$fips[i]))
  } 
  if(nchar(hs$fips[i])==5){
    hs$fips[i] <- paste0("`", as.character(hs$fips[i]))
  } 
}

hs <- hs %>% 
dplyr::rename(FIPS=fips) %>% 
dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```





##6.  Crowing (ACS)  (Occupants per room)  (ACS)
[All Races, by Owner or Renter] B25014 - TENURE BY OCCUPANTS PER ROOM
To calculate "crowding" we esimate the percentage of households with 1.01 or more occupants per room.  We include owners and renters.  We use the following variables:

- HD01_VD01 - "Estimate; Total:"

- HD01_VD05 - "Estimate; Owner occupied: - 1.01 to 1.50 occupants per room"

- HD01_VD06 - "Estimate; Owner occupied: - 1.51 to 2.00 occupants per room"

- HD01_VD07 - "Estimate; Owner occupied: - 2.01 or more occupants per room"

- HD01_VD11 - "Estimate; Renter occupied: - 1.01 to 1.50 occupants per room"

- HD01_VD12 - "Estimate; Renter occupied: - 1.51 to 2.00 occupants per room"

- HD01_VD13 - "Estimate; Renter occupied: - 2.01 or more occupants per room"

```{r}
#setwd("createData")
crowd <- read_csv("createData/ACS_crowding.csv")

##Fixing fips that begin with 0
for(i in 1:nrow(crowd)){
  if(nchar(crowd$fips[i])==4){
    crowd$fips[i] <- paste0("`0", as.character(crowd$fips[i]))
  } 
  if(nchar(crowd$fips[i])==5){
    crowd$fips[i] <- paste0("`", as.character(crowd$fips[i]))
  } 
  
}

crowd <- crowd %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```




##7.  Residential Segregation - CountyHealthRankings.org from ACS 2016-2017
Averaged across 2016 and 2017
countyhealthrankings.org, "Residential segregation—black/white, 2011-2015, source=ACS"

Description: http://www.countyhealthrankings.org/measure/residential-segregation-blackwhite

"Racial/ethnic residential segregation refers to the degree to which two or more groups live separately from one another in a geographic area. The index of dissimilarity is a demographic measure of the evenness with which two groups (black and white residents, in this case) are distributed across the component geographic areas (census tracts, in this case) that make up a larger area (counties, in this case). The index score can be interpreted as the percentage of either black or white residents that would have to move to different geographic areas in order to produce a distribution that matches that of the larger area."

```{r}
#setwd("createData")
seg <- readr::read_csv("createData/segregation_countyhealth.csv") %>% 
  group_by(fips) %>% 
  dplyr::select(-State, -County, -year, -X1) %>% 
  dplyr::summarise_all(mean, na.rm=T) %>% 
   dplyr::rename(FIPS=fips) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
    group_by(FIPS) %>% 
  dplyr::summarise_all(median, na.rm=T)

```




#VI. BLS Data - Employment (2011-2015 average)

Employment data are derived from the Bureau of Labor Statistics, "Labor Force Data by County, 2015 Annual Averages".  The Local Area Unemployment Statistics (LAUS) are provided at:   averages
https://www.bls.gov/lau/#tables
Specifically, I downloaded the aggregated the following excel files:
  
  1. Labor force data by county, 2015 annual averages  https://www.bls.gov/lau/laucnty15.xlsx
  
  2. Labor force data by county, 2014 annual averages  https://www.bls.gov/lau/laucnty14.xlsx
  
  3. Labor force data by county, 2013 annual averages  https://www.bls.gov/lau/laucnty13.xlsx
  
  4. Labor force data by county, 2012 annual averages  https://www.bls.gov/lau/laucnty12.xlsx
  
  5. Labor force data by county, 2011 annual averages  https://www.bls.gov/lau/laucnty11.xlsx
 

```{r}
#setwd("createData")
nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")
blsVars <- c("laborForce", "employed", "unemployed", "unemploymentRate")

bls <- read_csv("createData/blsData_2011_2015.csv") %>% 
  dplyr::mutate(
    stateCode = paste0("`", stateCode),
    fips = paste0(stateCode, countyCode),
    fips = replace(fips, which(fips %in% nyFips), "`36061")
  ) %>% 
  dplyr::group_by(fips, year) %>% 
  dplyr::summarise_all(median, na.rm=T) %>% ##necessary to remove extra NYC 
  dplyr::select(-countyName, -stateCode, -countyCode) %>% 
dplyr::group_by(year, fips) %>% 
  mutate_all(sum, na.rm=T) %>%  ##aggregating NYC
  mutate(unemploymentRate = unemployed/laborForce) %>% 
dplyr::group_by(fips) %>% 
  mutate_at(blsVars, funs("5yr" = mean(., na.rm=T))) %>% 
  mutate_at(blsVars,funs("change" = mean((.[which(year==2015)] - .[which(year==2011)]) / .[which(year==2011)], na.rm=T))) %>% 
  dplyr::filter(year==2015) %>% 
  dplyr::select(-year) %>% 
  dplyr::rename(FIPS=fips)



```


#VII.  POLICE DATA (LEMAS)

Law Enforcement Management and Administrative Statistics (LEMAS) Series from ICPSR (2013).
https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/36164

```{r}
# Law Enforcement Agency Indentifiers Crosswalk, 2012 (ICPSR 35158)
# https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/35158

nyFips <- c("36061", "36047", "36081", "36005", "36085")

#setwd("createData")
ucr_geo <-  read_tsv("createData/35158-0001-Data.tsv") %>% 
dplyr::select(ORI7, FIPS )


police <- haven::read_stata("createData/36164-0001-Data.dta") %>% 
  dplyr::select(
                ORI7, 
                STATECODE, 
                PERS_RESP_PATRL, #primary duty - patrol
                PERS_RESP_INVST, #primary duty - investigative
                PERS_PDSW_FFT, #NUMBER OF FEMALE SWORN PERSONNEL - fulltime
                FTSWORN,  #TOTAL NUMBER OF SWORN PERSONNEL - fulltime
                PERS_FTS_BLK, #TOTAL NUMBER OF SWORN PERSONNEL - fulltime - Black
                COM_MIS, #COMMUNITY POLICING COMPONENT IN MISSION STATEMENT
                ## 1 = No written mission statement
                ## 2 = Written statement, no community policing component
                ## 3 = Written statement, yes community policing component
                TECH_TYP_GUN #Gunshot detection system, 1 = Yes (8%), 2 = NO.
                #FINALWT #FINAL WEIGHT FOR EACH STRATUM
                ) %>% 
  dplyr::rename(
    State=STATECODE,
    policeFullTime = FTSWORN,
    policeFemale = PERS_PDSW_FFT,
    policeBlack = PERS_FTS_BLK,
    policePatrol=PERS_RESP_PATRL, 
    policeInvestigate=PERS_RESP_INVST,
    communityPolice=COM_MIS,
    gunShotDetection=TECH_TYP_GUN) %>% 
  dplyr::left_join(ucr_geo) %>% 
  mutate_all(funs(replace(., is.nan(.), NA))) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "36061"), ###NYC aggregation
    communityPolice=replace(communityPolice, which(communityPolice==3), TRUE),
    communityPolice=replace(communityPolice, which(communityPolice != 3), FALSE),
    gunShotDetection=replace(gunShotDetection, which(gunShotDetection == 1), TRUE),
    gunShotDetection=replace(gunShotDetection, which(gunShotDetection == 2), FALSE)
  ) %>% 
  group_by(State, FIPS) %>% 
  dplyr::summarise(
    policePatrol=sum(policePatrol, na.rm=T),
    policeInvestigate=sum(policeInvestigate, na.rm=T),
    communityPolice=mean(communityPolice, na.rm=T),
    gunShotDetection=mean(gunShotDetection, na.rm=T),
    policeFullTime=sum(policeFullTime, na.rm=T),
    policeFemale=sum(policeFemale, na.rm=T),
    policeBlack=sum(policeBlack, na.rm=T)
  ) %>% 
  dplyr::mutate(
    policeFullTime_log=log(policeFullTime),
    policePrcntBlack = 100*policeBlack/policeFullTime,
    policePrcntFemale = 100*policeFemale/policeFullTime,
    policePatrol_log = log(1+policePatrol),
    policeInvestigate_log = log(1+policeInvestigate)
  ) %>% 
  mutate_all( funs(replace(., is.nan(.), NA))) %>% 
  mutate_all( funs(replace(., is.infinite(.), NA))) %>% 
  ungroup() %>% 
  dplyr::filter(!is.na(FIPS)) %>% 
  dplyr::mutate(FIPS=paste("`", as.character(FIPS), sep="")) %>% 
  dplyr::select(-State)
  

#fst::write.fst(police, path="police.csv")


```

#VIII. Land Area (Census)
Land Area estimates from:
https://www.census.gov/support/USACdataDownloads.html

'Land Area' file "LND01.xls".  
Variable:  LND110210D, "Land area in square miles 2010" 


```{r}

nyFips <- c("`36061", "`36047", "`36081", "`36005", "`36085")

library(dplyr)

area <- readr::read_csv("createData/LND01.csv") %>% 
  dplyr::select(Areaname, STCOU, LND110210D) %>% 
  dplyr::mutate(
    stateCode = substr(STCOU, start=3, stop=5)
  ) %>% 
  dplyr::filter(stateCode != "000") %>%   ##getting rid of state totals
  dplyr::transmute(
    FIPS = paste0("`", STCOU),
    landArea = LND110210D
  ) %>% 
  dplyr::filter(landArea > 0) %>% 
  dplyr::mutate(
    FIPS = replace(FIPS, which(FIPS %in% nyFips), "`36061")
  ) %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::summarise(
    landArea = sum(landArea, na.rm=TRUE)
  )
##eliminating 3 cases that aren't in the other data files with size 0
#Yellowstone National Park, MT, `30113
#Clifton Forge, VA, `51560 %>% 
#South Boston, VA, `51780


```



#IX.  Combine all Variables


```{r}
if(exists("UCRData")==FALSE){
  UCRData <- fst::read.fst("createData/UCRData.csv")
}

countyVars <- census_vars %>% 
  dplyr::left_join(seda) %>%
  dplyr::left_join(police) %>% 
  dplyr::left_join(area) %>% 
  dplyr::left_join(bls) %>% 
  dplyr::left_join(inc_df) %>% 
  dplyr::left_join(pov) %>% 
  dplyr::left_join(gini) %>% 
  dplyr::left_join(singleParent) %>% 
  dplyr::left_join(hs) %>% 
  dplyr::left_join(crowd) %>% 
  dplyr::left_join(seg) %>% 
  dplyr::left_join(UCRData)
  
fst::write.fst(countyVars, path="countyVars_f.csv")
write.csv(countyVars, file="countyVars.csv")
  
```



#X.  Principle Components 
##1.  CRIME
```{r}
if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 
##vars <- fst::read.fst("countyVars.csv")

totalCrime <- countyVars[,c("FIPS", "murder_r_5yr", "dui_r_5yr", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr")]
totalCrime <- totalCrime[complete.cases(totalCrime),] 

pc_crime <-  prcomp(~ murder_r_5yr + vc_r_5yr + dui_r_5yr  + vandal_r_5yr + theft_r_5yr + drugs_r_5yr, 
                            data=totalCrime,
                            scale. = T, center=TRUE,
                            na.action=na.omit)

totalCrime$pc1Crime <- pc_crime$x[,1]
totalCrime$pc2Crime <- pc_crime$x[,2]
totalCrime <- totalCrime[,c("FIPS", "pc1Crime", "pc2Crime")]

countyVars <- dplyr::left_join(countyVars, totalCrime)



##print PCA analysis from FactoMineR

pcFit <- FactoMineR::PCA(countyVars[,c("murder_r_5yr",  "dui_r_5yr", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr")])
summary(pcFit)

```


##2.  Crime - Violent Crime, Theft, Vandalism only
```{r}

if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 

vCrime <- countyVars[,c("FIPS", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr")]
vCrime <- vCrime[complete.cases(vCrime),] 

pc_crime <-  prcomp(~  vc_r_5yr +  vandal_r_5yr + theft_r_5yr , 
                            data=vCrime,
                            scale. = T, center=TRUE,
                            na.action=na.omit)

vCrime$pc1Crime_3 <- pc_crime$x[,1]
vCrime$pc2Crime_3 <- pc_crime$x[,2]
vCrime <- vCrime[,c("FIPS", "pc1Crime_3", "pc2Crime_3")]

countyVars <- dplyr::left_join(countyVars, vCrime)



##print PCA analysis from FactoMineR

pcFit <- FactoMineR::PCA(countyVars[,c("vc_r_5yr", "theft_r_5yr", "vandal_r_5yr")])
summary(pcFit)


```

##3.  SES
```{r}

if(exists("countyVars")==TRUE){
  countyVars <- countyVars
} else {
  countyVars <- fst::read.fst("Data/countyVars_f.csv")
} 


totalSES <- countyVars[,c("FIPS", "poverty", "singleParent", "medianIncome", "unemploymentRate_5yr")]
totalSES <- totalSES[complete.cases(totalSES),] 

pc_ses <-  prcomp(~ poverty + singleParent + medianIncome + unemploymentRate_5yr, 
                            data=totalSES,
                            scale. = T, center=TRUE,
                            na.action=na.omit)

totalSES$pc1SES <- pc_ses$x[,1]
totalSES$pc2SES <- pc_ses$x[,2]
totalSES <- totalSES[,c("FIPS", "pc1SES", "pc2SES")]

countyVars <- dplyr::left_join(countyVars, totalSES)



##print PCA analysis from FactoMineR

pcFit <- FactoMineR::PCA(countyVars[,c("poverty", "singleParent", "medianIncome", "unemploymentRate_5yr")])
summary(pcFit)

fst::write.fst(countyVars, path="countyVars_f.csv")
write.csv(countyVars, file="countyVars.csv")
```




#XI.  Add to Police Killing Variables
##1. Reload Police Killing Data (saved above)
```{r}
wapo <- fst::read.fst("df_wapo_fst.csv") %>% 
    dplyr::mutate(
    State=state,
    date = as.Date(date, format= "%Y-%m-%d"),
    age = as.numeric(age),
    gender= factor(gender, levels=c("Male", "Female"))
  )

lott <- fst::read.fst("df_lott_fst.csv") 

```

##2. Washington Post
```{r}
wapo_c <- wapo %>% 
  dplyr::filter(!is.na(FIPS), year < 2017) %>% 
  group_by(dataSource, FIPS) %>% 
  dplyr::summarise(
    State=first(State), 
    killed = n(), 
    killed.unarmed = length(which(Weapon=="unarmed")),
    killed.noattack = length(which(threat_level=="other")),
    killed.noflee = length(which(flee=="Not fleeing")),
    killed.unarmed_noattack = killed.unarmed + killed.noattack,
    killed.unarmed_noattack_noflee = killed.unarmed + killed.noattack + killed.noflee,
    
    ##Males
    killed.male = length(which(gender=="Male")), 
    killed.unarmed.male = length(which(Weapon=="unarmed" & gender=="Male")),
    killed.noattack.male = length(which(threat_level=="other" & gender=="Male")),
    killed.noflee.male = length(which(flee=="Not fleeing" & gender=="Male")),
    killed.unarmed_noattack.male = killed.unarmed.male + killed.noattack.male,
    killed.unarmed_noattack_noflee.male = killed.unarmed.male + killed.noattack.male + killed.noflee.male,
    
    ##Females
    killed.female = length(which(gender=="Female")), 
    killed.unarmed.female = length(which(Weapon=="unarmed" & gender=="Female")),
    killed.noattack.female = length(which(threat_level=="other" & gender=="Female")),
    killed.noflee.female = length(which(flee=="Not fleeing" & gender=="Female")),
    killed.unarmed_noattack.female = killed.unarmed.female + killed.noattack.female,
    killed.unarmed_noattack_noflee.female = killed.unarmed.female + killed.noattack.female + killed.noflee.female,
    
    ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Weapon=="unarmed" & Race=="White")),
    killed.noattack.white_nh = length(which(threat_level=="other" & Race=="White")),
    killed.noflee.white_nh = length(which(flee=="Not fleeing" & Race=="White")),
    killed.unarmed_noattack.white_nh = killed.unarmed.white_nh + killed.noattack.white_nh,
    killed.unarmed_noattack_noflee.white_nh = killed.unarmed.white_nh + killed.noattack.white_nh + killed.noflee.white_nh,
    
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Weapon=="unarmed" & Race=="Black")),
    killed.noattack.black = length(which(threat_level=="other" & Race=="Black")),
    killed.noflee.black = length(which(flee=="Not fleeing" & Race=="Black")),
    killed.unarmed_noattack.black = killed.unarmed.black + killed.noattack.black,
    killed.unarmed_noattack_noflee.black = killed.unarmed.black + killed.noattack.black + killed.noflee.black,
    
    #Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Weapon=="unarmed" & Race=="Hispanic")),
    killed.noattack.hisp = length(which(threat_level=="other" & Race=="Hispanic")),
    killed.noflee.hisp = length(which(flee=="Not fleeing" & Race=="Hispanic")),
    killed.unarmed_noattack.hisp = killed.unarmed.hisp + killed.noattack.hisp,
    killed.unarmed_noattack_noflee.hisp = killed.unarmed.hisp + killed.noattack.hisp + killed.noflee.hisp,
    
    
    ##White + Hispanic category - matches UCR
    killed.white = length(which(Race=="White" | Race == "Hispanic")), 
    killed.unarmed.white = length(which(Weapon=="unarmed" & (Race=="White" | Race == "Hispanic"))),
    killed.noattack.white = length(which(threat_level=="other" & (Race=="White" | Race == "Hispanic") )),
    killed.noflee.white = length(which(flee=="Not fleeing" & (Race=="White" | Race == "Hispanic")   )),
    killed.unarmed_noattack.white = killed.unarmed.white + killed.noattack.white,
    killed.unarmed_noattack_noflee.white = killed.unarmed.white + killed.noattack.white + killed.noflee.white,
    
    
    #Asian
    killed.asian = length(which(Race=="Asian")), 
    killed.unarmed.asian = length(which(Weapon=="unarmed" & Race=="Asian")),
    killed.noattack.asian = length(which(threat_level=="other" & Race=="Asian")),
    killed.noflee.asian = length(which(flee=="Not fleeing" & Race=="Asian")),
    killed.unarmed_noattack.asian = killed.unarmed.asian + killed.noattack.asian,
    killed.unarmed_noattack_noflee.asian = killed.unarmed.asian + killed.noattack.asian + killed.noflee.asian,
    
    #Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Weapon=="unarmed" & Race=="Native American")),
    killed.noattack.amerindian = length(which(threat_level=="other" & Race=="Native American")),
    killed.noflee.amerindian = length(which(flee=="Not fleeing" & Race=="Native American")),
    killed.unarmed_noattack.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian,
    killed.unarmed_noattack_noflee.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian + killed.noflee.amerindian
            ) 

##Repeat for States
wapo_state <- wapo %>% 
  dplyr::filter(!is.na(State), year < 2017) %>% 
  group_by(dataSource, State) %>% 
  dplyr::summarise(
    killed = n(), 
    killed.unarmed = length(which(Weapon=="unarmed")),
    killed.noattack = length(which(threat_level=="other")),
    killed.noflee = length(which(flee=="Not fleeing")),
    killed.unarmed_noattack = killed.unarmed + killed.noattack,
    killed.unarmed_noattack_noflee = killed.unarmed + killed.noattack + killed.noflee,
    
    ##Males
    killed.male = length(which(gender=="Male")), 
    killed.unarmed.male = length(which(Weapon=="unarmed" & gender=="Male")),
    killed.noattack.male = length(which(threat_level=="other" & gender=="Male")),
    killed.noflee.male = length(which(flee=="Not fleeing" & gender=="Male")),
    killed.unarmed_noattack.male = killed.unarmed.male + killed.noattack.male,
    killed.unarmed_noattack_noflee.male = killed.unarmed.male + killed.noattack.male + killed.noflee.male,
    
    ##Females
    killed.female = length(which(gender=="Female")), 
    killed.unarmed.female = length(which(Weapon=="unarmed" & gender=="Female")),
    killed.noattack.female = length(which(threat_level=="other" & gender=="Female")),
    killed.noflee.female = length(which(flee=="Not fleeing" & gender=="Female")),
    killed.unarmed_noattack.female = killed.unarmed.female + killed.noattack.female,
    killed.unarmed_noattack_noflee.female = killed.unarmed.female + killed.noattack.female + killed.noflee.female,
    
    ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Weapon=="unarmed" & Race=="White")),
    killed.noattack.white_nh = length(which(threat_level=="other" & Race=="White")),
    killed.noflee.white_nh = length(which(flee=="Not fleeing" & Race=="White")),
    killed.unarmed_noattack.white_nh = killed.unarmed.white_nh + killed.noattack.white_nh,
    killed.unarmed_noattack_noflee.white_nh = killed.unarmed.white_nh + killed.noattack.white_nh + killed.noflee.white_nh,
    
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Weapon=="unarmed" & Race=="Black")),
    killed.noattack.black = length(which(threat_level=="other" & Race=="Black")),
    killed.noflee.black = length(which(flee=="Not fleeing" & Race=="Black")),
    killed.unarmed_noattack.black = killed.unarmed.black + killed.noattack.black,
    killed.unarmed_noattack_noflee.black = killed.unarmed.black + killed.noattack.black + killed.noflee.black,
    
    #Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Weapon=="unarmed" & Race=="Hispanic")),
    killed.noattack.hisp = length(which(threat_level=="other" & Race=="Hispanic")),
    killed.noflee.hisp = length(which(flee=="Not fleeing" & Race=="Hispanic")),
    killed.unarmed_noattack.hisp = killed.unarmed.hisp + killed.noattack.hisp,
    killed.unarmed_noattack_noflee.hisp = killed.unarmed.hisp + killed.noattack.hisp + killed.noflee.hisp,
    
    
    ##White + Hispanic category - matches UCR
    killed.white = length(which(Race=="White" | Race == "Hispanic")), 
    killed.unarmed.white = length(which(Weapon=="unarmed" & (Race=="White" | Race == "Hispanic"))),
    killed.noattack.white = length(which(threat_level=="other" & (Race=="White" | Race == "Hispanic") )),
    killed.noflee.white = length(which(flee=="Not fleeing" & (Race=="White" | Race == "Hispanic")   )),
    killed.unarmed_noattack.white = killed.unarmed.white + killed.noattack.white,
    killed.unarmed_noattack_noflee.white = killed.unarmed.white + killed.noattack.white + killed.noflee.white,
    
    
    #Asian
    killed.asian = length(which(Race=="Asian")), 
    killed.unarmed.asian = length(which(Weapon=="unarmed" & Race=="Asian")),
    killed.noattack.asian = length(which(threat_level=="other" & Race=="Asian")),
    killed.noflee.asian = length(which(flee=="Not fleeing" & Race=="Asian")),
    killed.unarmed_noattack.asian = killed.unarmed.asian + killed.noattack.asian,
    killed.unarmed_noattack_noflee.asian = killed.unarmed.asian + killed.noattack.asian + killed.noflee.asian,
    
    #Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Weapon=="unarmed" & Race=="Native American")),
    killed.noattack.amerindian = length(which(threat_level=="other" & Race=="Native American")),
    killed.noflee.amerindian = length(which(flee=="Not fleeing" & Race=="Native American")),
    killed.unarmed_noattack.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian,
    killed.unarmed_noattack_noflee.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian + killed.noflee.amerindian
    
    
            ) 


```


##3. Lott & Moody (2016)
```{r}


lott_pkill <- lott %>% 
  group_by(dataSource, FIPS, year) %>% 
  dplyr::summarise(
    policeKilled = first(`police killed`)
  ) %>% 
  group_by(dataSource, FIPS) %>% 
  dplyr::summarise(
    policeKilled = sum(policeKilled)
  )

lott_c <- lott %>% 
  filter(!is.na(FIPS), year < 2017) %>% 
  group_by(dataSource, FIPS) %>% ##average across years 2012-2015
  dplyr::summarise(
    State=first(State), 
    killed = n(), 
    killed.unarmed = length(which(Armed==FALSE)),
    
   ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Armed==FALSE & Race=="White")),  
   
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Armed==FALSE & Race=="Black")),
   
    ##Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Armed==FALSE & Race=="Hispanic")),
   
    ##White + Hispanic
    killed.white = length(which(Race=="Hispanic" | Race == "White")), 
    killed.unarmed.white = length(which(Armed==FALSE & (Race=="Hispanic" | Race == "White"))),
   
    ##Asian
    killed.asian = length(which(Race=="Asian/Pacific Islander")), 
    killed.unarmed.asian = length(which(Armed==FALSE & Race=="Asian/Pacific Islander")),
      
    ##Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Armed==FALSE & Race=="Native American")),
   
   av_Police_on_Scene = mean(`number police on scene`),
   officer_white = length(which(OfficerRace=="White")) / killed,  
   #Percentage of first officers white out of total incicents
   officer_unknown = length(which(OfficerRace=="Unknown")) / killed,
   officer_black = length(which(OfficerRace=="Black")) / killed,
   officer_hispanic = length(which(OfficerRace=="Hispanic")) / killed
  ) %>% 
  dplyr::left_join(lott_pkill)
                            


##Repeat for States

lott_pkill_state <- lott %>% 
  dplyr::group_by(dataSource, State, year) %>% 
  dplyr::summarise(
    policeKilled = first(`police killed`)
  ) %>% 
  dplyr::group_by(dataSource, State) %>% 
  dplyr::summarise(
    policeKilled = sum(policeKilled)
  )

lott_state <- lott %>% 
  dplyr::filter(!is.na(State), year < 2017) %>% 
  dplyr::group_by(dataSource, State) %>% ##average across years 2012-2015
  dplyr::summarise(
    killed = n(), 
    killed.unarmed = length(which(Armed==FALSE)),
    
   ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Armed==FALSE & Race=="White")),  
   
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Armed==FALSE & Race=="Black")),
   
    ##Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Armed==FALSE & Race=="Hispanic")),
   
    ##White + Hispanic
    killed.white = length(which(Race=="Hispanic" | Race == "White")), 
    killed.unarmed.white = length(which(Armed==FALSE & (Race=="Hispanic" | Race == "White"))),
   
       ##Asian
    killed.asian = length(which(Race=="Asian/Pacific Islander")), 
    killed.unarmed.asian = length(which(Armed==FALSE & Race=="Asian/Pacific Islander")),
      
    ##Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Armed==FALSE & Race=="Native American")),
   
   av_Police_on_Scene = mean(`number police on scene`),
   officer_white = length(which(OfficerRace=="White")) / killed,  
   #Percentage of first officers white out of total incicents
   officer_unknown = length(which(OfficerRace=="Unknown")) / killed,
   officer_black = length(which(OfficerRace=="Black")) / killed,
   officer_hispanic = length(which(OfficerRace=="Hispanic")) / killed
  ) %>% 
  dplyr::left_join(lott_pkill_state)
                            
```

##4. Combining Lott & Moody (2013-2015) with 2016 data from Washington Post
```{r}
lott_wapo <- wapo %>% 
  dplyr::filter(!is.na(FIPS), year == 2016) %>% 
  group_by(dataSource,FIPS) %>% 
  dplyr::summarise(
    killed = n(), 
    killed.unarmed = length(which(Weapon=="unarmed")),
    ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Weapon=="unarmed" & Race=="White")),
    
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Weapon=="unarmed" & Race=="Black")),

    #Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Weapon=="unarmed" & Race=="Hispanic")),
    
    ##White + Hispanic category - matches UCR
    killed.white = length(which(Race=="White" | Race == "Hispanic")), 
    killed.unarmed.white = length(which(Weapon=="unarmed" & (Race=="White" | Race == "Hispanic"))),
    
     #Asian
    killed.asian = length(which(Race=="Asian")), 
    killed.unarmed.asian = length(which(Weapon=="unarmed" & Race=="Asian")),
    # killed.noattack.asian = length(which(threat_level=="other" & Race=="Asian")),
    # killed.noflee.asian = length(which(flee=="Not fleeing" & Race=="Asian")),
    # killed.unarmed_noattack.asian = killed.unarmed.asian + killed.noattack.asian,
    # killed.unarmed_noattack_noflee.asian = killed.unarmed.asian + killed.noattack.asian + killed.noflee.asian,
    
    #Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Weapon=="unarmed" & Race=="Native American"))
    # killed.noattack.amerindian = length(which(threat_level=="other" & Race=="Native American")),
    # killed.noflee.amerindian = length(which(flee=="Not fleeing" & Race=="Native American")),
    # killed.unarmed_noattack.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian,
    # killed.unarmed_noattack_noflee.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian + killed.noflee.amerindian
  )


lott_c2 <- lott_c %>% 
  group_by(dataSource,FIPS) %>% 
  dplyr::select(contains("killed")) %>% 
  dplyr::select(-policeKilled)


lott_wapo <- dplyr::bind_rows(lott_wapo, lott_c2) %>% 
  group_by(FIPS) %>% 
  dplyr::summarise_if(is.numeric, sum, na.rm=T) %>% 
  dplyr::mutate(
    dataSource = "WaPo & Lott"
  )


##Repeat for States
lott_wapo_state <- wapo %>% 
  dplyr::filter(!is.na(State), year == 2016) %>% 
  group_by(dataSource,State) %>% 
  dplyr::summarise(
    killed = n(), 
    killed.unarmed = length(which(Weapon=="unarmed")),
    ##White NonHispanic
    killed.white_nh = length(which(Race=="White")), 
    killed.unarmed.white_nh = length(which(Weapon=="unarmed" & Race=="White")),
    
    ##Black
    killed.black = length(which(Race=="Black")), 
    killed.unarmed.black = length(which(Weapon=="unarmed" & Race=="Black")),

    #Hispanic
    killed.hisp = length(which(Race=="Hispanic")), 
    killed.unarmed.hisp = length(which(Weapon=="unarmed" & Race=="Hispanic")),
    
    ##White + Hispanic category - matches UCR
    killed.white = length(which(Race=="White" | Race == "Hispanic")), 
    killed.unarmed.white = length(which(Weapon=="unarmed" & (Race=="White" | Race == "Hispanic"))),
    
     #Asian
    killed.asian = length(which(Race=="Asian")), 
    killed.unarmed.asian = length(which(Weapon=="unarmed" & Race=="Asian")),
    # killed.noattack.asian = length(which(threat_level=="other" & Race=="Asian")),
    # killed.noflee.asian = length(which(flee=="Not fleeing" & Race=="Asian")),
    # killed.unarmed_noattack.asian = killed.unarmed.asian + killed.noattack.asian,
    # killed.unarmed_noattack_noflee.asian = killed.unarmed.asian + killed.noattack.asian + killed.noflee.asian,
    
    #Native American
    killed.amerindian = length(which(Race=="Native American")), 
    killed.unarmed.amerindian = length(which(Weapon=="unarmed" & Race=="Native American"))
    # killed.noattack.amerindian = length(which(threat_level=="other" & Race=="Native American")),
    # killed.noflee.amerindian = length(which(flee=="Not fleeing" & Race=="Native American")),
    # killed.unarmed_noattack.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian,
    # killed.unarmed_noattack_noflee.amerindian = killed.unarmed.amerindian + killed.noattack.amerindian + killed.noflee.amerindian
  )


lott_c2_state <- lott_state %>% 
  group_by(dataSource,State) %>% 
  dplyr::select(contains("killed")) %>% 
  dplyr::select(-policeKilled)


lott_wapo_state <- dplyr::bind_rows(lott_wapo_state, lott_c2_state) %>% 
  group_by(State) %>% 
  summarise_if(is.numeric, sum, na.rm=T) %>% 
  dplyr::mutate(
    dataSource = "WaPo & Lott"
  )


```


##5.  Save Data
```{r}

readr::write_csv(lott_wapo, path="killed_lott_wapo.csv")
readr::write_csv(lott_c, path="killed_lott.csv")
readr::write_csv(wapo_c, path="killed_wapo.csv")

readr::write_csv(lott_wapo_state, path="killed_lott_wapo_state.csv")
readr::write_csv(lott_state, path="killed_lott_state.csv")
readr::write_csv(wapo_state, path="killed_wapo_state.csv")

```




#XII.  AGGREGATION
Below I aggregate counties into geographical units of having a minimum specified population size.   For example, if the threshold is 10,000, then I aggregate counties with less than 10,000 residents  into a larger unit having at least 10,000 residents.  The aggregation problem has many possible solutions.  The algorithm I employ aggregates adjacent counties with smallest population sizes first, i.e. counties with the smallest populations in the US are selected first, then aggregated with their smallest size neighbors.  

https://www.census.gov/geo/reference/county-adjacency.html

##A.  FUNCTIONS
###1. Aggregate Counties
This is to create the index that identifies which larger regions counties belong to.  There are 5 counties (or census areas) that are not included in the Census adjacency file.  They in Alaska, Hawaii, and South Dakota.  Their FIPS codes are: 02158, 15001, 15003, 15007, and 46102.  I merge these with the smallest group within the state.



```{r}
Aggregate.Counties <- function(pop.th,  adjacentFile = "createData/countyAdjacent.csv",
                               varFile = "countyVars_f.csv"){

library(readr)
library(tidyr)
library(dplyr)

  adjacent <- fst::read.fst(adjacentFile)
  countyVars <- fst::read.fst(path=varFile)
  df_agg <- countyVars[,c("FIPS", "state", "pop.c.total_5yr")] 
  df <- df_agg
  names(df) <- c("FIPS","state",  "pop")

##grabbing extra FIPS not assigned in adjacency file - attaching to smallest counties within the state
extraFIPS <- setdiff( df_agg$FIPS, adjacent$FIPS)
extra_df <- df_agg[which(df_agg$FIPS %in% extraFIPS),]
df_agg <- df_agg[!(df_agg$FIPS %in% extraFIPS),]
extra_df$n_FIPS <- NA
 for(i in seq_along(extra_df$FIPS)){
   i_state <- extra_df$state[i]
   df_agg_subset <- subset(df_agg, state == i_state)
   extra_df$n_FIPS[i] <- df_agg_subset$FIPS[which.min(df_agg_subset$pop.c.total_5yr)]
 }

extra_df <- extra_df[,c("FIPS", "n_FIPS")]
##Add these are rows to adj file below
adj <- adjacent[,c("FIPS", "n_FIPS")] %>% 
  dplyr::bind_rows(extra_df)

adj <- dplyr::left_join(adj, df, by=c("FIPS"))
df <- df[,c("FIPS", "pop")]
names(df) <- c( "n_FIPS",  "pop_n")
df <- dplyr::left_join(adj, df, by=c("n_FIPS")) %>% 
  dplyr::arrange(FIPS, pop_n) %>%   ##arrange so that neighbors with smallest populations come first
  group_by(FIPS) %>% 
  dplyr::mutate(GroupPop = pop) 

dfG <- df %>% group_by(FIPS) %>% dplyr::summarise()
dfG$Group <- as.numeric(row.names(dfG))
df <- df %>% dplyr::left_join(dfG)

#listwise deletion of cases
df <- df[complete.cases(df),]

message(paste0("INITIAL METRICS: THERE ARE ", length(unique(df$Group)), " UNIQUE GROUPS"))
message(paste0("THERE ARE ", length(unique(df$Group[which(df$GroupPop < pop.th)])), " GROUPS WITH POPULATIONS SMALLER THAN THRESHOLD"))

##Condition 1 - Combine if neighbor also has small population
for(i in 1:nrow(df)){
  f <- df$FIPS[i]
  f2 <- df$n_FIPS[i]
  i2 <- which(df$FIPS==f2)
  
  pop_i2 <- unique(df$pop[i2])
  Gpop_i2 <- unique(df$GroupPop[i2])
  g_i1 <- df$Group[i]
  g_i2 <- unique(df$Group[i2])
  
  if(df$pop[i] < pop.th & df$GroupPop[i] < pop.th){ ##1) If county i has population smaller than threshold 
    if(pop_i2 < pop.th & Gpop_i2 < pop.th){         ##2) If neighbor also has pop. smaller than threshold 
      if(g_i1 != g_i2){                             ##3) If they don't already belong to the same group, then...
        df$Group[which(df$Group == g_i2)] <- g_i1
        
        #check to ensure they don't have the exact same population value, then add unique values
        if(df$GroupPop[i] == Gpop_i2){message("Warning - two counties have exact same group population values")} else {
          df$GroupPop[which(df$Group == g_i1)] <- sum(unique(df$pop[which(df$Group == g_i1)]))
        }
      }}} 
}

message(paste0("END OF ROUND 1, THERE ARE ", length(unique(df$Group)), " UNIQUE GROUPS"))
message(paste0("THERE ARE ", length(unique(df$Group[which(df$GroupPop < pop.th)])), " GROUPS WITH POPULATIONS SMALLER THAN THRESHOLD"))

##Condition 2 - Merge with Larger Neighbors also
for(i in 1:nrow(df)){
  f <- df$FIPS[i]
  f2 <- df$n_FIPS[i]
  i2 <- which(df$FIPS==f2)
  
  pop_i2 <- unique(df$pop[i2])
  Gpop_i2 <- unique(df$GroupPop[i2])
  g_i1 <- df$Group[i]
  g_i2 <- unique(df$Group[i2])
  
  if(df$pop[i] < pop.th & df$GroupPop[i] < pop.th){ ##1) If county i has population smaller than threshold 
    if(g_i1 != g_i2){                             ##3) If they don't already belong to the same group, then...
      df$Group[which(df$Group == g_i2)] <- g_i1
      
      #check to ensure they don't have the exact same population value, then add unique values
      if(df$GroupPop[i] == Gpop_i2){message("Warning - two counties have exact same group population values")} else {
        df$GroupPop[which(df$Group == g_i1)] <- sum(unique(df$pop[which(df$Group == g_i1)]))
      }
    }}
}

message(paste0("END OF ROUND 2, THERE ARE ", length(unique(df$Group)), " UNIQUE GROUPS"))
message(paste0("THERE ARE ", length(unique(df$Group[which(df$GroupPop < pop.th)])), " GROUPS WITH POPULATIONS SMALLER THAN THRESHOLD"))

df$nGroup <- NA
myGroups <- unique(df$Group)
for(i in seq_along(unique(df$Group))){
  df$nGroup[which(df$Group == myGroups[i])] <- i
}

df_id <- df[,c("FIPS", "nGroup")] %>% 
  dplyr::group_by(FIPS, nGroup) %>% 
  dplyr::summarise()

names(df_id) <- c("FIPS", "Group")
return(df_id)
}

```



###2.  Aggregate.County.Data
This is to summarise all variables in the datasets used below.
Because drug possession and drug sales arrest data are not provided by Florida, I'm not including these two variable sets in the aggregated data sets.

```{r}
Aggregate.County.Data <- function(killFile, indexFile, varFile = "countyVars_f.csv"){

library(readr)
library(tidyr)
library(dplyr)
df_new <- fst::read.fst(path=varFile)
df <- dplyr::left_join(df_new, readr::read_csv(killFile)) %>% 
  dplyr::mutate(
    policeRatio = policeFullTime/pop.c.total_5yr,
    bw_income_gap = medianIncome_black/medianIncome_white_nh
  )

crimeVars <- c("murder",  "dui", "drugs", "vc", "theft", "vandal",
               "murder_5yr",  "dui_5yr", "drugs_5yr", "vc_5yr", "theft_5yr", "vandal_5yr" ,
               "murder_white",  "murder_black",  "dui", "dui_white", "dui_black",
               "drugs", "drugs_white", "drugs_black",
               "vc_white", "vc_black",  "theft_white", "theft_black",
               "vandal_white", "vandal_black",
               "murder_white_5yr", "murder_black_5yr",
               "dui_white_5yr", "dui_black_5yr",
               "drugs_white_5yr", "drugs_black_5yr",
               "vc_white_5yr", "vc_black_5yr",
               "theft_white_5yr", "theft_black_5yr",
               "vandal_white_5yr" , "vandal_black_5yr")

popVars <- c("population", "pop.c.total","pop.c.black",  "pop.c.white_nh", "pop.c.WH", "pop.c.asian", 
             "pop.c.AP", "pop.c.amerindian", "pop.c.hispanic", "pop.male.15_29", "pop.black.male15_29",
             "pop.c.total_5yr","pop.c.black_5yr",  "pop.c.white_nh_5yr", "pop.c.WH_5yr", "pop.c.asian_5yr", 
             "pop.c.amerindian_5yr", "pop.c.hispanic_5yr", "pop.male.15_29_5yr", 
             "landArea")

killVars <- c("killed",  "killed.unarmed", "killed.white_nh", "killed.unarmed.white_nh", 
              "killed.black" ,"killed.unarmed.black","killed.hisp", "killed.unarmed.hisp", "killed.white",
              "killed.unarmed.white" , "killed.asian", "killed.unarmed.asian" ,"killed.amerindian", 
              "killed.unarmed.amerindian" )


sesCountVars <- c("unemployed", "laborForce", "employed", "laborForce_5yr", "employed_5yr", "unemployed_5yr",
                  "policeFullTime",  "policeBlack" , "policePatrol",  "policeInvestigate" , 
                  "policeFemale" , "totalFamilies", "singleParent",
                  "families_white", "singleParent_white", "families_white_nh", "singleParent_white_nh",
                  "families_black", "singleParent_black", "families_amerindian", "singleParent_amerindian",
                  "families_asian",  "singleParent_asian", "families_hawaiian", "singleParent_hawaiian" ,
                  "families_hispanic"  , "singleParent_hispanic"  , "singleParent_prcnt_hispanic" )

##WEIGHTED AVERAGES - TOTAL - 5YR (weighted by total population, averaged over several years)
wMeanVars.total_5yr <- c("medianIncome", "poverty", "gini", 
                         "residential_segregation_black_white", 
                         "residential_segregation_nonwhite_white", "highschoolDropOut",
                         "murder_r_5yr", "vc_r_5yr", "dui_r_5yr","drugs_r_5yr",
                         "theft_r_5yr"  , "vandal_r_5yr", "crowding_prcnt",
                         "policePrcntBlack", "policeRatio", "bw_income_gap",
                         "ELA_8thgrade", "MATH_8thgrade" ,
                         "ELA_6thgrade" , "MATH_6thgrade",
                          "MATH_6thgrade_gap", "ELA_7thgrade_gap",
                          "theil_whiteBlack")

###WEIGHTED AVERAGES - White & Hispanic - 5YR - Note "White" includes sispanics
wMeanVars.WH_5yr <- c("murder_white_r_5yr" , "vc_white_r_5yr" , "dui_white_r_5yr" ,"vandal_white_r_5yr",
                      "theft_white_r_5yr" ,"drugs_white_r_5yr"  , 
                      "medianIncome_white" ,  "poverty_white"   )

###WEIGHTED AVERAGES - Black - 5YR 
wMeanVars.Black_5yr <- c("murder_black_r_5yr" , "vc_black_r_5yr" , "dui_black_r_5yr", "vandal_black_r_5yr",
                         "theft_black_r_5yr" ,"drugs_black_r_5yr" , 
                         "medianIncome_black", "poverty_black" ) 
#Rates for 2015
crimeRate.total_2015 <- c("murder_r", "vc_r", "dui_r" , "vandal_r" , "theft_r"  , "drugs_r" )

#Race-specific Rates for year 2015 -
crimeRate.WH_2015 <- c("murder_white_r" ,"vc_white_r" , "dui_white_r", "vandal_white_r", "theft_white_r", 
                       "drugs_white_r")

#2015 - **Also includes Change in Black Pop 
crimeRate.Black_2015 <- c("murder_black_r" , "vc_black_r", "dui_black_r", "vandal_black_r", 
                          "theft_black_r" ,"drugs_black_r" ,
                          "pop.c.black_change")

##AGGREGATE
df_index <- readr::read_csv(indexFile) 
df <- df %>% dplyr::left_join(df_index)

df_id <- df %>% dplyr::group_by(Group) %>% 
  dplyr::summarise(FIPS = paste(unique(FIPS), collapse=", "),
                   State = paste(unique(state), collapse = ", ")
                   )

df_sums <- df[,c("Group", killVars, popVars, crimeVars, sesCountVars)] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_all(sum, na.rm=T)

df_wMean.total_5yr <- df[,c("Group", wMeanVars.total_5yr, "pop.c.total_5yr")] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(wMeanVars.total_5yr, funs(weighted.mean(.[which(!is.na(pop.c.total_5yr))], 
                                                              w = pop.c.total_5yr[which(!is.na(pop.c.total_5yr))], na.rm=T)))

df_wMean.WH_5yr <- df[,c("Group", wMeanVars.WH_5yr, "pop.c.WH_5yr" )] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(wMeanVars.WH_5yr, funs(weighted.mean(.[which(!is.na(pop.c.WH_5yr))], 
                                                           w = pop.c.WH_5yr[which(!is.na(pop.c.WH_5yr))], na.rm=T)))

df_wMean.Black_5yr <- df[,c("Group", wMeanVars.Black_5yr, "pop.c.black_5yr" )] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(wMeanVars.Black_5yr, funs(weighted.mean(.[which(!is.na(pop.c.black_5yr))], 
                                                              w = pop.c.black_5yr[which(!is.na(pop.c.black_5yr))], na.rm=T)))

df_crime.total_2015 <- df[,c("Group", crimeRate.total_2015, "pop.c.total")] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(crimeRate.total_2015, funs(weighted.mean(.[which(!is.na(pop.c.total))], 
                                                               w = pop.c.total[which(!is.na(pop.c.total))], na.rm=T)))

df_crime.WH_2015 <- df[,c("Group", crimeRate.WH_2015, "pop.c.WH" )] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(crimeRate.WH_2015, funs(weighted.mean(.[which(!is.na(pop.c.WH))], 
                                                            w = pop.c.WH[which(!is.na(pop.c.WH))], na.rm=T)))

df_crime.Black_2015 <- df[,c("Group", crimeRate.Black_2015, "pop.c.black" )] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise_at(crimeRate.Black_2015, funs(weighted.mean(.[which(!is.na(pop.c.black))], 
                                                               w = pop.c.black[which(!is.na(pop.c.black))], na.rm=T)))

##Finally, calculate race-specific rates of median income and poverty over 5yr period
df_ses_white_hispanic <- df[,c("Group", "medianIncome_white_nh" , "poverty_white_nh", 
                                        "medianIncome_hisp" ,  "poverty_hispanic",
                                        "pop.c.white_nh_5yr", "pop.c.hispanic_5yr")] %>% 
  dplyr::group_by(Group) %>% 
  dplyr::summarise(
    medianIncome_white_nh = weighted.mean(medianIncome_white_nh[which(!is.na(pop.c.white_nh_5yr))], 
                                          w = pop.c.white_nh_5yr[which(!is.na(pop.c.white_nh_5yr))], na.rm=T),
                   
    poverty_white_nh = weighted.mean(poverty_white_nh[which(!is.na(pop.c.white_nh_5yr))], 
                                     w = pop.c.white_nh_5yr[which(!is.na(pop.c.white_nh_5yr))], na.rm=T),
            
    medianIncome_hisp = weighted.mean(medianIncome_hisp[which(!is.na(pop.c.hispanic_5yr))], 
                                      w = pop.c.hispanic_5yr[which(!is.na(pop.c.hispanic_5yr))], na.rm=T),
            
    poverty_hispanic = weighted.mean(poverty_hispanic[which(!is.na(pop.c.hispanic_5yr))], 
                                     w = pop.c.hispanic_5yr[which(!is.na(pop.c.hispanic_5yr))], na.rm=T)  )

##COMBINE ALL DATASETS
df_new <- dplyr::left_join(df_id, df_sums) %>% 
  dplyr::left_join(df_wMean.total_5yr) %>% 
  dplyr::left_join(df_wMean.WH_5yr) %>% 
  dplyr::left_join(df_wMean.Black_5yr) %>%
  dplyr::left_join(df_crime.total_2015) %>%
  dplyr::left_join(df_crime.WH_2015 ) %>%
  dplyr::left_join(df_crime.Black_2015) %>%
  dplyr::left_join(df_ses_white_hispanic) %>% 
  dplyr::mutate(
    unemploymentRate_5yr = unemployed_5yr/laborForce_5yr,
    singleParent_prcnt = singleParent/totalFamilies
  )


##CAlculate principle components - Crime and SES
##1.  Crime
totalCrime <- df_new[,c("Group", "murder_r_5yr",  "dui_r_5yr", "vc_r_5yr", "theft_r_5yr", "vandal_r_5yr", "drugs_r_5yr" )]
totalCrime <- totalCrime[complete.cases(totalCrime),] 
pc_crime <-  prcomp(~ murder_r_5yr + vc_r_5yr + dui_r_5yr  + vandal_r_5yr + theft_r_5yr + drugs_r_5yr, 
                            data=totalCrime,
                            scale. = T, center=TRUE,
                            na.action=na.omit)
totalCrime$pc1Crime <- pc_crime$x[,1]
totalCrime$pc2Crime <- pc_crime$x[,2]
totalCrime <- totalCrime[,c("Group", "pc1Crime", "pc2Crime")]
df_new <- dplyr::left_join(df_new, totalCrime)

##2. SES
totalSES <- df_new[,c("Group", "poverty", "singleParent", "medianIncome", "unemploymentRate_5yr")]
totalSES <- totalSES[complete.cases(totalSES),] 

pc_ses <-  prcomp(~ poverty + singleParent + medianIncome + unemploymentRate_5yr, 
                            data=totalSES,
                            scale. = T, center=TRUE,
                            na.action=na.omit)

totalSES$pc1SES <- pc_ses$x[,1]
totalSES$pc2SES <- pc_ses$x[,2]
totalSES <- totalSES[,c("Group", "pc1SES", "pc2SES")]

df_new <- dplyr::left_join(df_new, totalSES)

return(df_new)
}

```



I use the adjacency table 'countyAdjacent.csv' I generated from the file 'createCountyAdjacent.R'.  

https://www.census.gov/geo/reference/county-adjacency.html

##B. 10k Regions
```{r, warning=FALSE}
df_10k_id <- Aggregate.Counties(pop.th = 10000  )
readr::write_csv(df_10k_id, "index_ten_k.csv")

#Wapo & Lott/Moody
df_WL <- Aggregate.County.Data(killFile = "killed_lott_wapo.csv", indexFile = "index_ten_k.csv") 
readr::write_csv(df_WL, path="killed_lott_wapo_tenk.csv")

df_W <- Aggregate.County.Data(killFile =  "killed_wapo.csv", indexFile = "index_ten_k.csv") 
readr::write_csv(df_W, path="killed_wapo_tenk.csv")
```


##C.  100k Regions
```{r}

df_100k_id <- Aggregate.Counties(pop.th = 100000  )
readr::write_csv(df_100k_id, "index100k.csv")

#Wapo & Lott/Moody
df_WL <- Aggregate.County.Data(killFile = "killed_lott_wapo.csv", indexFile = "index100k.csv") 
readr::write_csv(df_WL, path="killed_lott_wapo_100k.csv")

df_W <- Aggregate.County.Data(killFile =  "killed_wapo.csv", indexFile = "index100k.csv") 
readr::write_csv(df_W, path="killed_wapo_100k.csv")
```

##D.  1 MILLION Region
```{r}

df_1M_id <- Aggregate.Counties(pop.th = 1e06 )
readr::write_csv(df_1M_id, "index_1Million.csv")

#Wapo & Lott/Moody
df_WL <- Aggregate.County.Data(killFile = "killed_lott_wapo.csv", indexFile = "index_1Million.csv") 
readr::write_csv(df_WL, path="killed_lott_wapo_1M.csv")

df_W <- Aggregate.County.Data(killFile =  "killed_wapo.csv", indexFile = "index_1Million.csv") 
readr::write_csv(df_W, path="killed_wapo_1M.csv")



```

##E. STATES
```{r}
df_id_state <- fst::read.fst(path="countyVars_f.csv") %>% 
  dplyr::group_by(FIPS) %>% 
  dplyr::summarise(Group = first(state))

readr::write_csv(df_id_state, "index_state.csv")

#Wapo & Lott/Moody
df_WL <- Aggregate.County.Data(killFile = "killed_lott_wapo.csv", indexFile = "index_state.csv") 
readr::write_csv(df_WL, path="killed_lott_wapo_State.csv")

df_W <- Aggregate.County.Data(killFile =  "killed_wapo.csv", indexFile = "index_state.csv") 
readr::write_csv(df_W, path="killed_wapo_State.csv")

```

