Cleaning msa - for 2008
- Got the lists with msa from census.gov for 2008
- Extract only the msa codes and the city and consolidate into a dataset
- The city and the state in the “msa code” dataset are together,therefore separate them into two columns and call them msa state dataset
library(plyr)
options(stringsAsFactors = FALSE)
file_loading <- function(path) {
lines <- readLines(path, encoding = "Latin 1")
n <- length(lines)
last <- which(lines[-n] == "" & lines[-1] == "")
data <- lines[grepl("Statistical Area", lines)]
data <- gsub(" (Metropolitan|Micropolitan) Statistical Area", "", data)
data <- gsub(",", "", data)
pieces <- strsplit(data, " {2,}")
pieces <- pieces[sapply(pieces, length) == 2]
df <- as.data.frame(do.call("rbind", pieces))
names(df) <- c("msa_code", "city")
df
}
files <- dir("data/msa-changes/original/", "^2", full.names = TRUE)
data <- ldply(files, file_loading)
data <- unique(data[order(data$msa_code), ])
write.table(data, "msa-codes.csv", row = FALSE, sep = ",")
head(data)
## msa_code city
## 1 10020 Abbeville LA
## 2 10100 Aberdeen SD
## 3 10140 Aberdeen WA
## 4 10180 Abilene TX
## 5 10220 Ada OK
## 6 10260 Adjuntas PR
# Separate states from city names for msa state
msa <- read.csv("msa-codes.csv")
divider <- as.numeric(regexpr(" [A-Z-]+$", msa$city))
city <- substr(msa$city, 1, divider - 1)
states <- strsplit(substr(msa$city, divider + 1, 100), "-")
lengths <- sapply(states, length)
rep <- rep(1:nrow(msa), lengths)
citystate <- data.frame(city = city[rep], state = unlist(states), msa_code = msa$msa_code[rep])
write.table(citystate, "msa-states.csv", row = FALSE, sep = ",")
head(citystate)
## city state msa_code
## 1 Abbeville LA 10020
## 2 Aberdeen SD 10100
## 3 Aberdeen WA 10140
## 4 Abilene TX 10180
## 5 Ada OK 10220
## 6 Adjuntas PR 10260
codes08 <- file_loading("data/msa-changes/original/2008.txt")
divider <- as.numeric(regexpr(" [A-Z-]+$", codes08$city))
city <- substr(codes08$city, 1, divider - 1)
state <- substr(codes08$city, divider + 1, 100)
major_city <- sapply(strsplit(city, "-"), "[", 1)
major_state <- sapply(strsplit(state, "-"), "[", 1)
major <- data.frame(msa_code = codes08$msa_code, city = major_city, state = major_state,
label = paste(abbreviate(major_city, 6), major_state, sep = ""))
write.table(major, "msa-major.csv", row = FALSE, sep = ",")
head(major)
## msa_code city state label
## 1 10020 Abbeville LA AbbvllLA
## 2 10100 Aberdeen SD AberdnSD
## 3 10140 Aberdeen WA AberdnWA
## 4 10180 Abilene TX AbilenTX
## 5 10220 Ada OK AdaOK
## 6 10260 Adjuntas PR AdjntsPR