Cleaning msa - for 2008

Got the lists with msa from census.gov for 2008
Extract only the msa codes and the city and consolidate into a dataset
The city and the state in the “msa code” dataset are together,therefore separate them into two columns and call them msa state dataset

library(plyr)
options(stringsAsFactors = FALSE)

file_loading <- function(path) {
    lines <- readLines(path, encoding = "Latin 1")
    n <- length(lines)

    last <- which(lines[-n] == "" & lines[-1] == "")

    data <- lines[grepl("Statistical Area", lines)]
    data <- gsub(" (Metropolitan|Micropolitan) Statistical Area", "", data)
    data <- gsub(",", "", data)

    pieces <- strsplit(data, " {2,}")
    pieces <- pieces[sapply(pieces, length) == 2]

    df <- as.data.frame(do.call("rbind", pieces))
    names(df) <- c("msa_code", "city")
    df
}


files <- dir("data/msa-changes/original/", "^2", full.names = TRUE)
data <- ldply(files, file_loading)

data <- unique(data[order(data$msa_code), ])
write.table(data, "msa-codes.csv", row = FALSE, sep = ",")
head(data)

##   msa_code         city
## 1    10020 Abbeville LA
## 2    10100  Aberdeen SD
## 3    10140  Aberdeen WA
## 4    10180   Abilene TX
## 5    10220       Ada OK
## 6    10260  Adjuntas PR

# Separate states from city names for msa state
msa <- read.csv("msa-codes.csv")
divider <- as.numeric(regexpr(" [A-Z-]+$", msa$city))
city <- substr(msa$city, 1, divider - 1)
states <- strsplit(substr(msa$city, divider + 1, 100), "-")
lengths <- sapply(states, length)
rep <- rep(1:nrow(msa), lengths)

citystate <- data.frame(city = city[rep], state = unlist(states), msa_code = msa$msa_code[rep])
write.table(citystate, "msa-states.csv", row = FALSE, sep = ",")
head(citystate)

##        city state msa_code
## 1 Abbeville    LA    10020
## 2  Aberdeen    SD    10100
## 3  Aberdeen    WA    10140
## 4   Abilene    TX    10180
## 5       Ada    OK    10220
## 6  Adjuntas    PR    10260

codes08 <- file_loading("data/msa-changes/original/2008.txt")
divider <- as.numeric(regexpr(" [A-Z-]+$", codes08$city))
city <- substr(codes08$city, 1, divider - 1)
state <- substr(codes08$city, divider + 1, 100)
major_city <- sapply(strsplit(city, "-"), "[", 1)
major_state <- sapply(strsplit(state, "-"), "[", 1)

major <- data.frame(msa_code = codes08$msa_code, city = major_city, state = major_state, 
    label = paste(abbreviate(major_city, 6), major_state, sep = ""))
write.table(major, "msa-major.csv", row = FALSE, sep = ",")
head(major)

##   msa_code      city state    label
## 1    10020 Abbeville    LA AbbvllLA
## 2    10100  Aberdeen    SD AberdnSD
## 3    10140  Aberdeen    WA AberdnWA
## 4    10180   Abilene    TX AbilenTX
## 5    10220       Ada    OK    AdaOK
## 6    10260  Adjuntas    PR AdjntsPR