Cleaning the gdp metro data


library(reshape2)
library(stringr)

allgmp <- read.csv("data/gdp-metro/allgmp.csv")
head(allgmp)
##   component_id                                         component_name
## 1          200 GDP by Metropolitan Area (millions of current dollars)
## 2          200 GDP by Metropolitan Area (millions of current dollars)
## 3          200 GDP by Metropolitan Area (millions of current dollars)
## 4          200 GDP by Metropolitan Area (millions of current dollars)
## 5          200 GDP by Metropolitan Area (millions of current dollars)
## 6          200 GDP by Metropolitan Area (millions of current dollars)
##    FIPS                 area_name industry_id
## 1 00998 U.S. Metropolitan Portion         1  
## 2 00998 U.S. Metropolitan Portion         2  
## 3 00998 U.S. Metropolitan Portion         3  
## 4 00998 U.S. Metropolitan Portion         4  
## 5 00998 U.S. Metropolitan Portion         5  
## 6 00998 U.S. Metropolitan Portion         6  
##                                    industry_name       X2001       X2002
## 1                             All industry total 9046139.000 9347825.000
## 2                             Private industries 8026223.000 8258864.000
## 3    Agriculture, forestry, fishing, and hunting   51168.000   51264.000
## 4             Crop and animal production (Farms)   35984.000   35939.000
## 5      Forestry, fishing, and related activities   15184.000   15325.000
## 6                                         Mining   88968.000   77420.000
##         X2003        X2004        X2005        X2006        X2007
## 1 9771963.000 10413832.000 11082353.000 11772193.000 12324166.000
## 2 8617065.000  9197683.000  9803105.000 10426966.000 10899969.000
## 3   58056.000    69766.000    66533.000    63517.000    82931.000
## 4   41755.000    52718.000    48412.000    44444.000    63630.000
## 5   16301.000    17049.000    18120.000    19073.000    19301.000
## 6  107198.000   129218.000   170753.000   198165.000   207769.000
##          X2008
## 1 12724270.000
## 2 11221566.000
## 3    76455.000
## 4          n/a
## 5          n/a
## 6   244226.000
tail(allgmp)
##                                                                                                                           component_id
## 96900                                                                                                                              900
## 96901                           Note: NAICS Industry detail is based on the 1997 North American Industry Classification System (NAICS)
## 96902 Note: (D) Not shown in order to avoid the disclosure of confidential information; estimates are included in higher level totals.
## 96903                                                        Note: (L) Less than $500,000 in nominal or real GDP by metropolitan area.
## 96904                                                                                                         Note: n/a Not available.
## 96905                Source: U.S. Department of Commerce / Bureau of Economic Analysis / Regional Product Division  --  September 2009
##                                                         component_name
## 96900 Real GDP by Metropolitan Area (millions of chained 2001 dollars)
## 96901                                                                 
## 96902                                                                 
## 96903                                                                 
## 96904                                                                 
## 96905                                                                 
##        FIPS      area_name industry_id
## 96900 49740 Yuma, AZ (MSA)         109
## 96901                                 
## 96902                                 
## 96903                                 
## 96904                                 
## 96905                                 
##                               industry_name    X2001    X2002    X2003
## 96900 Private services-providing industries 1228.000 1342.000 1485.000
## 96901                                                                 
## 96902                                                                 
## 96903                                                                 
## 96904                                                                 
## 96905                                                                 
##          X2004    X2005    X2006    X2007    X2008
## 96900 1598.000 1745.000 1872.000 1994.000 2084.000
## 96901                                             
## 96902                                             
## 96903                                             
## 96904                                             
## 96905
allgmp <- subset(allgmp, component_name == "GDP by Metropolitan Area (millions of current dollars)")
allgmp$component_name <- NULL
all_m <- melt(allgmp, m = paste("X", 2001:2008, sep = ""))
all_m$year <- as.numeric(str_replace(all_m$variable, "X", ""))
all_m$variable <- NULL
gdp <- all_m[c("FIPS", "industry_id", "year", "value")]
names(gdp) <- c("fips", "indust", "year", "gdp")
gdp <- gdp[!gdp$fips %in% c("FIPS", ""), ]
gdp$code <- ifelse(gdp$gdp %in% c("n/a", "(D)", "(L)"), as.character(gdp$gdp), 
    NA)
gdp$gdp[!is.na(gdp$code)] <- NA
gdp$gdp <- as.numeric(as.character(gdp$gdp))

gdp$fips <- as.numeric(as.character(gdp$fips))
gdp$indust <- as.numeric(as.character(gdp$indust))

write.table(gdp, "gdp-metro.csv", sep = ",", row = F)
head(gdp)
##   fips indust year     gdp code
## 1  998      1 2001 9046139 <NA>
## 2  998      2 2001 8026223 <NA>
## 3  998      3 2001   51168 <NA>
## 4  998      4 2001   35984 <NA>
## 5  998      5 2001   15184 <NA>
## 6  998      6 2001   88968 <NA>
tail(gdp)
##         fips indust year  gdp code
## 258363 49740    103 2008  283 <NA>
## 258364 49740    104 2008  394 <NA>
## 258365 49740    105 2008  158 <NA>
## 258366 49740    106 2008   NA  n/a
## 258367 49740    108 2008 1198 <NA>
## 258368 49740    109 2008 2348 <NA>