Cleaning for getting population MSA


df <- read.csv("data/census-population/raw-cbsa.csv", skip = 4, stringsAsFactors = FALSE)
head(df)
##   X10180  X   X.2         Abilene..TX Metropolitan.Statistical.Area
## 1  10180 NA 48059 Callahan County, TX          County or equivalent
## 2  10180 NA 48253    Jones County, TX          County or equivalent
## 3  10180 NA 48441   Taylor County, TX          County or equivalent
## 4  10420 NA    NA           Akron, OH Metropolitan Statistical Area
## 5  10420 NA 39133  Portage County, OH          County or equivalent
## 6  10420 NA 39153   Summit County, OH          County or equivalent
##   X160245 X160241 X160107 X158284 X158099 X158029 X158547 X158681 X158734
## 1   12905   12905   12917   12717   12802   12889   13107   13211   13288
## 2   20785   20785   20723   20125   19982   19911   19726   19544   19497
## 3  126555  126551  126467  125442  125315  125229  125714  125926  125949
## 4  694960  694962  695961  698205  699463  700110  700382  700403  699255
## 5  152061  152061  152383  153026  153692  154473  154837  155056  154581
## 6  542899  542901  543578  545179  545771  545637  545545  545347  544674
##   X159156 X159521 X.134 X.1823 X.185 X.70 X518 X134   X53 X422  X365 X240
## 1   13446   13533    12   -200    85   87  218  104    77  158    87   -9
## 2   19257   19197   -62   -598  -143  -71 -185 -182   -47 -240   -60  -22
## 3  126453  126791   -84  -1025  -127  -86  485  212    23  504   338  271
## 4  699082  698553   999   2244  1258  647  272   21 -1148 -173  -529  810
## 5  155348  155991   322    643   666  781  364  219  -475  767   643  193
## 6  543734  542562   677   1601   592 -134  -92 -198  -673 -940 -1172  617
##   X829 X760 X773 X808 X915 X946 X922 X978 X582 X2376 X2337 X2244 X2381
## 1  -14   -6  -26   -7   -3   19   -3   28   24   145   130   116   147
## 2  -18  -53  -26   -5   10   -6    3   15   41   200   189   173   170
## 3  861  819  825  820  908  933  922  935  517  2031  2018  1955  2064
## 4 2286 2070 1772 1651 1451 1677 2102 2021 2252  8826  8527  8352  8320
## 5  659  473  457  479  343  377  515  557  442  1732  1603  1609  1651
## 6 1627 1597 1315 1172 1108 1300 1587 1464 1810  7094  6924  6743  6669
##   X2457 X2454 X2429 X2476 X342 X1547 X1577 X1471 X1573 X1542 X1508 X1507
## 1   129   159   156   164   33   159   136   142   154   132   140   159
## 2   211   197   177   185   63   218   242   199   175   201   203   174
## 3  2117  2098  2096  2127  246  1170  1199  1130  1244  1209  1165  1174
## 4  8272  8124  8518  8548 1442  6540  6457  6580  6669  6821  6447  6416
## 5  1659  1602  1683  1734  249  1073  1130  1152  1172  1316  1225  1168
## 6  6613  6522  6835  6814 1193  5467  5327  5428  5497  5505  5222  5248
##   X1498 X.373 X.2549 X.800 X.631 X.160 X.645 X.820 X.503 X.635 X89 X116
## 1   136    22   -177   106   120   242   118    65   156    52   1    5
## 2   170   -38   -562   -66   -20  -166  -182   -30  -216   -29   1    6
## 3  1192  -357  -1810  -840  -731  -236  -581  -855  -443  -658  87  105
## 4  6527   127   -403 -1043  -660 -1399 -1229 -2752 -2218 -2594 223  709
## 5  1177   115    138   337   524   -18    18  -783   299   136  33  119
## 6  5350    12   -541 -1380 -1184 -1381 -1247 -1969 -2517 -2730 190  590
##   X.110 X.617 X441 X86 X263 X45 X142 X.462 X.2665 X.690  X.14 X.601 X.731
## 1     4    -2    6   3    5   3    5    21   -182   102   122   236   115
## 2     4    -2    7   4    6   2    4   -39   -568   -70   -18  -173  -186
## 3  -118  -613  428  79  252  40  133  -444  -1915  -722  -118  -664  -660
## 4   653   539  521 533  548 483  485   -96  -1112 -1696 -1199 -1920 -1762
## 5   110    89   93  94   97  85   85    82     19   227   435  -111   -76
## 6   543   450  428 439  451 398  400  -178  -1131 -1923 -1634 -1809 -1686
##   X.1083 X.548 X.777 X.1 X.103 X.145 X.212 X.130 X.136 X.73  X3 X22
## 1     60   153    47  -1    -9   -15    -7   -17   -11   -7   5   7
## 2    -36  -218   -33  -2   -18   -24   -25   -14   -10  -11 -27 -46
## 3  -1107  -483  -791   2   -76  -106  -180   -99  -115  -55  25  61
## 4  -3300 -2701 -3079  62   361   231  -465    20  -201  -73 -57  44
## 5   -880   214    51  14  -154  -144  -200   -97  -142  -69 -47 -50
## 6  -2420 -2915 -3130  48   515   375  -265   117   -59   -4 -10  94
tail(df)
##                      X10180    X   X.2          Abilene..TX
## 2754                  49540   NA    NA       Yazoo City, MS
## 2755                  49540   NA 28163     Yazoo County, MS
## 2756                  49780   NA    NA       Zanesville, OH
## 2757                  49780   NA 39119 Muskingum County, OH
## 2758                          NA    NA                     
## 2759 Release Date: March 19 2009    NA                     
##      Metropolitan.Statistical.Area X160245 X160241 X160107 X158284 X158099
## 2754 Micropolitan Statistical Area   28149   28149   28129   27767   27672
## 2755          County or equivalent   28149   28149   28129   27767   27672
## 2756 Micropolitan Statistical Area   84585   84585   84708   84695   85074
## 2757          County or equivalent   84585   84585   84708   84695   85074
## 2758                                    NA      NA      NA      NA      NA
## 2759                                    NA      NA      NA      NA      NA
##      X158029 X158547 X158681 X158734 X159156 X159521 X.134 X.1823 X.185
## 2754   27795   27775   27821   28753   28693   28464   -20   -362   -95
## 2755   27795   27775   27821   28753   28693   28464   -20   -362   -95
## 2756   85092   85155   85138   85378   85230   85087   123    -13   379
## 2757   85092   85155   85138   85378   85230   85087   123    -13   379
## 2758      NA      NA      NA      NA      NA      NA    NA     NA    NA
## 2759      NA      NA      NA      NA      NA      NA    NA     NA    NA
##      X.70 X518 X134 X53 X422 X365 X240 X829 X760 X773 X808 X915 X946 X922
## 2754  123  -20   46 932  -60 -229   48  101  135  149  107  190  149  234
## 2755  123  -20   46 932  -60 -229   48  101  135  149  107  190  149  234
## 2756   18   63  -17 240 -148 -143  108  189  229  225  178  203  187  253
## 2757   18   63  -17 240 -148 -143  108  189  229  225  178  203  187  253
## 2758   NA   NA   NA  NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2759   NA   NA   NA  NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##      X978 X582 X2376 X2337 X2244 X2381 X2457 X2454 X2429 X2476 X342 X1547
## 2754  213  117   430   417   432   405   447   445   489   460   69   329
## 2755  213  117   430   417   432   405   447   445   489   460   69   329
## 2756  255  305  1031  1075  1081  1089  1064  1062  1138  1122  197   842
## 2757  255  305  1031  1075  1081  1089  1064  1062  1138  1122  197   842
## 2758   NA   NA    NA    NA    NA    NA    NA    NA    NA    NA   NA    NA
## 2759   NA   NA    NA    NA    NA    NA    NA    NA    NA    NA   NA    NA
##      X1577 X1471 X1573 X1542 X1508 X1507 X1498 X.373 X.2549 X.800 X.631
## 2754   282   283   298   257   296   255   247   -67   -446  -208    14
## 2755   282   283   298   257   296   255   247   -67   -446  -208    14
## 2756   846   856   911   861   875   885   867    12   -121   234   -90
## 2757   846   856   911   861   875   885   867    12   -121   234   -90
## 2758    NA    NA    NA    NA    NA    NA    NA    NA     NA    NA    NA
## 2759    NA    NA    NA    NA    NA    NA    NA    NA     NA    NA    NA
##      X.160 X.645 X.820 X.503 X.635 X89 X116 X.110 X.617 X441 X86 X263 X45
## 2754  -107  -119   800  -304  -458   1   12    11     8   10  10   10   7
## 2755  -107  -119   800  -304  -458   1   12    11     8   10  10   10   7
## 2756   -57  -147    95  -412  -438   2   14    11     2   14   8   12   8
## 2757   -57  -147    95  -412  -438   2   14    11     2   14   8   12   8
## 2758    NA    NA    NA    NA    NA  NA   NA    NA    NA   NA  NA   NA  NA
## 2759    NA    NA    NA    NA    NA  NA   NA    NA    NA   NA  NA   NA  NA
##      X142 X.462 X.2665 X.690 X.14 X.601 X.731 X.1083 X.548 X.777 X.1 X.103
## 2754    7   -68   -458  -219    6  -117  -129    790  -311  -465  -1   -17
## 2755    7   -68   -458  -219    6  -117  -129    790  -311  -465  -1   -17
## 2756    8    10   -135   223  -92   -71  -155     83  -420  -446   3   -81
## 2757    8    10   -135   223  -92   -71  -155     83  -420  -446   3   -81
## 2758   NA    NA     NA    NA   NA    NA    NA     NA    NA    NA  NA    NA
## 2759   NA    NA     NA    NA   NA    NA    NA     NA    NA    NA  NA    NA
##      X.145 X.212 X.130 X.136 X.73 X3 X22
## 2754   -22   -40   -20   -25  -17 10  16
## 2755   -22   -40   -20   -25  -17 10  16
## 2756   -84  -117   -58   -73  -42 11  40
## 2757   -84  -117   -58   -73  -42 11  40
## 2758    NA    NA    NA    NA   NA NA  NA
## 2759    NA    NA    NA    NA   NA NA  NA
df <- df[-((nrow(df) - 1):nrow(df)), ]
# read the names from raw cbsa and name the df as the same i added [[1]]
# because it just got me the concatenated string with slashes
names(df) <- tolower(strsplit(readLines("data/census-population/raw-cbsa.csv", 
    1), ",")[[1]])
# Remove non-yearly variables
df$census2000pop <- NULL
df$estimatesbase2000 <- NULL
# Melt data and separate variable name from year
head(df)
##    cbsa mdiv stcou                name                          lsad
## 1 10180   NA 48059 Callahan County, TX          County or equivalent
## 2 10180   NA 48253    Jones County, TX          County or equivalent
## 3 10180   NA 48441   Taylor County, TX          County or equivalent
## 4 10420   NA    NA           Akron, OH Metropolitan Statistical Area
## 5 10420   NA 39133  Portage County, OH          County or equivalent
## 6 10420   NA 39153   Summit County, OH          County or equivalent
##   popestimate2000 popestimate2001 popestimate2002 popestimate2003
## 1           12917           12717           12802           12889
## 2           20723           20125           19982           19911
## 3          126467          125442          125315          125229
## 4          695961          698205          699463          700110
## 5          152383          153026          153692          154473
## 6          543578          545179          545771          545637
##   popestimate2004 popestimate2005 popestimate2006 popestimate2007
## 1           13107           13211           13288           13446
## 2           19726           19544           19497           19257
## 3          125714          125926          125949          126453
## 4          700382          700403          699255          699082
## 5          154837          155056          154581          155348
## 6          545545          545347          544674          543734
##   popestimate2008 npopchg_2000 npopchg_2001 npopchg_2002 npopchg_2003
## 1           13533           12         -200           85           87
## 2           19197          -62         -598         -143          -71
## 3          126791          -84        -1025         -127          -86
## 4          698553          999         2244         1258          647
## 5          155991          322          643          666          781
## 6          542562          677         1601          592         -134
##   npopchg_2004 npopchg_2005 npopchg_2006 npopchg_2007 npopchg_2008
## 1          218          104           77          158           87
## 2         -185         -182          -47         -240          -60
## 3          485          212           23          504          338
## 4          272           21        -1148         -173         -529
## 5          364          219         -475          767          643
## 6          -92         -198         -673         -940        -1172
##   naturalinc2000 naturalinc2001 naturalinc2002 naturalinc2003
## 1             -9            -14             -6            -26
## 2            -22            -18            -53            -26
## 3            271            861            819            825
## 4            810           2286           2070           1772
## 5            193            659            473            457
## 6            617           1627           1597           1315
##   naturalinc2004 naturalinc2005 naturalinc2006 naturalinc2007
## 1             -7             -3             19             -3
## 2             -5             10             -6              3
## 3            820            908            933            922
## 4           1651           1451           1677           2102
## 5            479            343            377            515
## 6           1172           1108           1300           1587
##   naturalinc2008 births2000 births2001 births2002 births2003 births2004
## 1             28         24        145        130        116        147
## 2             15         41        200        189        173        170
## 3            935        517       2031       2018       1955       2064
## 4           2021       2252       8826       8527       8352       8320
## 5            557        442       1732       1603       1609       1651
## 6           1464       1810       7094       6924       6743       6669
##   births2005 births2006 births2007 births2008 deaths2000 deaths2001
## 1        129        159        156        164         33        159
## 2        211        197        177        185         63        218
## 3       2117       2098       2096       2127        246       1170
## 4       8272       8124       8518       8548       1442       6540
## 5       1659       1602       1683       1734        249       1073
## 6       6613       6522       6835       6814       1193       5467
##   deaths2002 deaths2003 deaths2004 deaths2005 deaths2006 deaths2007
## 1        136        142        154        132        140        159
## 2        242        199        175        201        203        174
## 3       1199       1130       1244       1209       1165       1174
## 4       6457       6580       6669       6821       6447       6416
## 5       1130       1152       1172       1316       1225       1168
## 6       5327       5428       5497       5505       5222       5248
##   deaths2008 netmig2000 netmig2001 netmig2002 netmig2003 netmig2004
## 1        136         22       -177        106        120        242
## 2        170        -38       -562        -66        -20       -166
## 3       1192       -357      -1810       -840       -731       -236
## 4       6527        127       -403      -1043       -660      -1399
## 5       1177        115        138        337        524        -18
## 6       5350         12       -541      -1380      -1184      -1381
##   netmig2005 netmig2006 netmig2007 netmig2008 internationalmig2000
## 1        118         65        156         52                    1
## 2       -182        -30       -216        -29                    1
## 3       -581       -855       -443       -658                   87
## 4      -1229      -2752      -2218      -2594                  223
## 5         18       -783        299        136                   33
## 6      -1247      -1969      -2517      -2730                  190
##   internationalmig2001 internationalmig2002 internationalmig2003
## 1                    5                    4                   -2
## 2                    6                    4                   -2
## 3                  105                 -118                 -613
## 4                  709                  653                  539
## 5                  119                  110                   89
## 6                  590                  543                  450
##   internationalmig2004 internationalmig2005 internationalmig2006
## 1                    6                    3                    5
## 2                    7                    4                    6
## 3                  428                   79                  252
## 4                  521                  533                  548
## 5                   93                   94                   97
## 6                  428                  439                  451
##   internationalmig2007 internationalmig2008 domesticmig2000
## 1                    3                    5              21
## 2                    2                    4             -39
## 3                   40                  133            -444
## 4                  483                  485             -96
## 5                   85                   85              82
## 6                  398                  400            -178
##   domesticmig2001 domesticmig2002 domesticmig2003 domesticmig2004
## 1            -182             102             122             236
## 2            -568             -70             -18            -173
## 3           -1915            -722            -118            -664
## 4           -1112           -1696           -1199           -1920
## 5              19             227             435            -111
## 6           -1131           -1923           -1634           -1809
##   domesticmig2005 domesticmig2006 domesticmig2007 domesticmig2008
## 1             115              60             153              47
## 2            -186             -36            -218             -33
## 3            -660           -1107            -483            -791
## 4           -1762           -3300           -2701           -3079
## 5             -76            -880             214              51
## 6           -1686           -2420           -2915           -3130
##   residual2000 residual2001 residual2002 residual2003 residual2004
## 1           -1           -9          -15           -7          -17
## 2           -2          -18          -24          -25          -14
## 3            2          -76         -106         -180          -99
## 4           62          361          231         -465           20
## 5           14         -154         -144         -200          -97
## 6           48          515          375         -265          117
##   residual2005 residual2006 residual2007 residual2008
## 1          -11           -7            5            7
## 2          -10          -11          -27          -46
## 3         -115          -55           25           61
## 4         -201          -73          -57           44
## 5         -142          -69          -47          -50
## 6          -59           -4          -10           94
library(reshape2)
dfm <- melt(df, id = 1:5)
nc <- nchar(as.character(dfm$variable))
dfm$year <- as.numeric(substr(dfm$variable, nc - 3, nc))
dfm$variable <- substr(dfm$variable, 1, nc - 4)
# Remove 2001 because numbers are only for April-July
dfm <- subset(dfm, year != 2000)
# we only need the msas
head(dfm)
##       cbsa mdiv stcou                name                          lsad
## 2758 10180   NA 48059 Callahan County, TX          County or equivalent
## 2759 10180   NA 48253    Jones County, TX          County or equivalent
## 2760 10180   NA 48441   Taylor County, TX          County or equivalent
## 2761 10420   NA    NA           Akron, OH Metropolitan Statistical Area
## 2762 10420   NA 39133  Portage County, OH          County or equivalent
## 2763 10420   NA 39153   Summit County, OH          County or equivalent
##         variable  value year
## 2758 popestimate  12717 2001
## 2759 popestimate  20125 2001
## 2760 popestimate 125442 2001
## 2761 popestimate 698205 2001
## 2762 popestimate 153026 2001
## 2763 popestimate 545179 2001
msa <- subset(dfm, lsad == "Metropolitan Statistical Area")
msa$lsad <- NULL
msa$stcou <- NULL
msa$mdiv <- NULL
head(msa)
##       cbsa                              name    variable  value year
## 2761 10420                         Akron, OH popestimate 698205 2001
## 2764 10500                        Albany, GA popestimate 159814 2001
## 2770 10580       Albany-Schenectady-Troy, NY popestimate 829321 2001
## 2776 10740                   Albuquerque, NM popestimate 739327 2001
## 2781 10780                    Alexandria, LA popestimate 144915 2001
## 2784 10900 Allentown-Bethlehem-Easton, PA-NJ popestimate 748523 2001
msa2 <- dcast(msa, name + year ~ variable)
head(msa2)
##        name year births deaths domesticmig internationalmig naturalinc
## 1 Akron, OH 2001   8826   6540       -1112              709       2286
## 2 Akron, OH 2002   8527   6457       -1696              653       2070
## 3 Akron, OH 2003   8352   6580       -1199              539       1772
## 4 Akron, OH 2004   8320   6669       -1920              521       1651
## 5 Akron, OH 2005   8272   6821       -1762              533       1451
## 6 Akron, OH 2006   8124   6447       -3300              548       1677
##   netmig npopchg_ popestimate residual
## 1   -403     2244      698205      361
## 2  -1043     1258      699463      231
## 3   -660      647      700110     -465
## 4  -1399      272      700382       20
## 5  -1229       21      700403     -201
## 6  -2752    -1148      699255      -73
msa_codes <- read.csv("data/msa-changes/msa-codes.csv")
# remove the column name and add a column city with the values in name but
# after removing the comma which can be done by gsub
msa2$city <- gsub(",| /1", "", msa2$name)
msa2$name <- NULL
head(msa2)
##   year births deaths domesticmig internationalmig naturalinc netmig
## 1 2001   8826   6540       -1112              709       2286   -403
## 2 2002   8527   6457       -1696              653       2070  -1043
## 3 2003   8352   6580       -1199              539       1772   -660
## 4 2004   8320   6669       -1920              521       1651  -1399
## 5 2005   8272   6821       -1762              533       1451  -1229
## 6 2006   8124   6447       -3300              548       1677  -2752
##   npopchg_ popestimate residual     city
## 1     2244      698205      361 Akron OH
## 2     1258      699463      231 Akron OH
## 3      647      700110     -465 Akron OH
## 4      272      700382       20 Akron OH
## 5       21      700403     -201 Akron OH
## 6    -1148      699255      -73 Akron OH
msa3 <- merge(msa2, msa_codes, by = "city", all.x = TRUE)
head(msa3)
##       city year births deaths domesticmig internationalmig naturalinc
## 1 Akron OH 2001   8826   6540       -1112              709       2286
## 2 Akron OH 2002   8527   6457       -1696              653       2070
## 3 Akron OH 2003   8352   6580       -1199              539       1772
## 4 Akron OH 2004   8320   6669       -1920              521       1651
## 5 Akron OH 2005   8272   6821       -1762              533       1451
## 6 Akron OH 2006   8124   6447       -3300              548       1677
##   netmig npopchg_ popestimate residual msa_code
## 1   -403     2244      698205      361    10420
## 2  -1043     1258      699463      231    10420
## 3   -660      647      700110     -465    10420
## 4  -1399      272      700382       20    10420
## 5  -1229       21      700403     -201    10420
## 6  -2752    -1148      699255      -73    10420
write.table(msa3, "population-msa.csv", sep = ",", row = FALSE)