Cleaning for getting population MSA
- using the rcbsa and msacodes the population msa is formed
- cleared the unwanted data
- comments are best describers
df <- read.csv("data/census-population/raw-cbsa.csv", skip = 4, stringsAsFactors = FALSE)
head(df)
## X10180 X X.2 Abilene..TX Metropolitan.Statistical.Area
## 1 10180 NA 48059 Callahan County, TX County or equivalent
## 2 10180 NA 48253 Jones County, TX County or equivalent
## 3 10180 NA 48441 Taylor County, TX County or equivalent
## 4 10420 NA NA Akron, OH Metropolitan Statistical Area
## 5 10420 NA 39133 Portage County, OH County or equivalent
## 6 10420 NA 39153 Summit County, OH County or equivalent
## X160245 X160241 X160107 X158284 X158099 X158029 X158547 X158681 X158734
## 1 12905 12905 12917 12717 12802 12889 13107 13211 13288
## 2 20785 20785 20723 20125 19982 19911 19726 19544 19497
## 3 126555 126551 126467 125442 125315 125229 125714 125926 125949
## 4 694960 694962 695961 698205 699463 700110 700382 700403 699255
## 5 152061 152061 152383 153026 153692 154473 154837 155056 154581
## 6 542899 542901 543578 545179 545771 545637 545545 545347 544674
## X159156 X159521 X.134 X.1823 X.185 X.70 X518 X134 X53 X422 X365 X240
## 1 13446 13533 12 -200 85 87 218 104 77 158 87 -9
## 2 19257 19197 -62 -598 -143 -71 -185 -182 -47 -240 -60 -22
## 3 126453 126791 -84 -1025 -127 -86 485 212 23 504 338 271
## 4 699082 698553 999 2244 1258 647 272 21 -1148 -173 -529 810
## 5 155348 155991 322 643 666 781 364 219 -475 767 643 193
## 6 543734 542562 677 1601 592 -134 -92 -198 -673 -940 -1172 617
## X829 X760 X773 X808 X915 X946 X922 X978 X582 X2376 X2337 X2244 X2381
## 1 -14 -6 -26 -7 -3 19 -3 28 24 145 130 116 147
## 2 -18 -53 -26 -5 10 -6 3 15 41 200 189 173 170
## 3 861 819 825 820 908 933 922 935 517 2031 2018 1955 2064
## 4 2286 2070 1772 1651 1451 1677 2102 2021 2252 8826 8527 8352 8320
## 5 659 473 457 479 343 377 515 557 442 1732 1603 1609 1651
## 6 1627 1597 1315 1172 1108 1300 1587 1464 1810 7094 6924 6743 6669
## X2457 X2454 X2429 X2476 X342 X1547 X1577 X1471 X1573 X1542 X1508 X1507
## 1 129 159 156 164 33 159 136 142 154 132 140 159
## 2 211 197 177 185 63 218 242 199 175 201 203 174
## 3 2117 2098 2096 2127 246 1170 1199 1130 1244 1209 1165 1174
## 4 8272 8124 8518 8548 1442 6540 6457 6580 6669 6821 6447 6416
## 5 1659 1602 1683 1734 249 1073 1130 1152 1172 1316 1225 1168
## 6 6613 6522 6835 6814 1193 5467 5327 5428 5497 5505 5222 5248
## X1498 X.373 X.2549 X.800 X.631 X.160 X.645 X.820 X.503 X.635 X89 X116
## 1 136 22 -177 106 120 242 118 65 156 52 1 5
## 2 170 -38 -562 -66 -20 -166 -182 -30 -216 -29 1 6
## 3 1192 -357 -1810 -840 -731 -236 -581 -855 -443 -658 87 105
## 4 6527 127 -403 -1043 -660 -1399 -1229 -2752 -2218 -2594 223 709
## 5 1177 115 138 337 524 -18 18 -783 299 136 33 119
## 6 5350 12 -541 -1380 -1184 -1381 -1247 -1969 -2517 -2730 190 590
## X.110 X.617 X441 X86 X263 X45 X142 X.462 X.2665 X.690 X.14 X.601 X.731
## 1 4 -2 6 3 5 3 5 21 -182 102 122 236 115
## 2 4 -2 7 4 6 2 4 -39 -568 -70 -18 -173 -186
## 3 -118 -613 428 79 252 40 133 -444 -1915 -722 -118 -664 -660
## 4 653 539 521 533 548 483 485 -96 -1112 -1696 -1199 -1920 -1762
## 5 110 89 93 94 97 85 85 82 19 227 435 -111 -76
## 6 543 450 428 439 451 398 400 -178 -1131 -1923 -1634 -1809 -1686
## X.1083 X.548 X.777 X.1 X.103 X.145 X.212 X.130 X.136 X.73 X3 X22
## 1 60 153 47 -1 -9 -15 -7 -17 -11 -7 5 7
## 2 -36 -218 -33 -2 -18 -24 -25 -14 -10 -11 -27 -46
## 3 -1107 -483 -791 2 -76 -106 -180 -99 -115 -55 25 61
## 4 -3300 -2701 -3079 62 361 231 -465 20 -201 -73 -57 44
## 5 -880 214 51 14 -154 -144 -200 -97 -142 -69 -47 -50
## 6 -2420 -2915 -3130 48 515 375 -265 117 -59 -4 -10 94
tail(df)
## X10180 X X.2 Abilene..TX
## 2754 49540 NA NA Yazoo City, MS
## 2755 49540 NA 28163 Yazoo County, MS
## 2756 49780 NA NA Zanesville, OH
## 2757 49780 NA 39119 Muskingum County, OH
## 2758 NA NA
## 2759 Release Date: March 19 2009 NA
## Metropolitan.Statistical.Area X160245 X160241 X160107 X158284 X158099
## 2754 Micropolitan Statistical Area 28149 28149 28129 27767 27672
## 2755 County or equivalent 28149 28149 28129 27767 27672
## 2756 Micropolitan Statistical Area 84585 84585 84708 84695 85074
## 2757 County or equivalent 84585 84585 84708 84695 85074
## 2758 NA NA NA NA NA
## 2759 NA NA NA NA NA
## X158029 X158547 X158681 X158734 X159156 X159521 X.134 X.1823 X.185
## 2754 27795 27775 27821 28753 28693 28464 -20 -362 -95
## 2755 27795 27775 27821 28753 28693 28464 -20 -362 -95
## 2756 85092 85155 85138 85378 85230 85087 123 -13 379
## 2757 85092 85155 85138 85378 85230 85087 123 -13 379
## 2758 NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA
## X.70 X518 X134 X53 X422 X365 X240 X829 X760 X773 X808 X915 X946 X922
## 2754 123 -20 46 932 -60 -229 48 101 135 149 107 190 149 234
## 2755 123 -20 46 932 -60 -229 48 101 135 149 107 190 149 234
## 2756 18 63 -17 240 -148 -143 108 189 229 225 178 203 187 253
## 2757 18 63 -17 240 -148 -143 108 189 229 225 178 203 187 253
## 2758 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## X978 X582 X2376 X2337 X2244 X2381 X2457 X2454 X2429 X2476 X342 X1547
## 2754 213 117 430 417 432 405 447 445 489 460 69 329
## 2755 213 117 430 417 432 405 447 445 489 460 69 329
## 2756 255 305 1031 1075 1081 1089 1064 1062 1138 1122 197 842
## 2757 255 305 1031 1075 1081 1089 1064 1062 1138 1122 197 842
## 2758 NA NA NA NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA NA NA NA
## X1577 X1471 X1573 X1542 X1508 X1507 X1498 X.373 X.2549 X.800 X.631
## 2754 282 283 298 257 296 255 247 -67 -446 -208 14
## 2755 282 283 298 257 296 255 247 -67 -446 -208 14
## 2756 846 856 911 861 875 885 867 12 -121 234 -90
## 2757 846 856 911 861 875 885 867 12 -121 234 -90
## 2758 NA NA NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA NA NA
## X.160 X.645 X.820 X.503 X.635 X89 X116 X.110 X.617 X441 X86 X263 X45
## 2754 -107 -119 800 -304 -458 1 12 11 8 10 10 10 7
## 2755 -107 -119 800 -304 -458 1 12 11 8 10 10 10 7
## 2756 -57 -147 95 -412 -438 2 14 11 2 14 8 12 8
## 2757 -57 -147 95 -412 -438 2 14 11 2 14 8 12 8
## 2758 NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA NA NA NA NA
## X142 X.462 X.2665 X.690 X.14 X.601 X.731 X.1083 X.548 X.777 X.1 X.103
## 2754 7 -68 -458 -219 6 -117 -129 790 -311 -465 -1 -17
## 2755 7 -68 -458 -219 6 -117 -129 790 -311 -465 -1 -17
## 2756 8 10 -135 223 -92 -71 -155 83 -420 -446 3 -81
## 2757 8 10 -135 223 -92 -71 -155 83 -420 -446 3 -81
## 2758 NA NA NA NA NA NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA NA NA NA NA NA
## X.145 X.212 X.130 X.136 X.73 X3 X22
## 2754 -22 -40 -20 -25 -17 10 16
## 2755 -22 -40 -20 -25 -17 10 16
## 2756 -84 -117 -58 -73 -42 11 40
## 2757 -84 -117 -58 -73 -42 11 40
## 2758 NA NA NA NA NA NA NA
## 2759 NA NA NA NA NA NA NA
df <- df[-((nrow(df) - 1):nrow(df)), ]
# read the names from raw cbsa and name the df as the same i added [[1]]
# because it just got me the concatenated string with slashes
names(df) <- tolower(strsplit(readLines("data/census-population/raw-cbsa.csv",
1), ",")[[1]])
# Remove non-yearly variables
df$census2000pop <- NULL
df$estimatesbase2000 <- NULL
# Melt data and separate variable name from year
head(df)
## cbsa mdiv stcou name lsad
## 1 10180 NA 48059 Callahan County, TX County or equivalent
## 2 10180 NA 48253 Jones County, TX County or equivalent
## 3 10180 NA 48441 Taylor County, TX County or equivalent
## 4 10420 NA NA Akron, OH Metropolitan Statistical Area
## 5 10420 NA 39133 Portage County, OH County or equivalent
## 6 10420 NA 39153 Summit County, OH County or equivalent
## popestimate2000 popestimate2001 popestimate2002 popestimate2003
## 1 12917 12717 12802 12889
## 2 20723 20125 19982 19911
## 3 126467 125442 125315 125229
## 4 695961 698205 699463 700110
## 5 152383 153026 153692 154473
## 6 543578 545179 545771 545637
## popestimate2004 popestimate2005 popestimate2006 popestimate2007
## 1 13107 13211 13288 13446
## 2 19726 19544 19497 19257
## 3 125714 125926 125949 126453
## 4 700382 700403 699255 699082
## 5 154837 155056 154581 155348
## 6 545545 545347 544674 543734
## popestimate2008 npopchg_2000 npopchg_2001 npopchg_2002 npopchg_2003
## 1 13533 12 -200 85 87
## 2 19197 -62 -598 -143 -71
## 3 126791 -84 -1025 -127 -86
## 4 698553 999 2244 1258 647
## 5 155991 322 643 666 781
## 6 542562 677 1601 592 -134
## npopchg_2004 npopchg_2005 npopchg_2006 npopchg_2007 npopchg_2008
## 1 218 104 77 158 87
## 2 -185 -182 -47 -240 -60
## 3 485 212 23 504 338
## 4 272 21 -1148 -173 -529
## 5 364 219 -475 767 643
## 6 -92 -198 -673 -940 -1172
## naturalinc2000 naturalinc2001 naturalinc2002 naturalinc2003
## 1 -9 -14 -6 -26
## 2 -22 -18 -53 -26
## 3 271 861 819 825
## 4 810 2286 2070 1772
## 5 193 659 473 457
## 6 617 1627 1597 1315
## naturalinc2004 naturalinc2005 naturalinc2006 naturalinc2007
## 1 -7 -3 19 -3
## 2 -5 10 -6 3
## 3 820 908 933 922
## 4 1651 1451 1677 2102
## 5 479 343 377 515
## 6 1172 1108 1300 1587
## naturalinc2008 births2000 births2001 births2002 births2003 births2004
## 1 28 24 145 130 116 147
## 2 15 41 200 189 173 170
## 3 935 517 2031 2018 1955 2064
## 4 2021 2252 8826 8527 8352 8320
## 5 557 442 1732 1603 1609 1651
## 6 1464 1810 7094 6924 6743 6669
## births2005 births2006 births2007 births2008 deaths2000 deaths2001
## 1 129 159 156 164 33 159
## 2 211 197 177 185 63 218
## 3 2117 2098 2096 2127 246 1170
## 4 8272 8124 8518 8548 1442 6540
## 5 1659 1602 1683 1734 249 1073
## 6 6613 6522 6835 6814 1193 5467
## deaths2002 deaths2003 deaths2004 deaths2005 deaths2006 deaths2007
## 1 136 142 154 132 140 159
## 2 242 199 175 201 203 174
## 3 1199 1130 1244 1209 1165 1174
## 4 6457 6580 6669 6821 6447 6416
## 5 1130 1152 1172 1316 1225 1168
## 6 5327 5428 5497 5505 5222 5248
## deaths2008 netmig2000 netmig2001 netmig2002 netmig2003 netmig2004
## 1 136 22 -177 106 120 242
## 2 170 -38 -562 -66 -20 -166
## 3 1192 -357 -1810 -840 -731 -236
## 4 6527 127 -403 -1043 -660 -1399
## 5 1177 115 138 337 524 -18
## 6 5350 12 -541 -1380 -1184 -1381
## netmig2005 netmig2006 netmig2007 netmig2008 internationalmig2000
## 1 118 65 156 52 1
## 2 -182 -30 -216 -29 1
## 3 -581 -855 -443 -658 87
## 4 -1229 -2752 -2218 -2594 223
## 5 18 -783 299 136 33
## 6 -1247 -1969 -2517 -2730 190
## internationalmig2001 internationalmig2002 internationalmig2003
## 1 5 4 -2
## 2 6 4 -2
## 3 105 -118 -613
## 4 709 653 539
## 5 119 110 89
## 6 590 543 450
## internationalmig2004 internationalmig2005 internationalmig2006
## 1 6 3 5
## 2 7 4 6
## 3 428 79 252
## 4 521 533 548
## 5 93 94 97
## 6 428 439 451
## internationalmig2007 internationalmig2008 domesticmig2000
## 1 3 5 21
## 2 2 4 -39
## 3 40 133 -444
## 4 483 485 -96
## 5 85 85 82
## 6 398 400 -178
## domesticmig2001 domesticmig2002 domesticmig2003 domesticmig2004
## 1 -182 102 122 236
## 2 -568 -70 -18 -173
## 3 -1915 -722 -118 -664
## 4 -1112 -1696 -1199 -1920
## 5 19 227 435 -111
## 6 -1131 -1923 -1634 -1809
## domesticmig2005 domesticmig2006 domesticmig2007 domesticmig2008
## 1 115 60 153 47
## 2 -186 -36 -218 -33
## 3 -660 -1107 -483 -791
## 4 -1762 -3300 -2701 -3079
## 5 -76 -880 214 51
## 6 -1686 -2420 -2915 -3130
## residual2000 residual2001 residual2002 residual2003 residual2004
## 1 -1 -9 -15 -7 -17
## 2 -2 -18 -24 -25 -14
## 3 2 -76 -106 -180 -99
## 4 62 361 231 -465 20
## 5 14 -154 -144 -200 -97
## 6 48 515 375 -265 117
## residual2005 residual2006 residual2007 residual2008
## 1 -11 -7 5 7
## 2 -10 -11 -27 -46
## 3 -115 -55 25 61
## 4 -201 -73 -57 44
## 5 -142 -69 -47 -50
## 6 -59 -4 -10 94
library(reshape2)
dfm <- melt(df, id = 1:5)
nc <- nchar(as.character(dfm$variable))
dfm$year <- as.numeric(substr(dfm$variable, nc - 3, nc))
dfm$variable <- substr(dfm$variable, 1, nc - 4)
# Remove 2001 because numbers are only for April-July
dfm <- subset(dfm, year != 2000)
# we only need the msas
head(dfm)
## cbsa mdiv stcou name lsad
## 2758 10180 NA 48059 Callahan County, TX County or equivalent
## 2759 10180 NA 48253 Jones County, TX County or equivalent
## 2760 10180 NA 48441 Taylor County, TX County or equivalent
## 2761 10420 NA NA Akron, OH Metropolitan Statistical Area
## 2762 10420 NA 39133 Portage County, OH County or equivalent
## 2763 10420 NA 39153 Summit County, OH County or equivalent
## variable value year
## 2758 popestimate 12717 2001
## 2759 popestimate 20125 2001
## 2760 popestimate 125442 2001
## 2761 popestimate 698205 2001
## 2762 popestimate 153026 2001
## 2763 popestimate 545179 2001
msa <- subset(dfm, lsad == "Metropolitan Statistical Area")
msa$lsad <- NULL
msa$stcou <- NULL
msa$mdiv <- NULL
head(msa)
## cbsa name variable value year
## 2761 10420 Akron, OH popestimate 698205 2001
## 2764 10500 Albany, GA popestimate 159814 2001
## 2770 10580 Albany-Schenectady-Troy, NY popestimate 829321 2001
## 2776 10740 Albuquerque, NM popestimate 739327 2001
## 2781 10780 Alexandria, LA popestimate 144915 2001
## 2784 10900 Allentown-Bethlehem-Easton, PA-NJ popestimate 748523 2001
msa2 <- dcast(msa, name + year ~ variable)
head(msa2)
## name year births deaths domesticmig internationalmig naturalinc
## 1 Akron, OH 2001 8826 6540 -1112 709 2286
## 2 Akron, OH 2002 8527 6457 -1696 653 2070
## 3 Akron, OH 2003 8352 6580 -1199 539 1772
## 4 Akron, OH 2004 8320 6669 -1920 521 1651
## 5 Akron, OH 2005 8272 6821 -1762 533 1451
## 6 Akron, OH 2006 8124 6447 -3300 548 1677
## netmig npopchg_ popestimate residual
## 1 -403 2244 698205 361
## 2 -1043 1258 699463 231
## 3 -660 647 700110 -465
## 4 -1399 272 700382 20
## 5 -1229 21 700403 -201
## 6 -2752 -1148 699255 -73
msa_codes <- read.csv("data/msa-changes/msa-codes.csv")
# remove the column name and add a column city with the values in name but
# after removing the comma which can be done by gsub
msa2$city <- gsub(",| /1", "", msa2$name)
msa2$name <- NULL
head(msa2)
## year births deaths domesticmig internationalmig naturalinc netmig
## 1 2001 8826 6540 -1112 709 2286 -403
## 2 2002 8527 6457 -1696 653 2070 -1043
## 3 2003 8352 6580 -1199 539 1772 -660
## 4 2004 8320 6669 -1920 521 1651 -1399
## 5 2005 8272 6821 -1762 533 1451 -1229
## 6 2006 8124 6447 -3300 548 1677 -2752
## npopchg_ popestimate residual city
## 1 2244 698205 361 Akron OH
## 2 1258 699463 231 Akron OH
## 3 647 700110 -465 Akron OH
## 4 272 700382 20 Akron OH
## 5 21 700403 -201 Akron OH
## 6 -1148 699255 -73 Akron OH
msa3 <- merge(msa2, msa_codes, by = "city", all.x = TRUE)
head(msa3)
## city year births deaths domesticmig internationalmig naturalinc
## 1 Akron OH 2001 8826 6540 -1112 709 2286
## 2 Akron OH 2002 8527 6457 -1696 653 2070
## 3 Akron OH 2003 8352 6580 -1199 539 1772
## 4 Akron OH 2004 8320 6669 -1920 521 1651
## 5 Akron OH 2005 8272 6821 -1762 533 1451
## 6 Akron OH 2006 8124 6447 -3300 548 1677
## netmig npopchg_ popestimate residual msa_code
## 1 -403 2244 698205 361 10420
## 2 -1043 1258 699463 231 10420
## 3 -660 647 700110 -465 10420
## 4 -1399 272 700382 20 10420
## 5 -1229 21 700403 -201 10420
## 6 -2752 -1148 699255 -73 10420
write.table(msa3, "population-msa.csv", sep = ",", row = FALSE)