data = read.csv(file.choose())
getwd()
## [1] "C:/Users/karol/Desktop/census-master"
setwd("C:/Users/karol/Desktop/census-master")
data = read.csv("oecd.csv")
data
colnames(data)
## [1] "REG_ID" "Region" "Country" "totpop05"
## [5] "totpop20" "pop65plus05" "pop65plus20" "workagepop05"
## [9] "workagepop20" "areasqkm" "popsqkm05" "popsqkm20"
## [13] "pctba20" "pm2pt5_05" "pm2pt5_19" "co2tot08"
## [17] "co2cap08" "broadband19" "inc05" "inc19"
## [21] "unemp05" "unemp20" "covidvax2021q2"
nrow(data)
## [1] 453
head(data)
## REG_ID Region Country totpop05 totpop20 pop65plus05 pop65plus20
## 1 US39 Ohio US 11463300 11790600 1527300 2097640
## 2 US42 Pennsylvania US 12450000 12989600 1887080 2447690
## 3 IL05 Tel Aviv District IL 1183300 1452400 175700 NA
## 4 ITF6 Calabria IT 1989500 1894110 358068 419874
## 5 KR04 Jeolla Region KR 5113250 5044160 666751 980374
## 6 KR07 Jeju KR 541833 670207 54792 101153
## workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20 pm2pt5_05
## 1 7623540 7472740 106056 108.09 111.17 NA 12.64
## 2 8242130 8175650 116075 107.26 111.91 NA 13.20
## 3 759100 NA 172 6879.65 8444.19 NA 25.32
## 4 1321450 1226510 15179 131.07 124.78 0.16000 14.27
## 5 3440470 3446440 20539 248.95 245.59 0.51786 23.38
## 6 367675 472575 1846 293.52 363.06 0.57684 19.66
## pm2pt5_19 co2tot08 co2cap08 broadband19 inc05 inc19 unemp05 unemp20
## 1 8.70 199757.00 17.3 0.854 34335 42164 0.060 0.085
## 2 8.56 252782.00 20.0 0.856 38409 48361 0.049 0.089
## 3 20.98 3751.66 3.0 NA 13021 NA 0.073 NA
## 4 11.22 8842.47 4.5 0.770 16228 15236 0.145 0.207
## 5 27.58 60238.20 11.9 NA 13642 19438 0.032 0.029
## 6 22.67 1351.04 2.5 NA 13533 18724 0.027 0.025
## covidvax2021q2
## 1 0.6420
## 2 0.6810
## 3 NA
## 4 0.6359
## 5 0.6635
## 6 0.6326
summary(data)
## REG_ID Region Country totpop05
## Length:453 Length:453 Length:453 Min. : 25461
## Class :character Class :character Class :character 1st Qu.: 655329
## Mode :character Mode :character Mode :character Median : 1394730
## Mean : 2907246
## 3rd Qu.: 3259100
## Max. :40442800
##
## totpop20 pop65plus05 pop65plus20 workagepop05
## Min. : 29884 Min. : 791 Min. : 1611 Min. : 11943
## 1st Qu.: 715115 1st Qu.: 66751 1st Qu.: 116458 1st Qu.: 395865
## Median : 1518000 Median : 172288 Median : 262396 Median : 893579
## Mean : 3186488 Mean : 388793 Mean : 556600 Mean : 1963877
## 3rd Qu.: 3534170 3rd Qu.: 462390 3rd Qu.: 599662 3rd Qu.: 2238690
## Max. :46289300 Max. :5993140 Max. :9274000 Max. :27632900
## NA's :27 NA's :58 NA's :27
## workagepop20 areasqkm popsqkm05 popsqkm20
## Min. : 18156 Min. : 14 Min. : 0.02 Min. : 0.02
## 1st Qu.: 438709 1st Qu.: 10140 1st Qu.: 24.91 1st Qu.: 26.82
## Median : 996186 Median : 23047 Median : 71.89 Median : 76.44
## Mean : 1950689 Mean : 95999 Mean : 262.73 Mean : 299.87
## 3rd Qu.: 2236910 3rd Qu.: 58303 3rd Qu.: 167.03 3rd Qu.: 186.52
## Max. :26107600 Max. :2532420 Max. :6879.65 Max. :8444.19
## NA's :58
## pctba20 pm2pt5_05 pm2pt5_19 co2tot08
## Min. :0.0000 Min. : 4.15 Min. : 3.70 Min. : 3.9
## 1st Qu.:0.2175 1st Qu.:10.53 1st Qu.: 7.78 1st Qu.: 4905.5
## Median :0.2893 Median :14.51 Median :11.96 Median : 13076.3
## Mean :0.3040 Mean :15.94 Mean :13.72 Mean : 35476.3
## 3rd Qu.:0.3822 3rd Qu.:21.18 3rd Qu.:18.05 3rd Qu.: 40117.6
## Max. :0.6280 Max. :38.65 Max. :41.93 Max. :511789.0
## NA's :162 NA's :60 NA's :60 NA's :96
## co2cap08 broadband19 inc05 inc19
## Min. : 0.10 Min. :0.5517 Min. : 4715 Min. : 7445
## 1st Qu.: 4.80 1st Qu.:0.8280 1st Qu.:15008 1st Qu.:17100
## Median : 7.40 Median :0.8680 Median :18768 Median :21840
## Mean : 12.62 Mean :0.8611 Mean :20662 Mean :25107
## 3rd Qu.: 13.20 3rd Qu.:0.9100 3rd Qu.:25437 3rd Qu.:27170
## Max. :340.70 Max. :1.0000 Max. :54104 Max. :67034
## NA's :96 NA's :198 NA's :145 NA's :186
## unemp05 unemp20 covidvax2021q2
## Min. :0.01800 Min. :0.01900 Min. :0.0510
## 1st Qu.:0.04900 1st Qu.:0.04400 1st Qu.:0.5581
## Median :0.06800 Median :0.07000 Median :0.6464
## Mean :0.08266 Mean :0.08071 Mean :0.6090
## 3rd Qu.:0.10375 3rd Qu.:0.09900 3rd Qu.:0.7209
## Max. :0.29900 Max. :0.33600 Max. :0.8466
## NA's :139 NA's :121 NA's :237
View(data)
pop = data$totpop20
pop
## [1] 11790600 12989600 1452400 1894110 5044160 670207 1539280 3055150
## [9] 2180980 44712 728157 543000 644100 1246400 757634 1081650
## [17] 783204 1118580 1442660 484547 12785200 3188670 3114070 1096230
## [25] 887099 6920120 3281680 1330600 870165 1512670 1399040 1214570
## [33] 14245000 626108 1886580 5639080 631181 11423000 1018450 1340000
## [41] 393893 173811 1238000 251521 229516 715115 254500 1641060
## [49] 294436 1490280 521364 7993610 3479530 690093 1961460 29217700
## [57] 642495 532644 1206220 2050980 2316140 3575340 2863270 1385140
## [65] 9597000 1515110 1654750 3961950 9616620 965718 1427030 460083
## [73] 3738900 636504 1872100 561293 13124700 1608190 2690180 17947200
## [81] 1060760 994549 2864810 1847770 7022220 10067700 9279740 778962
## [89] 7718790 577267 1293940 4879130 5755700 5225000 438406 315931
## [97] 8125070 405835 1324280 1518000 1701800 1846020 1378650 36914000
## [105] 20542000 7652350 351491 4367250 1590250 4064050 21292700 410521
## [113] 2280910 435195 421912 1361470 1027560 3184220 995849 2322000
## [121] 194300 1716900 5536000 514564 2442690 2084450 1684290 6639010
## [129] 1755740 2194780 2903770 233034 1162410 2085950 2879530 732441
## [137] 991886 1789800 1053400 2401310 382773 991063 1014340 1328980
## [145] 5136000 5685060 1061980 521829 3227410 861773 3534170 2318820
## [153] 426806 1497440 668213 1357080 589110 264670 1100010 1910410
## [161] 244600 102800 288086 540780 397139 2467570 1118370 3829970
## [169] 1907680 6288080 1231090 585866 387285 5707170 2956870 20154900
## [177] 5158730 125034 5712140 553254 545425 6747070 39155 1278240
## [185] 2377080 1727400 818962 15519300 1831150 4207710 46289300 1063450
## [193] 1627590 555401 79020 271904 347512 2547430 32800 2565730
## [201] 3313430 1149390 8167530 431380 2307330 649957 1319230 4503960
## [209] 10457200 3962030 4311220 3953310 1611620 8612000 242796 582388
## [217] 14924000 5029340 857762 1171160 894470 3281480 7113540 7743960
## [225] 1295390 544764 1122620 961055 50636 302831 430736 370974
## [233] 203149 651065 1024120 51100 125200 856858 3358520 5951850
## [241] 5077580 820511 511551 5176190 1770380 246143 1223360 11100400
## [249] 2174670 493682 2562960 1117200 465136 3371960 1451910 4651200
## [257] 6172680 4241540 5130730 1380650 2233000 1524830 4875290 3692560
## [265] 2217290 1330330 2045550 836096 891440 252110 1200540 1148790
## [273] 29884 8478080 1537430 1183810 3351540 14930600 7114600 7252500
## [281] 556002 2377100 924870 1406630 499300 6018420 1373800 2319040
## [289] 558410 14745700 681202 131100 1001410 10725800 45372 1159900
## [297] 669592 1336790 2189140 314709 1960170 1326340 7833380 870200
## [305] 376157 2485650 4039280 3526220 2722130 1828950 3243000 1620320
## [313] 4532150 63692 112958 1180640 10628500 572151 2901380 3818420
## [321] 345867 877832 2663560 1911190 1042760 8578300 1847250 1608140
## [329] 4093900 4071970 423021 4475460 3600260 21569900 8632040 1179300
## [337] 300516 1627700 2702590 1018900 1663700 1233980 837359 3697000
## [345] 84473 2236990 1796460 17366200 11516800 2809390 1491940 192740
## [353] 2047950 598613 162700 2794520 1218740 6696670 3660070 981889
## [361] 3321760 2838320 5024800 3012230 5784310 6785640 6154480 1086190
## [369] 1377850 5892320 42174 704558 1823790 691854 1131940 1115630
## [377] 7254000 84085 2059730 4078370 8690750 9187100 949252 294206
## [385] 86657 2016770 1310790 333265 2314830 773450 338400 179700
## [393] 359821 5987800 1750220 2829950 3669490 2521890 986887 383488
## [401] 1671610 39499700 1362280 2117570 4420030 1469400 4464120 656509
## [409] 107297 1223110 589936 1689730 25957900 5074710 1504870 1770780
## [417] 3131320 3119860 4163020 6677930 1242730 359127 369992 1129850
## [425] 365317 975182 1771480 2094260 412682 12291600 8064150 1297100
## [433] 2445550 1973580 161329 793437 2133380 874573 3744300 7177990
## [441] 2935880 10027600 942233 254254 178362 1210730 3162510 1453710
## [449] 760267 899648 278926 2088290 3082400
length(pop)
## [1] 453
mean(pop)
## [1] 3186488
median(pop)
## [1] 1518000
min(pop)
## [1] 29884
max(pop)
## [1] 46289300
summary(pop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29884 715115 1518000 3186488 3534170 46289300
hi_unemp = subset(data, unemp20 >= 0.1)
nrow(hi_unemp)
## [1] 80
nrow(hi_unemp)/nrow(data)
## [1] 0.1766004
hist(data$unemp20, breaks = 20, xlim = c(0,max(data$unemp20, na.rm=T)), xlab="2020 Unemployment Rate",
ylab = "Number of Regions", col = "blue", border = "red", main = "Unemployment in World Regions")
(2) DESCRIBING DATA IN R
ls() # list current objects (data frames, vectors, model outputs, etc.)
## [1] "data" "hi_unemp" "pop"
rm(hi_unemp) # remove an object - the matrix unemp, in this example
rm(list=ls()) # remove all objects from memory
options(scipen=999) # avoids printing things in scientific notation
options(stringsAsFactors=FALSE) # avoids reading in factors when reading in files
setwd("C:/Users/karol/Desktop/census-master")
data = read.csv("oecd.csv")
data$pctsenior20 = data$pop65plus20/data$totpop20
summary(data$pctsenior20)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02662 0.14497 0.18364 0.17604 0.21266 0.33487 58
View(data) # clicking on column headers sorts the data temporarily
data2 = data[order(data$pctsrincr),]
head(data2)
## REG_ID Region Country totpop05 totpop20 pop65plus05
## 247 BE1 Brussels Capital Region BE 1006750 1223360 155511
## 45 EL41 North Aegean EL 199177 229516 45128
## 10 CO97 Vaup\xe9s CO 25461 44712 1022
## 327 DE6 Hamburg DE 1734830 1847250 310915
## 267 ES42 Castile-La Mancha ES 1874020 2045550 350650
## 345 ES64 Melilla ES 64910 84473 6951
## pop65plus20 workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20
## 247 159659 666848 825104 162 6214.50 7551.63 0.493
## 45 47245 123781 143588 3801 52.40 60.38 0.293
## 10 1795 11943 23166 54135 0.47 0.83 NA
## 327 336359 1196580 1246260 710 2443.42 2601.76 0.360
## 267 390189 1233200 1349730 79109 23.69 25.86 0.308
## 345 9364 43182 55234 14 4636.43 6033.79 0.343
## pm2pt5_05 pm2pt5_19 co2tot08 co2cap08 broadband19 inc05 inc19 unemp05
## 247 18.74 14.93 5055.730 4.8 0.87 21121 21773 0.164
## 45 10.11 7.73 407.930 2.0 NA 16327 13363 0.104
## 10 23.39 22.83 NA NA NA NA NA NA
## 327 15.31 12.17 14313.600 8.1 0.96 27826 28429 0.106
## 267 12.32 9.76 23508.700 11.5 0.87 17169 17010 0.092
## 345 28.56 28.84 3.888 0.1 0.91 19623 15981 0.120
## unemp20 covidvax2021q2 pctsenior20 pctsenior05 pctsrincr
## 247 0.125 0.4987 0.13050860 0.15446834 -0.023959739446
## 45 0.169 NA 0.20584622 0.22657235 -0.020726129686
## 10 NA NA 0.04014582 0.04013982 0.000006000463
## 327 0.049 0.6990 0.18208634 0.17921929 0.002867055068
## 267 0.178 NA 0.19075016 0.18711113 0.003639034908
## 345 0.244 0.5857 0.11085199 0.10708674 0.003765252729
data2 = data[order(-data$pctsrincr),] # this repeats the above, but the negative sign makes it in descending order
data2[1:10,] # data frames can be sliced using brackets. The format is row, column. This isolates rows 1 through 10, and leaves the columns untouched
## REG_ID Region Country totpop05 totpop20 pop65plus05
## 94 JPA Hokkaido JP 5628000 5225000 1205690
## 65 JPC Northern-Kanto, Koshin JP 10097000 9597000 2099640
## 105 JPG Kansai region JP 20893000 20542000 4055550
## 344 JPI Shikoku JP 4086000 3697000 991186
## 393 FRY2 Martinique FR 395982 359821 52303
## 51 CA10 Newfoundland and Labrador CA 514332 521364 67928
## 214 JPB Tohoku JP 9635000 8612000 2230000
## 145 JPE Hokuriku JP 5539000 5136000 1270300
## 217 JPF Toukai JP 15021000 14924000 2870530
## 429 FRY1 Guadeloupe FR 441511 412682 52755
## pop65plus20 workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20
## 94 1679000 3696060 2989000 83456 67.44 62.61 NA
## 65 2904000 6543740 5566000 35355 285.59 271.45 NA
## 105 5893000 13834200 12197000 26232 796.47 783.09 NA
## 344 1238000 2544500 2037000 18648 219.11 198.25 NA
## 393 80327 259203 223328 1102 359.33 326.52 NA
## 51 116181 366005 335313 370514 1.39 1.41 NA
## 214 2772000 6060250 4885000 66504 144.88 129.50 NA
## 145 1629000 3492910 2907000 32723 169.27 156.95 NA
## 217 4133000 9927250 8916000 22338 672.44 668.10 NA
## 429 84319 284856 255777 1680 262.80 245.64 NA
## pm2pt5_05 pm2pt5_19 co2tot08 co2cap08 broadband19 inc05 inc19 unemp05
## 94 8.79 10.42 50114.30 9.1 0.551745 18195 NA 0.053
## 65 11.59 12.67 99805.00 10.0 0.690894 20292 NA 0.038
## 105 12.62 13.99 171129.00 8.2 0.731207 21315 NA 0.054
## 344 11.60 13.54 46079.20 11.5 0.590374 18390 NA 0.048
## 393 9.49 10.59 NA NA 0.760000 16669 21302 0.185
## 51 5.97 4.47 8015.61 15.7 NA 15317 NA 0.152
## 214 9.64 10.79 103584.00 11.0 0.618307 17846 NA 0.053
## 145 11.05 12.11 52947.70 9.7 0.703075 21816 NA 0.036
## 217 11.86 13.10 140793.00 9.3 0.709757 22311 NA 0.033
## 429 8.37 9.32 NA NA 0.720000 16585 18979 0.255
## unemp20 covidvax2021q2 pctsenior20 pctsenior05 pctsrincr
## 94 0.030 NA 0.3213397 0.2142306 0.10710908
## 65 0.027 NA 0.3025946 0.2079469 0.09464765
## 105 0.032 NA 0.2868757 0.1941105 0.09276520
## 344 0.031 NA 0.3348661 0.2425810 0.09228510
## 393 0.128 0.3032 0.2232416 0.1320843 0.09115727
## 51 0.139 0.7948 0.2228405 0.1320703 0.09077014
## 214 0.031 NA 0.3218765 0.2314478 0.09042861
## 145 0.025 NA 0.3171729 0.2293374 0.08783547
## 217 0.025 NA 0.2769365 0.1911011 0.08583535
## 429 0.178 0.2942 0.2043195 0.1194874 0.08483215
data2[1:10,1:5] # this isolates rows 1-10 and columns 1-5
## REG_ID Region Country totpop05 totpop20
## 94 JPA Hokkaido JP 5628000 5225000
## 65 JPC Northern-Kanto, Koshin JP 10097000 9597000
## 105 JPG Kansai region JP 20893000 20542000
## 344 JPI Shikoku JP 4086000 3697000
## 393 FRY2 Martinique FR 395982 359821
## 51 CA10 Newfoundland and Labrador CA 514332 521364
## 214 JPB Tohoku JP 9635000 8612000
## 145 JPE Hokuriku JP 5539000 5136000
## 217 JPF Toukai JP 15021000 14924000
## 429 FRY1 Guadeloupe FR 441511 412682
srrank = seq(1:length(data2$pctsrincr)) # seq makes a vector from 1 until a specified value
srrank
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## [235] 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## [253] 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## [271] 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## [289] 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## [307] 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## [325] 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## [343] 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## [361] 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## [379] 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## [397] 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## [415] 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## [433] 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## [451] 451 452 453
data2 = cbind(data2, srrank) # cbind stands for column bind. this pastes srrank to data2 and overwrites data2 with the result.
par(mfrow=c(1,2)) # par stands for graphical parameter. mfrow specifies the number of rows and columns in the plot window
hist(data$pctsenior05) # left plot
hist(data$pctsenior20) # right plot
par(mfrow=c(1,2)) # doing this again clears the plots
hist(data$pctsenior05, breaks=seq(0,0.35,0.05), main="2005", ylab="World Regions", xlab="Percent Senior", col="forestgreen")
hist(data$pctsenior20, breaks=seq(0,0.35,0.05), main="2020", ylab="World Regions", xlab="Percent Senior", col="dodgerblue")
par(mfrow=c(1,2))
hist(data$pctsenior05, breaks=seq(0,0.35,0.05), main="2005", ylab="World Regions", xlab="Percent Senior", col="forestgreen")
abline(v=mean(data$pctsenior05, na.rm=T), col="black", lty=3, lwd=2)
hist(data$pctsenior20, breaks=seq(0,0.35,0.05), main="2020", ylab="World Regions", xlab="Percent Senior", col="dodgerblue")
abline(v=mean(data$pctsenior20, na.rm=T), col="black", lty=3, lwd=2)
boxplot(data$pctsenior05, data$pctsenior20)
par(mfrow=c(1,1)) # reset graphics parameters to a single plot
plot(data$inc19, data$pm2pt5_19) # plot command will plot 2 vectors against each other
x = data$inc19 # assign this vector a shorter, more convenient name.
y = data$pm2pt5_19
model = lm(y ~ x) # runs a 2-variable regression and saves the output to an object I've called 'model'
summary(model) # regression output
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5695 -2.6575 -0.6021 1.4821 17.6324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.85882873 0.71838502 24.860 <0.0000000000000002 ***
## x -0.00023598 0.00002597 -9.087 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.928 on 265 degrees of freedom
## (186 observations deleted due to missingness)
## Multiple R-squared: 0.2376, Adjusted R-squared: 0.2347
## F-statistic: 82.57 on 1 and 265 DF, p-value: < 0.00000000000000022
outcorr = cor.test(data$inc19, data$pm2pt5_19) # sometimes it's convenient to assign the output of a test to a model so you can manipulate it later
outcorr
##
## Pearson's product-moment correlation
##
## data: data$inc19 and data$pm2pt5_19
## t = -9.0869, df = 265, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5738772 -0.3901950
## sample estimates:
## cor
## -0.4874099
outcorr$estimate
## cor
## -0.4874099
plot(x,y, xlab="2019 Personal Income ($USD)", ylab = "Average air pollution level (PM2.5)",
main = "Income and particulate emissions across world regions")
abline(model) # abline can also add model results to a plot
legend("topright", inset=0.045, legend=c("r =", round(outcorr$estimate,4)))
plot(x, y, xlab="2019 Personal Income ($USD)", ylab = "Average air pollution level (PM2.5)",
main = "Income and particulate emissions across world regions", pch=20, font.main=4,
ylim=c(0,40), xlim=c(5000,70000))
abline(model, col="blue", lwd=2)
rp = vector('expression', 3)
rp[1] = substitute(expression(italic(B)[1] == MYVALUE3), list(MYVALUE3=format(summary(model)$coefficients[2], dig=4)))[2]
rp[2] = substitute(expression(italic(p) == MYVALUE2), list(MYVALUE2=format(summary(model)$coefficients[2,4], dig=3)))[2]
rp[3] = substitute(expression(italic(r) == MYVALUE), list(MYVALUE=format(sqrt(summary(model)$r.squared), dig=4)))[2]
legend("topright", legend=rp, bty='n')
text(data$inc19, data$pm2pt5_19-1, labels=data$Country, cex=0.5)
summary(data)
## REG_ID Region Country totpop05
## Length:453 Length:453 Length:453 Min. : 25461
## Class :character Class :character Class :character 1st Qu.: 655329
## Mode :character Mode :character Mode :character Median : 1394730
## Mean : 2907246
## 3rd Qu.: 3259100
## Max. :40442800
##
## totpop20 pop65plus05 pop65plus20 workagepop05
## Min. : 29884 Min. : 791 Min. : 1611 Min. : 11943
## 1st Qu.: 715115 1st Qu.: 66751 1st Qu.: 116458 1st Qu.: 395865
## Median : 1518000 Median : 172288 Median : 262396 Median : 893579
## Mean : 3186488 Mean : 388793 Mean : 556600 Mean : 1963877
## 3rd Qu.: 3534170 3rd Qu.: 462390 3rd Qu.: 599662 3rd Qu.: 2238690
## Max. :46289300 Max. :5993140 Max. :9274000 Max. :27632900
## NA's :27 NA's :58 NA's :27
## workagepop20 areasqkm popsqkm05 popsqkm20
## Min. : 18156 Min. : 14 Min. : 0.02 Min. : 0.02
## 1st Qu.: 438709 1st Qu.: 10140 1st Qu.: 24.91 1st Qu.: 26.82
## Median : 996186 Median : 23047 Median : 71.89 Median : 76.44
## Mean : 1950689 Mean : 95999 Mean : 262.73 Mean : 299.87
## 3rd Qu.: 2236910 3rd Qu.: 58303 3rd Qu.: 167.03 3rd Qu.: 186.52
## Max. :26107600 Max. :2532420 Max. :6879.65 Max. :8444.19
## NA's :58
## pctba20 pm2pt5_05 pm2pt5_19 co2tot08
## Min. :0.0000 Min. : 4.15 Min. : 3.70 Min. : 3.9
## 1st Qu.:0.2175 1st Qu.:10.53 1st Qu.: 7.78 1st Qu.: 4905.5
## Median :0.2893 Median :14.51 Median :11.96 Median : 13076.3
## Mean :0.3040 Mean :15.94 Mean :13.72 Mean : 35476.3
## 3rd Qu.:0.3822 3rd Qu.:21.18 3rd Qu.:18.05 3rd Qu.: 40117.6
## Max. :0.6280 Max. :38.65 Max. :41.93 Max. :511789.0
## NA's :162 NA's :60 NA's :60 NA's :96
## co2cap08 broadband19 inc05 inc19
## Min. : 0.10 Min. :0.5517 Min. : 4715 Min. : 7445
## 1st Qu.: 4.80 1st Qu.:0.8280 1st Qu.:15008 1st Qu.:17100
## Median : 7.40 Median :0.8680 Median :18768 Median :21840
## Mean : 12.62 Mean :0.8611 Mean :20662 Mean :25107
## 3rd Qu.: 13.20 3rd Qu.:0.9100 3rd Qu.:25437 3rd Qu.:27170
## Max. :340.70 Max. :1.0000 Max. :54104 Max. :67034
## NA's :96 NA's :198 NA's :145 NA's :186
## unemp05 unemp20 covidvax2021q2 pctsenior20
## Min. :0.01800 Min. :0.01900 Min. :0.0510 Min. :0.02662
## 1st Qu.:0.04900 1st Qu.:0.04400 1st Qu.:0.5581 1st Qu.:0.14497
## Median :0.06800 Median :0.07000 Median :0.6464 Median :0.18364
## Mean :0.08266 Mean :0.08071 Mean :0.6090 Mean :0.17604
## 3rd Qu.:0.10375 3rd Qu.:0.09900 3rd Qu.:0.7209 3rd Qu.:0.21266
## Max. :0.29900 Max. :0.33600 Max. :0.8466 Max. :0.33487
## NA's :139 NA's :121 NA's :237 NA's :58
## pctsenior05 pctsrincr
## Min. :0.02163 Min. :-0.02396
## 1st Qu.:0.08195 1st Qu.: 0.03006
## Median :0.13533 Median : 0.04206
## Mean :0.12839 Mean : 0.04235
## 3rd Qu.:0.16581 3rd Qu.: 0.05277
## Max. :0.26441 Max. : 0.10711
## NA's :27 NA's :85
mean(data$broadband19)
## [1] NA
mean(data$broadband19, na.rm=T)
## [1] 0.8610923
data3 = data[!is.na(data$broadband19),] # isolate rows wherein broadband19 is NA, remove them, and save to 'data3'
nrow(data)
## [1] 453
nrow(data3) # notice how many fewer observations are now in data4
## [1] 255
data$pctsrincr = NULL # remove this column
data4 = data[c(1:4)] # extract just the first 4 columns
(3) WORKING WITH MULTIPLE DATA SOURCES
MERGE DATASETS FROM DIFFERENT SOURCES AND ANALYZE THEM
setwd("C:/Users/karol/Desktop/census-master")
dof = read.csv("ca_dof_popchg_e6.csv")
head(dof)
## county id_old id mpo pop10 births10 deaths10 natinc10
## 1 Alameda 6001 0500000US06001 ABAG 1515354 4873 2157 2716
## 2 Alpine 6003 0500000US06003 non-MPO 1175 3 3 0
## 3 Amador 6005 0500000US06005 non-MPO 38069 78 100 -22
## 4 Butte 6007 0500000US06007 non-MPO 220193 622 551 71
## 5 Calaveras 6009 0500000US06009 non-MPO 45535 75 118 -43
## 6 Colusa 6011 0500000US06011 non-MPO 21465 69 23 46
## immig10 dommig10 pop17 births17 deaths17 natinc17 immig17 dommig17 pop21
## 1 2081 286 1650818 19443 10112 9331 12421 -8110 1671741
## 2 0 0 1141 4 13 -9 1 -1 1181
## 3 2 -2 37050 306 437 -131 20 -20 40316
## 4 63 59 226470 2476 2406 70 191 1448 201158
## 5 2 -2 44609 371 509 -138 35 -35 45111
## 6 16 -16 22580 313 161 152 126 -126 22059
## births21 deaths21 natinc21 immig21 dommig21
## 1 17296 11592 5704 1558 -17221
## 2 13 33 -20 0 2
## 3 272 503 -231 6 35
## 4 1983 2476 -493 -35 -9530
## 5 360 586 -226 4 56
## 6 292 193 99 18 116
nrow(dof)
## [1] 58
setwd("C:/Users/karol/Desktop/census-master")
acs = read.csv(text=readLines("./ACSDT5Y2020.B25077_data_with_overlays_2022-05-24T172640.csv")[(-2)])
head(acs)
## B25077_001E B25077_001M GEO_ID NAME
## 1 825300 4462 0500000US06001 Alameda County, California
## 2 372500 23650 0500000US06003 Alpine County, California
## 3 329300 11372 0500000US06005 Amador County, California
## 4 304700 7078 0500000US06007 Butte County, California
## 5 340000 9138 0500000US06009 Calaveras County, California
## 6 274100 17719 0500000US06011 Colusa County, California
setwd("C:/Users/karol/Desktop/census-master")
bls = read.csv("blsemp_ca.csv")
head(bls)
## STATEFP COUNTYFP GEOID_OLD GEOID COUNTYNS NAME NAMELSAD
## 1 6 1 6001 0500000US06001 1675839 Alameda Alameda County
## 2 6 3 6003 0500000US06003 1675840 Alpine Alpine County
## 3 6 5 6005 0500000US06005 1675841 Amador Amador County
## 4 6 7 6007 0500000US06007 1675842 Butte Butte County
## 5 6 9 6009 0500000US06009 1675885 Calaveras Calaveras County
## 6 6 11 6011 0500000US06011 1675902 Colusa Colusa County
## X emp_jun15 emp_jun16 emp_jun17 emp_jun18 emp_jun19 emp_jun20 emp_jun21
## 1 NA 732579 755866 778911 793656 795938 706180 761279
## 2 NA 503 618 554 589 625 472 600
## 3 NA 11833 12053 12399 12473 12469 11562 12058
## 4 NA 78335 80680 82771 84185 81236 73239 76870
## 5 NA 8678 9325 9641 9903 9957 9725 10317
## 6 NA 9649 9681 9249 9812 9905 8482 9607
dof$hoval20 = acs$B25077_001E[match(dof$id, acs$GEO_ID)]
dof$emp21 = bls$emp_jun21[match(dof$id, bls$GEOID)] # same thing for bls data
dof$emp15 = bls$emp_jun15[match(dof$id, bls$GEOID)] # one time for each row
dof$popgr1721 = (dof$pop21 - dof$pop17)/dof$pop17
dof$empgr1521 = (dof$emp21 - dof$emp15)/dof$emp15
ANALYZE THE DATA USING PLOTS AND A REGRESSION MODEL
hist(dof$pop21, breaks=20)
# Taking the logarithm of non-zero data can get rid of skew issues #
Right skew is common in social science data - high values pull up the
means. Using a log transformation is particularly useful when comparing
data, e.g. understanding the correlation between two variables.
hist(log(dof$pop21))
boxplot(dof$popgr1721, dof$empgr1521) # side-by-side boxplot.
x = dof$hoval20
y = log(dof$pop21)
plot(x,y)
cor.test(x, y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 4.274, df = 56, p-value = 0.00007531
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2725710 0.6685972
## sample estimates:
## cor
## 0.4959499
model = lm(y ~ x) # stores the results of a simple linear regression in the object 'model'
plot(x, y, ylab="Log of Population, 2021", xlab="Median Home Value, 2020", main="California Counties") # see help(par) to view all plot options
abline(model, col="dodgerblue", lwd=3)
rp = vector('expression', 3)
rp[1] = substitute(expression(italic(B)[1] == MYVALUE3), list(MYVALUE3=format(summary(model)$coefficients[2], dig=4)))[2]
rp[2] = substitute(expression(italic(p) == MYVALUE2), list(MYVALUE2=format(summary(model)$coefficients[2,4], dig=3)))[2]
rp[3] = substitute(expression(italic(r) == MYVALUE), list(MYVALUE=round(sqrt(summary(model)$r.squared), 4)))[2]
legend("bottomright", legend=rp, bty='n') # can put this at any corner of the plot
ADDITIONAL FUNCTIONALITY WITH PACKAGES
library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(foreign)
library(doBy)
unique(unlist(dof$mpo))
## [1] "ABAG" "non-MPO" "SACOG" "SCAG" "SANDAG"
length(unique(unlist(dof$mpo)))
## [1] 5
setwd("C:/Users/karol/Desktop/census-master")
data = read.csv("oecd.csv")
table(data$Country)
##
## AT AU BE BG BR CA CH CL CO CR CZ DE DK EE EL ES FI FR HR HU IE IL IS IT JP KR
## 9 8 3 6 27 13 7 16 33 6 8 16 5 1 13 19 5 18 1 8 3 6 2 21 10 7
## LT LU LV MT NL NO NZ PE PL PT RO SE SI SK TR US
## 2 1 1 1 12 3 14 25 17 7 8 8 2 4 26 51
sort(table(data$Country), decreasing=TRUE) # the sort command, with the argument decreasing=TRUE will sort the table in decreasing order.
##
## US CO BR TR PE IT ES FR PL CL DE NZ CA EL NL JP AT AU CZ HU RO SE CH KR PT BG
## 51 33 27 26 25 21 19 18 17 16 16 14 13 13 12 10 9 8 8 8 8 8 7 7 7 6
## CR IL DK FI SK BE IE NO IS LT SI EE HR LU LV MT
## 6 6 5 5 4 3 3 3 2 2 2 1 1 1 1 1
barplot(sort(table(data$Country), decreasing=TRUE), cex.names=0.8,
main="Number of regions per country") # a barplot can be made (can also add colors, etc.). cex.names makes the text smaller so all the country names fit.
dofsum = summaryBy(pop21 + emp21 ~ mpo, data=dof, FUN=sum) # takes the sum total of emp21 and pop21 by MPO
dofsum
## mpo pop21.sum emp21.sum
## 1 ABAG 7696482 3781151
## 2 non-MPO 7061468 2596575
## 3 SACOG 2587772 1065151
## 4 SANDAG 3288455 1430039
## 5 SCAG 18734436 7707865
dofmean = summaryBy(hoval20 + immig21 ~ mpo, data=dof, FUN=mean) # mean across counties in each MPO. Ensure this makes sense!
dofmean
## mpo hoval20.mean immig21.mean
## 1 ABAG 850722.2 803.5556
## 2 non-MPO 330647.2 85.0000
## 3 SACOG 397516.7 428.0000
## 4 SANDAG 595600.0 1898.0000
## 5 SCAG 475300.0 2111.0000
setwd("C:/Users/karol/Desktop/census-master")
write.csv(dofsum, "dofsumtable.csv")
barplot(dofsum$pop21.sum, names.arg=dofsum$mpo)
barplot(dofsum$pop21.sum, names.arg=dofmean$mpo, axes=F, col="dodgerblue", main="2021 Population by MPO (millions)")
axis(2, at=seq(0,20000000,5000000), lab=seq(0,20,5))
abline(h=seq(0,20000000,5000000), lty=3, col="darkgrey")
box()
out = summaryBy(areasqkm + totpop20 + co2cap08 ~ Country, data=data, FUN=sum)
out2 = out[order(-out$co2cap08.sum),] # sort by CO2
out2
## Country areasqkm.sum totpop20.sum co2cap08.sum
## 42 US 9161929 331501023 1071.6
## 33 NZ 263357 5089400 544.0
## 5 BR 8511230 211755654 199.6
## 15 EL 130048 10718562 184.7
## 12 DE 353296 83166649 183.8
## 23 IS 100450 364134 160.2
## 2 AU 7703349 25692633 143.5
## 31 NL 34188 17407594 135.2
## 24 IT 297825 59641498 129.4
## 16 ES 502654 47332616 129.3
## 17 FI 304316 5525294 106.7
## 8 CL 742979 19458310 105.9
## 11 CZ 77212 10693940 101.8
## 41 TR 766509 83155037 97.3
## 25 JP 373530 126146000 92.4
## 26 KR 99461 51780527 82.2
## 1 AT 82519 8901072 77.7
## 38 SE 407300 10327588 55.5
## 22 IL 21643 8698700 52.1
## 13 DK 41987 5822765 44.9
## 7 CH 39860 8606033 42.5
## 40 SK 48702 5457872 34.9
## 36 PT 90996 10295914 33.1
## 3 BE 30452 11522440 27.7
## 32 NO 195571 1336968 23.4
## 39 SI 20145 2095859 22.9
## 28 LU 2586 626108 22.1
## 14 EE 43110 1328980 16.4
## 4 BG 110001 6951487 NA
## 6 CA 8965591 38037197 NA
## 9 CO 1141766 50372444 NA
## 10 CR 51709 5111221 NA
## 18 FR 633886 67320260 NA
## 19 HR 24515 1373800 NA
## 20 HU 91248 9769532 NA
## 21 IE 68655 4964442 NA
## 27 LT 62643 2794091 NA
## 29 LV 63290 1907680 NA
## 30 MT 313 514564 NA
## 34 PE 1285216 32625989 NA
## 35 PL 307236 37958173 NA
## 37 RO 234270 19328850 NA
cor.test(out2$co2cap08.sum, out2$totpop20.sum) # are larger countries higher emitters?
##
## Pearson's product-moment correlation
##
## data: out2$co2cap08.sum and out2$totpop20.sum
## t = 5.5136, df = 26, p-value = 0.000008713
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4973013 0.8691760
## sample estimates:
## cor
## 0.734167
m = lm(dommig21 ~ pop21 + hoval20 + empgr1521, data=dof)
summary(m)
##
## Call:
## lm(formula = dommig21 ~ pop21 + hoval20 + empgr1521, data = dof)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10465.3 -2285.2 -208.8 1408.9 20778.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3146.8375822 1345.7188993 2.338 0.023099 *
## pop21 -0.0076389 0.0004264 -17.915 < 0.0000000000000002 ***
## hoval20 -0.0088608 0.0025451 -3.482 0.000995 ***
## empgr1521 24097.9287781 8873.0081937 2.716 0.008860 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4494 on 54 degrees of freedom
## Multiple R-squared: 0.8846, Adjusted R-squared: 0.8782
## F-statistic: 138 on 3 and 54 DF, p-value: < 0.00000000000000022
(5) MANIPULATING SHAPEFILES
setwd("C:/Users/karol/Desktop/census-master")
shape = read.dbf("cb_2017_ca_county_500k.dbf")
head(shape)
## STATEFP COUNTYFP COUNTYNS AFFGEOID GEOID NAME LSAD ALAND
## 1 06 001 01675839 0500000US06001 06001 Alameda 06 1909616630
## 2 06 005 01675841 0500000US06005 06005 Amador 06 1539933576
## 3 06 013 01675903 0500000US06013 06013 Contra Costa 06 1857310903
## 4 06 023 01681908 0500000US06023 06023 Humboldt 06 9241251740
## 5 06 037 00277283 0500000US06037 06037 Los Angeles 06 10510588451
## 6 06 047 00277288 0500000US06047 06047 Merced 06 5012175320
## AWATER
## 1 216916717
## 2 29470568
## 3 225193562
## 4 1254039383
## 5 1794793532
## 6 112509475
shape$GEOID2 = as.numeric(as.character(shape$GEOID))
shape$pop21 = dof$pop21[match(shape$GEOID2, dof$id_old)]
shape$hoval20 = dof$hoval20[match(shape$GEOID2, dof$id_old)]
shape$emp21 = dof$emp21[match(shape$GEOID2, dof$id_old)]
summary(shape)
## STATEFP COUNTYFP COUNTYNS AFFGEOID GEOID
## 06:58 001 : 1 00277273: 1 0500000US06001: 1 06001 : 1
## 003 : 1 00277274: 1 0500000US06003: 1 06003 : 1
## 005 : 1 00277275: 1 0500000US06005: 1 06005 : 1
## 007 : 1 00277277: 1 0500000US06007: 1 06007 : 1
## 009 : 1 00277280: 1 0500000US06009: 1 06009 : 1
## 011 : 1 00277281: 1 0500000US06011: 1 06011 : 1
## (Other):52 (Other) :52 (Other) :52 (Other):52
## NAME LSAD ALAND AWATER
## Alameda : 1 06:58 Min. : 121485107 Min. : 4721397
## Alpine : 1 1st Qu.: 2485200412 1st Qu.: 41652226
## Amador : 1 Median : 3978042624 Median : 147555173
## Butte : 1 Mean : 6956606590 Mean : 353183413
## Calaveras: 1 3rd Qu.: 8948208104 3rd Qu.: 475186943
## Colusa : 1 Max. :51948123813 Max. :2729814515
## (Other) :52
## GEOID2 pop21 hoval20 emp21
## Min. :6001 Min. : 1181 Min. : 153600 Min. : 600
## 1st Qu.:6030 1st Qu.: 47536 1st Qu.: 262875 1st Qu.: 14547
## Median :6058 Median : 187032 Median : 344250 Median : 70836
## Mean :6058 Mean : 678769 Mean : 437798 Mean : 285876
## 3rd Qu.:6086 3rd Qu.: 705776 3rd Qu.: 593825 3rd Qu.: 252805
## Max. :6115 Max. :9944953 Max. :1163100 Max. :4219893
##
write.dbf(shape, "cb_2017_ca_county_500k.dbf")
(6) MAPPING MADE SIMPLE(ISH)
library(sf)
counties = read_sf(dsn = ".", layer = "cb_2017_ca_county_500k")
head(counties)
## Simple feature collection with 6 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -124.4096 ymin: 32.80146 xmax: -117.6464 ymax: 41.46584
## Geodetic CRS: NAD83
## # A tibble: 6 × 14
## STATEFP COUNT…¹ COUNT…² AFFGE…³ GEOID NAME LSAD ALAND AWATER GEOID2 pop21
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 06 001 016758… 050000… 06001 Alam… 06 1.91e 9 2.17e8 6001 1.67e6
## 2 06 005 016758… 050000… 06005 Amad… 06 1.54e 9 2.95e7 6005 4.03e4
## 3 06 013 016759… 050000… 06013 Cont… 06 1.86e 9 2.25e8 6013 1.16e6
## 4 06 023 016819… 050000… 06023 Humb… 06 9.24e 9 1.25e9 6023 1.35e5
## 5 06 037 002772… 050000… 06037 Los … 06 1.05e10 1.79e9 6037 9.94e6
## 6 06 047 002772… 050000… 06047 Merc… 06 5.01e 9 1.13e8 6047 2.83e5
## # … with 3 more variables: hoval20 <dbl>, emp21 <dbl>,
## # geometry <MULTIPOLYGON [°]>, and abbreviated variable names ¹COUNTYFP,
## # ²COUNTYNS, ³AFFGEOID
cnty = data.frame(counties) # if you just want to extract the attribute/data table
plot(counties["ALAND"])
plot(counties["pop21"])
plot(counties["pop21"], breaks="jenks", key.pos=2, pal=sf.colors(10),
main="2021 Population in California counties", axes=TRUE)
library(tidycensus)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(sf)
library(doBy)
census_api_key("5de05eeaa3869c3180eed1726a07e0f27e6c9e20", install=TRUE, overwrite=TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY").
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "5de05eeaa3869c3180eed1726a07e0f27e6c9e20"
medrent00 = get_decennial(geography = "state", variables = "H060001", sumfile="sf3", year = 2000)
## Getting data from the 2000 decennial Census
## Using Census Summary File 3
barplot(medrent00$value, names.arg=medrent00$NAME, cex.names=0.7, las=2)
medrent00b = medrent00[order(-medrent00$value),] # sort descending by making a new data frame
barplot(medrent00b$value, names.arg=medrent00b$NAME, cex.names=0.7, las=2,
main="State median rent, 2000", col="dodgerblue") #cex.names makes labels smaller, las=2 rotates them.
abline(h=seq(0,700,100), col="darkgrey", lty=3)
box()
acsvars = load_variables(2020, "acs5", cache = TRUE)
head(acsvars)
## # A tibble: 6 × 4
## name label concept geogr…¹
## <chr> <chr> <chr> <chr>
## 1 B01001A_001 Estimate!!Total: SEX BY AGE (WHITE… tract
## 2 B01001A_002 Estimate!!Total:!!Male: SEX BY AGE (WHITE… tract
## 3 B01001A_003 Estimate!!Total:!!Male:!!Under 5 years SEX BY AGE (WHITE… tract
## 4 B01001A_004 Estimate!!Total:!!Male:!!5 to 9 years SEX BY AGE (WHITE… tract
## 5 B01001A_005 Estimate!!Total:!!Male:!!10 to 14 years SEX BY AGE (WHITE… tract
## 6 B01001A_006 Estimate!!Total:!!Male:!!15 to 17 years SEX BY AGE (WHITE… tract
## # … with abbreviated variable name ¹geography
nrow(acsvars) # tells us how many variables
## [1] 27850
length(unique(unlist(acsvars$concept))) # tells us how many unique "concepts" there are
## [1] 1138
write.csv(acsvars, "acsvars.csv") # export to .csv so you explore easily in Excel
tr = get_acs(geography="tract", state="CA", county="Orange", variables="B25031_001",
year=2020, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
tr$medrent = tr$estimate # create a renamed to avoid future confusion
tr$estimate = NULL # get rid of the old one
varlist = c("B01001_001", "B19013_001", "B25035_001", "B08101_009")
tr_plus = get_acs(geography="tract", state="CA", county="Orange", variables=varlist, year=2020)
## Getting data from the 2016-2020 5-year ACS
tr_a = subset(tr_plus, variable=="B01001_001")
tr$totpop = tr_a$estimate[match(tr$GEOID, tr_a$GEOID)]
tr_b = subset(tr_plus, variable=="B19013_001")
tr$medhhinc = tr_b$estimate[match(tr$GEOID, tr_b$GEOID)]
tr_c = subset(tr_plus, variable=="B25035_001")
tr$medhomeage = tr_c$estimate[match(tr$GEOID, tr_c$GEOID)]
plot(tr['medhhinc'])
mystate = "CA"
mycounty = "Imperial"
myyear = 2020
mysurvey = "acs5"
cnty = get_acs(geography="tract", state=mystate, county=mycounty, variables="B01001_001", year=myyear, survey=mysurvey, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
colnames(cnty)[4] = "totpop"
varlist = c('B19001_001', 'B01002_001', 'B03002_001', 'B03002_003', 'B03002_004', 'B03002_012', 'B05001_001', 'B05001_006',
'B07001_001', 'B07001_017', 'B07001_033', 'B07001_049', 'B07001_065', 'B08014_001', 'B08014_002', 'B08014_003',
'B08014_004', 'B08014_005', 'B08014_006', 'B08014_007', 'B08101_001', 'B08101_009', 'B08101_033', 'B08101_017',
'B08101_025', 'B08101_041', 'B08101_049', 'B15003_001', 'B15003_002', 'B15003_003', 'B15003_004', 'B15003_005',
'B15003_006', 'B15003_007', 'B15003_008', 'B15003_009', 'B15003_010', 'B15003_011', 'B15003_012', 'B15003_013',
'B15003_014', 'B15003_015', 'B15003_016', 'B15003_017', 'B15003_018', 'B15003_019', 'B15003_020', 'B15003_021',
'B15003_022', 'B15003_023', 'B15003_024', 'B15003_025', 'B19001_002', 'B19001_003', 'B19001_004', 'B19001_005',
'B19001_006', 'B19001_007', 'B19001_008', 'B19001_009', 'B19001_010', 'B19001_011', 'B19001_012', 'B19001_013',
'B19001_014', 'B19001_015', 'B19001_016', 'B19001_017', 'B19013_001', 'B23025_001', 'B23025_002', 'B25031_001',
'B25077_001', 'B25034_001', 'B25034_002', 'B25034_003', 'B25034_004', 'B25034_005', 'B25034_006', 'B25034_007',
'B25034_008', 'B25034_009', 'B25034_010', 'B25034_011', 'B25035_001', 'B25024_002', 'B25024_003', 'B25024_004',
'B25024_005', 'B25024_006', 'B25024_007', 'B25024_008', 'B25024_009', 'B25024_010', 'B25024_011', 'B25003_001',
'B25003_002', 'B25003_003', 'B01001_003', 'B01001_004', 'B01001_005', 'B01001_006', 'B01001_020', 'B01001_021',
'B01001_022', 'B01001_023', 'B01001_024', 'B01001_025', 'B01001_027', 'B01001_028', 'B01001_029', 'B01001_030',
'B01001_044', 'B01001_045', 'B01001_046', 'B01001_047', 'B01001_048', 'B01001_049')
cnty2 = get_acs(geography="tract", state=mystate, county=mycounty, variables=varlist, year=myyear, survey=mysurvey, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
varnames = c('totHH', 'medage', 'tot4race', 'wnh', 'bnh', 'hisp', 'tot4_cit', 'noncit', 'tot4_mv', 'no_mv',
'in_cnty_mv', 'in_st_mv', 'out_st_mv', 'tot4_vehic', 'vehic0', 'vehic1', 'vehic2', 'vehic3', 'vehic4', 'vehic5pl',
'tot4_comm', 'comm_sov', 'comm_walk', 'comm_pool', 'comm_trans', 'comm_oth', 'comm_wah', 'tot4_educ', 'educ_none',
'educ_nurs', 'educ_kind', 'educ_1st', 'educ_2nd', 'educ_3rd', 'educ_4th', 'educ_5th', 'educ_6th', 'educ_7th', 'educ_8th',
'educ_9th', 'educ_10th', 'educ_11th', 'educ_12th', 'educ_hs', 'educ_ged', 'educ_cnud1', 'educ_some', 'educ_asso',
'educ_bach', 'educ_mast', 'educ_prof', 'educ_doct', 'hincund10', 'hinc1014', 'hinc1519', 'hinc2024',
'hinc2529', 'hinc3034', 'hinc3539', 'hinc4044', 'hinc4549', 'hinc5059', 'hinc6074', 'hinc7599', 'hinc100124', 'hinc125149',
'hinc150199', 'hinc200pl', 'medhhinc', 'tot4_work', 'emp', 'medrent', 'medhoval', 'totHU', 'HU2014', 'HU1013', 'HU9',
'HU9099', 'HU8089', 'HU7079', 'HU6069', 'HU5059', 'HU4049', 'HU1939', 'medyrblt', 'HU_sfd', 'HU_sfa', 'HU_mf2', 'HU_mf3_4',
'HU_mf5_9', 'HU_mf1019', 'HU_mf2049', 'HU_mf50pl', 'HU_mob', 'HU_boatRV', 'tot4_tenu', 'ownerHH', 'renterHH',
'male0005', 'male0509', 'male1014', 'male1517', 'male6566', 'male6769', 'male7074', 'male7579', 'male8084',
'male8500', 'female0005','female0509', 'female1014', 'female1517', 'female6566', 'female6769', 'female7074',
'female7579', 'female8084', 'female8500')
for(i in 1:length(unique(unlist(cnty2$variable)))){
join = subset(cnty2, variable==varlist[i])
cnty[,ncol(cnty)+1] = join$estimate[match(cnty$GEOID, join$GEOID)]
colnames(cnty)[ncol(cnty)] = varnames[i]
}
cnty$hhinc_sub20 = cnty$hincund10 + cnty$hinc1014 + cnty$hinc1519
cnty$hhinc_over100 = cnty$hinc100124 + cnty$hinc125149 + cnty$hinc150199 + cnty$hinc200pl
cnty$BAplus = cnty$educ_doct + cnty$educ_prof + cnty$educ_mast + cnty$educ_bach
cnty$noHS = cnty$tot4_educ - cnty$BAplus - cnty$educ_asso - cnty$educ_some - cnty$educ_cnud1 - cnty$educ_ged - cnty$educ_hs
cnty$age0017 = cnty$male0005 + cnty$male0509 + cnty$male1014 + cnty$male1517 + cnty$female0005 + cnty$female0509 + cnty$female1014 + cnty$female1517
cnty$age65plus = cnty$male6566 + cnty$male6769 + cnty$male7074 + cnty$male7579 + cnty$male8084 + cnty$male8500 + cnty$female6566 + cnty$female6769 + cnty$female7074 + cnty$female7579 + cnty$female8084 + cnty$female8500
st_write(cnty, "IM_tracts_ACS20.shp")
## Warning in abbreviate_shapefile_names(obj): Field names abbreviated for ESRI
## Shapefile driver
## Writing layer `IM_tracts_ACS20' to data source
## `IM_tracts_ACS20.shp' using driver `ESRI Shapefile'
## Writing 40 features with 129 fields and geometry type Multi Polygon.
write.csv(cnty, "IM_tracts_ACS20.csv")
plot(cnty['medhhinc'])