The easy way. The tradeoff is that it’s harder to save your output later, because no working directory has been set.

data = read.csv(file.choose())   # file.choose() opens a dialog box so you can pick the file

The harder way: retrieve and set your file path to wherever your files are. This is known as your ‘working directory.’

getwd()
## [1] "C:/Users/karol/Desktop/census-master"
setwd("C:/Users/karol/Desktop/census-master")
data = read.csv("oecd.csv")

To look at your data, just type its name.

When you read in data, it becomes a DATA FRAME in R.
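You can verify this with class():

class(data)
## [1] "data.frame"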

data

But usually all you want are the column names and the number of rows, or a summary of the DATA FRAME.

colnames(data)  
##  [1] "REG_ID"         "Region"         "Country"        "totpop05"      
##  [5] "totpop20"       "pop65plus05"    "pop65plus20"    "workagepop05"  
##  [9] "workagepop20"   "areasqkm"       "popsqkm05"      "popsqkm20"     
## [13] "pctba20"        "pm2pt5_05"      "pm2pt5_19"      "co2tot08"      
## [17] "co2cap08"       "broadband19"    "inc05"          "inc19"         
## [21] "unemp05"        "unemp20"        "covidvax2021q2"
nrow(data)
## [1] 453
head(data)
##   REG_ID            Region Country totpop05 totpop20 pop65plus05 pop65plus20
## 1   US39              Ohio      US 11463300 11790600     1527300     2097640
## 2   US42      Pennsylvania      US 12450000 12989600     1887080     2447690
## 3   IL05 Tel Aviv District      IL  1183300  1452400      175700          NA
## 4   ITF6          Calabria      IT  1989500  1894110      358068      419874
## 5   KR04     Jeolla Region      KR  5113250  5044160      666751      980374
## 6   KR07              Jeju      KR   541833   670207       54792      101153
##   workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20 pm2pt5_05
## 1      7623540      7472740   106056    108.09    111.17      NA     12.64
## 2      8242130      8175650   116075    107.26    111.91      NA     13.20
## 3       759100           NA      172   6879.65   8444.19      NA     25.32
## 4      1321450      1226510    15179    131.07    124.78 0.16000     14.27
## 5      3440470      3446440    20539    248.95    245.59 0.51786     23.38
## 6       367675       472575     1846    293.52    363.06 0.57684     19.66
##   pm2pt5_19  co2tot08 co2cap08 broadband19 inc05 inc19 unemp05 unemp20
## 1      8.70 199757.00     17.3       0.854 34335 42164   0.060   0.085
## 2      8.56 252782.00     20.0       0.856 38409 48361   0.049   0.089
## 3     20.98   3751.66      3.0          NA 13021    NA   0.073      NA
## 4     11.22   8842.47      4.5       0.770 16228 15236   0.145   0.207
## 5     27.58  60238.20     11.9          NA 13642 19438   0.032   0.029
## 6     22.67   1351.04      2.5          NA 13533 18724   0.027   0.025
##   covidvax2021q2
## 1         0.6420
## 2         0.6810
## 3             NA
## 4         0.6359
## 5         0.6635
## 6         0.6326
summary(data)
##     REG_ID             Region            Country             totpop05       
##  Length:453         Length:453         Length:453         Min.   :   25461  
##  Class :character   Class :character   Class :character   1st Qu.:  655329  
##  Mode  :character   Mode  :character   Mode  :character   Median : 1394730  
##                                                           Mean   : 2907246  
##                                                           3rd Qu.: 3259100  
##                                                           Max.   :40442800  
##                                                                             
##     totpop20         pop65plus05       pop65plus20       workagepop05     
##  Min.   :   29884   Min.   :    791   Min.   :   1611   Min.   :   11943  
##  1st Qu.:  715115   1st Qu.:  66751   1st Qu.: 116458   1st Qu.:  395865  
##  Median : 1518000   Median : 172288   Median : 262396   Median :  893579  
##  Mean   : 3186488   Mean   : 388793   Mean   : 556600   Mean   : 1963877  
##  3rd Qu.: 3534170   3rd Qu.: 462390   3rd Qu.: 599662   3rd Qu.: 2238690  
##  Max.   :46289300   Max.   :5993140   Max.   :9274000   Max.   :27632900  
##                     NA's   :27        NA's   :58        NA's   :27        
##   workagepop20         areasqkm         popsqkm05         popsqkm20      
##  Min.   :   18156   Min.   :     14   Min.   :   0.02   Min.   :   0.02  
##  1st Qu.:  438709   1st Qu.:  10140   1st Qu.:  24.91   1st Qu.:  26.82  
##  Median :  996186   Median :  23047   Median :  71.89   Median :  76.44  
##  Mean   : 1950689   Mean   :  95999   Mean   : 262.73   Mean   : 299.87  
##  3rd Qu.: 2236910   3rd Qu.:  58303   3rd Qu.: 167.03   3rd Qu.: 186.52  
##  Max.   :26107600   Max.   :2532420   Max.   :6879.65   Max.   :8444.19  
##  NA's   :58                                                              
##     pctba20         pm2pt5_05       pm2pt5_19        co2tot08       
##  Min.   :0.0000   Min.   : 4.15   Min.   : 3.70   Min.   :     3.9  
##  1st Qu.:0.2175   1st Qu.:10.53   1st Qu.: 7.78   1st Qu.:  4905.5  
##  Median :0.2893   Median :14.51   Median :11.96   Median : 13076.3  
##  Mean   :0.3040   Mean   :15.94   Mean   :13.72   Mean   : 35476.3  
##  3rd Qu.:0.3822   3rd Qu.:21.18   3rd Qu.:18.05   3rd Qu.: 40117.6  
##  Max.   :0.6280   Max.   :38.65   Max.   :41.93   Max.   :511789.0  
##  NA's   :162      NA's   :60      NA's   :60      NA's   :96        
##     co2cap08       broadband19         inc05           inc19      
##  Min.   :  0.10   Min.   :0.5517   Min.   : 4715   Min.   : 7445  
##  1st Qu.:  4.80   1st Qu.:0.8280   1st Qu.:15008   1st Qu.:17100  
##  Median :  7.40   Median :0.8680   Median :18768   Median :21840  
##  Mean   : 12.62   Mean   :0.8611   Mean   :20662   Mean   :25107  
##  3rd Qu.: 13.20   3rd Qu.:0.9100   3rd Qu.:25437   3rd Qu.:27170  
##  Max.   :340.70   Max.   :1.0000   Max.   :54104   Max.   :67034  
##  NA's   :96       NA's   :198      NA's   :145     NA's   :186    
##     unemp05           unemp20        covidvax2021q2  
##  Min.   :0.01800   Min.   :0.01900   Min.   :0.0510  
##  1st Qu.:0.04900   1st Qu.:0.04400   1st Qu.:0.5581  
##  Median :0.06800   Median :0.07000   Median :0.6464  
##  Mean   :0.08266   Mean   :0.08071   Mean   :0.6090  
##  3rd Qu.:0.10375   3rd Qu.:0.09900   3rd Qu.:0.7209  
##  Max.   :0.29900   Max.   :0.33600   Max.   :0.8466  
##  NA's   :139       NA's   :121       NA's   :237
View(data)  

You can isolate a column of a DATA FRAME so that it’s just a VECTOR.

This is easiest using a dollar sign. Then, you can look at some descriptive statistics of the VECTOR.

pop = data$totpop20
pop
##   [1] 11790600 12989600  1452400  1894110  5044160   670207  1539280  3055150
##   [9]  2180980    44712   728157   543000   644100  1246400   757634  1081650
##  [17]   783204  1118580  1442660   484547 12785200  3188670  3114070  1096230
##  [25]   887099  6920120  3281680  1330600   870165  1512670  1399040  1214570
##  [33] 14245000   626108  1886580  5639080   631181 11423000  1018450  1340000
##  [41]   393893   173811  1238000   251521   229516   715115   254500  1641060
##  [49]   294436  1490280   521364  7993610  3479530   690093  1961460 29217700
##  [57]   642495   532644  1206220  2050980  2316140  3575340  2863270  1385140
##  [65]  9597000  1515110  1654750  3961950  9616620   965718  1427030   460083
##  [73]  3738900   636504  1872100   561293 13124700  1608190  2690180 17947200
##  [81]  1060760   994549  2864810  1847770  7022220 10067700  9279740   778962
##  [89]  7718790   577267  1293940  4879130  5755700  5225000   438406   315931
##  [97]  8125070   405835  1324280  1518000  1701800  1846020  1378650 36914000
## [105] 20542000  7652350   351491  4367250  1590250  4064050 21292700   410521
## [113]  2280910   435195   421912  1361470  1027560  3184220   995849  2322000
## [121]   194300  1716900  5536000   514564  2442690  2084450  1684290  6639010
## [129]  1755740  2194780  2903770   233034  1162410  2085950  2879530   732441
## [137]   991886  1789800  1053400  2401310   382773   991063  1014340  1328980
## [145]  5136000  5685060  1061980   521829  3227410   861773  3534170  2318820
## [153]   426806  1497440   668213  1357080   589110   264670  1100010  1910410
## [161]   244600   102800   288086   540780   397139  2467570  1118370  3829970
## [169]  1907680  6288080  1231090   585866   387285  5707170  2956870 20154900
## [177]  5158730   125034  5712140   553254   545425  6747070    39155  1278240
## [185]  2377080  1727400   818962 15519300  1831150  4207710 46289300  1063450
## [193]  1627590   555401    79020   271904   347512  2547430    32800  2565730
## [201]  3313430  1149390  8167530   431380  2307330   649957  1319230  4503960
## [209] 10457200  3962030  4311220  3953310  1611620  8612000   242796   582388
## [217] 14924000  5029340   857762  1171160   894470  3281480  7113540  7743960
## [225]  1295390   544764  1122620   961055    50636   302831   430736   370974
## [233]   203149   651065  1024120    51100   125200   856858  3358520  5951850
## [241]  5077580   820511   511551  5176190  1770380   246143  1223360 11100400
## [249]  2174670   493682  2562960  1117200   465136  3371960  1451910  4651200
## [257]  6172680  4241540  5130730  1380650  2233000  1524830  4875290  3692560
## [265]  2217290  1330330  2045550   836096   891440   252110  1200540  1148790
## [273]    29884  8478080  1537430  1183810  3351540 14930600  7114600  7252500
## [281]   556002  2377100   924870  1406630   499300  6018420  1373800  2319040
## [289]   558410 14745700   681202   131100  1001410 10725800    45372  1159900
## [297]   669592  1336790  2189140   314709  1960170  1326340  7833380   870200
## [305]   376157  2485650  4039280  3526220  2722130  1828950  3243000  1620320
## [313]  4532150    63692   112958  1180640 10628500   572151  2901380  3818420
## [321]   345867   877832  2663560  1911190  1042760  8578300  1847250  1608140
## [329]  4093900  4071970   423021  4475460  3600260 21569900  8632040  1179300
## [337]   300516  1627700  2702590  1018900  1663700  1233980   837359  3697000
## [345]    84473  2236990  1796460 17366200 11516800  2809390  1491940   192740
## [353]  2047950   598613   162700  2794520  1218740  6696670  3660070   981889
## [361]  3321760  2838320  5024800  3012230  5784310  6785640  6154480  1086190
## [369]  1377850  5892320    42174   704558  1823790   691854  1131940  1115630
## [377]  7254000    84085  2059730  4078370  8690750  9187100   949252   294206
## [385]    86657  2016770  1310790   333265  2314830   773450   338400   179700
## [393]   359821  5987800  1750220  2829950  3669490  2521890   986887   383488
## [401]  1671610 39499700  1362280  2117570  4420030  1469400  4464120   656509
## [409]   107297  1223110   589936  1689730 25957900  5074710  1504870  1770780
## [417]  3131320  3119860  4163020  6677930  1242730   359127   369992  1129850
## [425]   365317   975182  1771480  2094260   412682 12291600  8064150  1297100
## [433]  2445550  1973580   161329   793437  2133380   874573  3744300  7177990
## [441]  2935880 10027600   942233   254254   178362  1210730  3162510  1453710
## [449]   760267   899648   278926  2088290  3082400
length(pop) 
## [1] 453
mean(pop)   
## [1] 3186488
median(pop)
## [1] 1518000
min(pop)
## [1] 29884
max(pop)
## [1] 46289300
summary(pop)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    29884   715115  1518000  3186488  3534170 46289300

What if we are interested in looking only at regions with unemployment greater than or equal to 10%?

You can create a new DATA FRAME using SUBSET

Note that == means “equals exactly” and != means “not equals,” and that non-numeric values such as place names need to be placed in quotes when subsetting.
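For example, to keep only the US regions (a minimal sketch; the text value must be quoted):

us_regions = subset(data, Country == "US")
nrow(us_regions)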

hi_unemp = subset(data, unemp20 >= 0.1)
nrow(hi_unemp)
## [1] 80
nrow(hi_unemp)/nrow(data)
## [1] 0.1766004

You can make a very basic histogram, then a fancier one by passing more arguments to the command “hist”.
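The basic version needs only the vector:

hist(data$unemp20)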

hist(data$unemp20, breaks = 20, xlim = c(0,max(data$unemp20, na.rm=T)), xlab="2020 Unemployment Rate", 
     ylab = "Number of Regions", col = "blue", border = "red", main = "Unemployment in World Regions")

QUIZ QUESTIONS - CHALLENGE YOURSELF

(1) What is the unemployment rate in the Central Bohemia Region (code CZ02) of the Czech Republic (code CZ)?

(2) Make a histogram of 2019 income levels in Italian regions. Italy’s country code is “IT” and you can use the subset command with ‘exactly equals’. In the plot window, click “export.”

(2) DESCRIBING DATA IN R

Some useful tips

ls()            # list current objects (data frames, vectors, model outputs, etc.)
## [1] "data"     "hi_unemp" "pop"
rm(hi_unemp)        # remove an object - the data frame hi_unemp, in this example
rm(list=ls())   # remove all objects from memory
options(scipen=999)   # avoids printing things in scientific notation
options(stringsAsFactors=FALSE)   # avoids reading in factors when reading in files

Return to the OECD data

setwd("C:/Users/karol/Desktop/census-master")
data = read.csv("oecd.csv")

Let’s make some percentage (and other) variables using mathematical operations.

data$pctsenior20 = data$pop65plus20/data$totpop20 
summary(data$pctsenior20)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.02662 0.14497 0.18364 0.17604 0.21266 0.33487      58
View(data)   # clicking on column headers sorts the data temporarily 

Compare senior share in 2005 versus 2020

data$pctsenior05 = data$pop65plus05/data$totpop05 
data$pctsrincr = data$pctsenior20 - data$pctsenior05
hist(data$pctsrincr)

Sort in code. This actually makes a new data frame, sorted by the desired column.

data2 = data[order(data$pctsrincr),]   
head(data2)
##     REG_ID                  Region Country totpop05 totpop20 pop65plus05
## 247    BE1 Brussels Capital Region      BE  1006750  1223360      155511
## 45    EL41            North Aegean      EL   199177   229516       45128
## 10    CO97               Vaup\xe9s      CO    25461    44712        1022
## 327    DE6                 Hamburg      DE  1734830  1847250      310915
## 267   ES42       Castile-La Mancha      ES  1874020  2045550      350650
## 345   ES64                 Melilla      ES    64910    84473        6951
##     pop65plus20 workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20
## 247      159659       666848       825104      162   6214.50   7551.63   0.493
## 45        47245       123781       143588     3801     52.40     60.38   0.293
## 10         1795        11943        23166    54135      0.47      0.83      NA
## 327      336359      1196580      1246260      710   2443.42   2601.76   0.360
## 267      390189      1233200      1349730    79109     23.69     25.86   0.308
## 345        9364        43182        55234       14   4636.43   6033.79   0.343
##     pm2pt5_05 pm2pt5_19  co2tot08 co2cap08 broadband19 inc05 inc19 unemp05
## 247     18.74     14.93  5055.730      4.8        0.87 21121 21773   0.164
## 45      10.11      7.73   407.930      2.0          NA 16327 13363   0.104
## 10      23.39     22.83        NA       NA          NA    NA    NA      NA
## 327     15.31     12.17 14313.600      8.1        0.96 27826 28429   0.106
## 267     12.32      9.76 23508.700     11.5        0.87 17169 17010   0.092
## 345     28.56     28.84     3.888      0.1        0.91 19623 15981   0.120
##     unemp20 covidvax2021q2 pctsenior20 pctsenior05       pctsrincr
## 247   0.125         0.4987  0.13050860  0.15446834 -0.023959739446
## 45    0.169             NA  0.20584622  0.22657235 -0.020726129686
## 10       NA             NA  0.04014582  0.04013982  0.000006000463
## 327   0.049         0.6990  0.18208634  0.17921929  0.002867055068
## 267   0.178             NA  0.19075016  0.18711113  0.003639034908
## 345   0.244         0.5857  0.11085199  0.10708674  0.003765252729
data2 = data[order(-data$pctsrincr),]  # this repeats the above, but the negative sign sorts in descending order
data2[1:10,]   # data frames can be sliced using brackets. The format is row, column. This isolates rows 1 through 10, and leaves the columns untouched
##     REG_ID                    Region Country totpop05 totpop20 pop65plus05
## 94     JPA                  Hokkaido      JP  5628000  5225000     1205690
## 65     JPC    Northern-Kanto, Koshin      JP 10097000  9597000     2099640
## 105    JPG             Kansai region      JP 20893000 20542000     4055550
## 344    JPI                   Shikoku      JP  4086000  3697000      991186
## 393   FRY2                Martinique      FR   395982   359821       52303
## 51    CA10 Newfoundland and Labrador      CA   514332   521364       67928
## 214    JPB                    Tohoku      JP  9635000  8612000     2230000
## 145    JPE                  Hokuriku      JP  5539000  5136000     1270300
## 217    JPF                    Toukai      JP 15021000 14924000     2870530
## 429   FRY1                Guadeloupe      FR   441511   412682       52755
##     pop65plus20 workagepop05 workagepop20 areasqkm popsqkm05 popsqkm20 pctba20
## 94      1679000      3696060      2989000    83456     67.44     62.61      NA
## 65      2904000      6543740      5566000    35355    285.59    271.45      NA
## 105     5893000     13834200     12197000    26232    796.47    783.09      NA
## 344     1238000      2544500      2037000    18648    219.11    198.25      NA
## 393       80327       259203       223328     1102    359.33    326.52      NA
## 51       116181       366005       335313   370514      1.39      1.41      NA
## 214     2772000      6060250      4885000    66504    144.88    129.50      NA
## 145     1629000      3492910      2907000    32723    169.27    156.95      NA
## 217     4133000      9927250      8916000    22338    672.44    668.10      NA
## 429       84319       284856       255777     1680    262.80    245.64      NA
##     pm2pt5_05 pm2pt5_19  co2tot08 co2cap08 broadband19 inc05 inc19 unemp05
## 94       8.79     10.42  50114.30      9.1    0.551745 18195    NA   0.053
## 65      11.59     12.67  99805.00     10.0    0.690894 20292    NA   0.038
## 105     12.62     13.99 171129.00      8.2    0.731207 21315    NA   0.054
## 344     11.60     13.54  46079.20     11.5    0.590374 18390    NA   0.048
## 393      9.49     10.59        NA       NA    0.760000 16669 21302   0.185
## 51       5.97      4.47   8015.61     15.7          NA 15317    NA   0.152
## 214      9.64     10.79 103584.00     11.0    0.618307 17846    NA   0.053
## 145     11.05     12.11  52947.70      9.7    0.703075 21816    NA   0.036
## 217     11.86     13.10 140793.00      9.3    0.709757 22311    NA   0.033
## 429      8.37      9.32        NA       NA    0.720000 16585 18979   0.255
##     unemp20 covidvax2021q2 pctsenior20 pctsenior05  pctsrincr
## 94    0.030             NA   0.3213397   0.2142306 0.10710908
## 65    0.027             NA   0.3025946   0.2079469 0.09464765
## 105   0.032             NA   0.2868757   0.1941105 0.09276520
## 344   0.031             NA   0.3348661   0.2425810 0.09228510
## 393   0.128         0.3032   0.2232416   0.1320843 0.09115727
## 51    0.139         0.7948   0.2228405   0.1320703 0.09077014
## 214   0.031             NA   0.3218765   0.2314478 0.09042861
## 145   0.025             NA   0.3171729   0.2293374 0.08783547
## 217   0.025             NA   0.2769365   0.1911011 0.08583535
## 429   0.178         0.2942   0.2043195   0.1194874 0.08483215
data2[1:10,1:5]   # this isolates rows 1-10 and columns 1-5 
##     REG_ID                    Region Country totpop05 totpop20
## 94     JPA                  Hokkaido      JP  5628000  5225000
## 65     JPC    Northern-Kanto, Koshin      JP 10097000  9597000
## 105    JPG             Kansai region      JP 20893000 20542000
## 344    JPI                   Shikoku      JP  4086000  3697000
## 393   FRY2                Martinique      FR   395982   359821
## 51    CA10 Newfoundland and Labrador      CA   514332   521364
## 214    JPB                    Tohoku      JP  9635000  8612000
## 145    JPE                  Hokuriku      JP  5539000  5136000
## 217    JPF                    Toukai      JP 15021000 14924000
## 429   FRY1                Guadeloupe      FR   441511   412682

Adding a rank field

srrank = seq(1, nrow(data2))   # seq makes a vector of integers from 1 to a specified value
srrank
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## [235] 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## [253] 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## [271] 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## [289] 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## [307] 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## [325] 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## [343] 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## [361] 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## [379] 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## [397] 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## [415] 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## [433] 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## [451] 451 452 453
data2 = cbind(data2, srrank)   # cbind stands for column bind. this pastes srrank to data2 and overwrites data2 with the result.
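Note that srrank is valid only because data2 was just sorted by pctsrincr. A sort-independent alternative (a sketch) uses rank() directly; na.last="keep" leaves missing values as NA:

data2$srrank2 = rank(-data2$pctsrincr, na.last="keep")   # rank 1 = largest senior-share increase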

Make comparative histograms

par(mfrow=c(1,2))  # par stands for graphical parameter. mfrow specifies the number of rows and columns in the plot window
hist(data$pctsenior05)   # left plot
hist(data$pctsenior20)   # right plot 

par(mfrow=c(1,2))   # doing this again sets up a fresh 1x2 layout for the next pair of plots
hist(data$pctsenior05, breaks=seq(0,0.35,0.05), main="2005", ylab="World Regions", xlab="Percent Senior", col="forestgreen")
hist(data$pctsenior20, breaks=seq(0,0.35,0.05), main="2020", ylab="World Regions", xlab="Percent Senior", col="dodgerblue")

Adding a guideline (abline) to each histogram. It can be vertical (v) or horizontal (h).

par(mfrow=c(1,2))
hist(data$pctsenior05, breaks=seq(0,0.35,0.05), main="2005", ylab="World Regions", xlab="Percent Senior", col="forestgreen")
abline(v=mean(data$pctsenior05, na.rm=T), col="black", lty=3, lwd=2)
hist(data$pctsenior20, breaks=seq(0,0.35,0.05), main="2020", ylab="World Regions", xlab="Percent Senior", col="dodgerblue")
abline(v=mean(data$pctsenior20, na.rm=T), col="black", lty=3, lwd=2)

Make a side-by-side boxplot to compare

boxplot(data$pctsenior05, data$pctsenior20)

CORRELATION ANALYSIS

Scatterplot

par(mfrow=c(1,1))   # reset graphics parameters to a single plot
plot(data$inc19, data$pm2pt5_19)  # plot command will plot 2 vectors against each other 

Scatterplot and best-fit line. This actually requires you to run a simple linear regression (the lm command).

x = data$inc19   # assign this vector a shorter, more convenient name. 
y = data$pm2pt5_19
model = lm(y ~ x)   # runs a 2-variable regression and saves the output to an object I've called 'model'
summary(model)   # regression output 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5695 -2.6575 -0.6021  1.4821 17.6324 
## 
## Coefficients:
##                Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept) 17.85882873  0.71838502  24.860 <0.0000000000000002 ***
## x           -0.00023598  0.00002597  -9.087 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.928 on 265 degrees of freedom
##   (186 observations deleted due to missingness)
## Multiple R-squared:  0.2376, Adjusted R-squared:  0.2347 
## F-statistic: 82.57 on 1 and 265 DF,  p-value: < 0.00000000000000022
outcorr = cor.test(data$inc19, data$pm2pt5_19)  # sometimes it's convenient to assign the output of a test to an object so you can manipulate it later 
outcorr
## 
##  Pearson's product-moment correlation
## 
## data:  data$inc19 and data$pm2pt5_19
## t = -9.0869, df = 265, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5738772 -0.3901950
## sample estimates:
##        cor 
## -0.4874099
outcorr$estimate
##        cor 
## -0.4874099
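Since outcorr is a stored object, you can list its components to see what else can be extracted:

names(outcorr)   # includes "estimate", "p.value", and "conf.int", among others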
plot(x,y, xlab="2019 Personal Income ($USD)", ylab = "Average air pollution level (PM2.5)", 
     main = "Income and particulate emissions across world regions")
abline(model)   # abline can also add model results to a plot 
legend("topright", inset=0.045, legend=c("r =", round(outcorr$estimate,4)))

A cleaned-up version, starting with the line “plot” above

plot(x, y, xlab="2019 Personal Income ($USD)", ylab = "Average air pollution level (PM2.5)", 
     main = "Income and particulate emissions across world regions", pch=20, font.main=4, 
     ylim=c(0,40), xlim=c(5000,70000))
abline(model, col="blue", lwd=2)
rp = vector('expression', 3)   # an expression vector to hold three legend entries
rp[1] = substitute(expression(italic(B)[1] == MYVALUE3), list(MYVALUE3=format(summary(model)$coefficients[2], dig=4)))[2]   # slope estimate (B1)
rp[2] = substitute(expression(italic(p) == MYVALUE2), list(MYVALUE2=format(summary(model)$coefficients[2,4], dig=3)))[2]   # p-value of the slope
rp[3] = substitute(expression(italic(r) == MYVALUE), list(MYVALUE=format(sqrt(summary(model)$r.squared), dig=4)))[2]   # correlation coefficient r
legend("topright", legend=rp, bty='n')
text(data$inc19, data$pm2pt5_19-1, labels=data$Country, cex=0.5)

DEALING WITH MISSING VALUES

summary(data)
##     REG_ID             Region            Country             totpop05       
##  Length:453         Length:453         Length:453         Min.   :   25461  
##  Class :character   Class :character   Class :character   1st Qu.:  655329  
##  Mode  :character   Mode  :character   Mode  :character   Median : 1394730  
##                                                           Mean   : 2907246  
##                                                           3rd Qu.: 3259100  
##                                                           Max.   :40442800  
##                                                                             
##     totpop20         pop65plus05       pop65plus20       workagepop05     
##  Min.   :   29884   Min.   :    791   Min.   :   1611   Min.   :   11943  
##  1st Qu.:  715115   1st Qu.:  66751   1st Qu.: 116458   1st Qu.:  395865  
##  Median : 1518000   Median : 172288   Median : 262396   Median :  893579  
##  Mean   : 3186488   Mean   : 388793   Mean   : 556600   Mean   : 1963877  
##  3rd Qu.: 3534170   3rd Qu.: 462390   3rd Qu.: 599662   3rd Qu.: 2238690  
##  Max.   :46289300   Max.   :5993140   Max.   :9274000   Max.   :27632900  
##                     NA's   :27        NA's   :58        NA's   :27        
##   workagepop20         areasqkm         popsqkm05         popsqkm20      
##  Min.   :   18156   Min.   :     14   Min.   :   0.02   Min.   :   0.02  
##  1st Qu.:  438709   1st Qu.:  10140   1st Qu.:  24.91   1st Qu.:  26.82  
##  Median :  996186   Median :  23047   Median :  71.89   Median :  76.44  
##  Mean   : 1950689   Mean   :  95999   Mean   : 262.73   Mean   : 299.87  
##  3rd Qu.: 2236910   3rd Qu.:  58303   3rd Qu.: 167.03   3rd Qu.: 186.52  
##  Max.   :26107600   Max.   :2532420   Max.   :6879.65   Max.   :8444.19  
##  NA's   :58                                                              
##     pctba20         pm2pt5_05       pm2pt5_19        co2tot08       
##  Min.   :0.0000   Min.   : 4.15   Min.   : 3.70   Min.   :     3.9  
##  1st Qu.:0.2175   1st Qu.:10.53   1st Qu.: 7.78   1st Qu.:  4905.5  
##  Median :0.2893   Median :14.51   Median :11.96   Median : 13076.3  
##  Mean   :0.3040   Mean   :15.94   Mean   :13.72   Mean   : 35476.3  
##  3rd Qu.:0.3822   3rd Qu.:21.18   3rd Qu.:18.05   3rd Qu.: 40117.6  
##  Max.   :0.6280   Max.   :38.65   Max.   :41.93   Max.   :511789.0  
##  NA's   :162      NA's   :60      NA's   :60      NA's   :96        
##     co2cap08       broadband19         inc05           inc19      
##  Min.   :  0.10   Min.   :0.5517   Min.   : 4715   Min.   : 7445  
##  1st Qu.:  4.80   1st Qu.:0.8280   1st Qu.:15008   1st Qu.:17100  
##  Median :  7.40   Median :0.8680   Median :18768   Median :21840  
##  Mean   : 12.62   Mean   :0.8611   Mean   :20662   Mean   :25107  
##  3rd Qu.: 13.20   3rd Qu.:0.9100   3rd Qu.:25437   3rd Qu.:27170  
##  Max.   :340.70   Max.   :1.0000   Max.   :54104   Max.   :67034  
##  NA's   :96       NA's   :198      NA's   :145     NA's   :186    
##     unemp05           unemp20        covidvax2021q2    pctsenior20     
##  Min.   :0.01800   Min.   :0.01900   Min.   :0.0510   Min.   :0.02662  
##  1st Qu.:0.04900   1st Qu.:0.04400   1st Qu.:0.5581   1st Qu.:0.14497  
##  Median :0.06800   Median :0.07000   Median :0.6464   Median :0.18364  
##  Mean   :0.08266   Mean   :0.08071   Mean   :0.6090   Mean   :0.17604  
##  3rd Qu.:0.10375   3rd Qu.:0.09900   3rd Qu.:0.7209   3rd Qu.:0.21266  
##  Max.   :0.29900   Max.   :0.33600   Max.   :0.8466   Max.   :0.33487  
##  NA's   :139       NA's   :121       NA's   :237      NA's   :58       
##   pctsenior05        pctsrincr       
##  Min.   :0.02163   Min.   :-0.02396  
##  1st Qu.:0.08195   1st Qu.: 0.03006  
##  Median :0.13533   Median : 0.04206  
##  Mean   :0.12839   Mean   : 0.04235  
##  3rd Qu.:0.16581   3rd Qu.: 0.05277  
##  Max.   :0.26441   Max.   : 0.10711  
##  NA's   :27        NA's   :85
mean(data$broadband19)
## [1] NA
mean(data$broadband19, na.rm=T)
## [1] 0.8610923
data3 = data[!is.na(data$broadband19),]   # keep only the rows where broadband19 is not NA, and save to 'data3'
nrow(data)
## [1] 453
nrow(data3)   # notice how many fewer observations are now in data3 
## [1] 255
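If you instead want rows that are complete across every column, base R’s complete.cases() is handy. A minimal sketch:

data_complete = data[complete.cases(data),]   # keep rows with no NA in any column
nrow(data_complete)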

Other helpful operations on a data frame

data$pctsrincr = NULL   # remove this column 
data4 = data[c(1:4)]   # extract just the first 4 columns

QUIZ QUESTIONS - CHALLENGE YOURSELF

(3) How many world regions have a 2020 population density over 500/sqkm?

(4) What is the correlation coefficient between COVID vaccination rates and broadband access in world regions? Round to 2 decimal places.

(3) WORKING WITH MULTIPLE DATA SOURCES

MERGE DATASETS FROM DIFFERENT SOURCES AND ANALYZE THEM

What predicts domestic migration in California counties? Housing cost, or job growth?

Download county migration data which I have pre-processed. Data reflect changes ending 7/1 of year listed. Source: CA Dep’t of Finance, http://www.dof.ca.gov/Forecasting/Demographics/Estimates/E-6/

setwd("C:/Users/karol/Desktop/census-master")

dof = read.csv("ca_dof_popchg_e6.csv")
head(dof)
##      county id_old             id     mpo   pop10 births10 deaths10 natinc10
## 1   Alameda   6001 0500000US06001    ABAG 1515354     4873     2157     2716
## 2    Alpine   6003 0500000US06003 non-MPO    1175        3        3        0
## 3    Amador   6005 0500000US06005 non-MPO   38069       78      100      -22
## 4     Butte   6007 0500000US06007 non-MPO  220193      622      551       71
## 5 Calaveras   6009 0500000US06009 non-MPO   45535       75      118      -43
## 6    Colusa   6011 0500000US06011 non-MPO   21465       69       23       46
##   immig10 dommig10   pop17 births17 deaths17 natinc17 immig17 dommig17   pop21
## 1    2081      286 1650818    19443    10112     9331   12421    -8110 1671741
## 2       0        0    1141        4       13       -9       1       -1    1181
## 3       2       -2   37050      306      437     -131      20      -20   40316
## 4      63       59  226470     2476     2406       70     191     1448  201158
## 5       2       -2   44609      371      509     -138      35      -35   45111
## 6      16      -16   22580      313      161      152     126     -126   22059
##   births21 deaths21 natinc21 immig21 dommig21
## 1    17296    11592     5704    1558   -17221
## 2       13       33      -20       0        2
## 3      272      503     -231       6       35
## 4     1983     2476     -493     -35    -9530
## 5      360      586     -226       4       56
## 6      292      193       99      18      116
nrow(dof)
## [1] 58

Download Census data the old-fashioned way (median home values). Start at the Census Bureau’s data portal, data.census.gov.

Then, Geography -> Counties -> California -> check all counties in California

Then, enter B25077 (median value of owner-occupied housing units) in ‘Table ID’ and hit SEARCH at the bottom-right.

Select the first entry (it’s a Table)

In the bar above the data, select ‘Transpose’

Then, select ‘ZIP’ to download (‘csv’ won’t work; this is the Bureau’s way of making sure you also download the metadata).

Run this fancy version of read.csv to read in the file WITHOUT the double-row header the Bureau puts in. MAKE SURE the file name matches yours!

setwd("C:/Users/karol/Desktop/census-master")

acs = read.csv(text=readLines("./ACSDT5Y2020.B25077_data_with_overlays_2022-05-24T172640.csv")[(-2)])   # readLines()[-2] drops the second header row before parsing
head(acs)
##   B25077_001E B25077_001M         GEO_ID                         NAME
## 1      825300        4462 0500000US06001   Alameda County, California
## 2      372500       23650 0500000US06003    Alpine County, California
## 3      329300       11372 0500000US06005    Amador County, California
## 4      304700        7078 0500000US06007     Butte County, California
## 5      340000        9138 0500000US06009 Calaveras County, California
## 6      274100       17719 0500000US06011    Colusa County, California

Job info has been processed for you and is available from the Bureau of Labor Statistics (https://data.bls.gov/cew/apps/data_views/data_views.htm#tab=Tables)

setwd("C:/Users/karol/Desktop/census-master")

bls = read.csv("blsemp_ca.csv")
head(bls)
##   STATEFP COUNTYFP GEOID_OLD          GEOID COUNTYNS      NAME         NAMELSAD
## 1       6        1      6001 0500000US06001  1675839   Alameda   Alameda County
## 2       6        3      6003 0500000US06003  1675840    Alpine    Alpine County
## 3       6        5      6005 0500000US06005  1675841    Amador    Amador County
## 4       6        7      6007 0500000US06007  1675842     Butte     Butte County
## 5       6        9      6009 0500000US06009  1675885 Calaveras Calaveras County
## 6       6       11      6011 0500000US06011  1675902    Colusa    Colusa County
##    X emp_jun15 emp_jun16 emp_jun17 emp_jun18 emp_jun19 emp_jun20 emp_jun21
## 1 NA    732579    755866    778911    793656    795938    706180    761279
## 2 NA       503       618       554       589       625       472       600
## 3 NA     11833     12053     12399     12473     12469     11562     12058
## 4 NA     78335     80680     82771     84185     81236     73239     76870
## 5 NA      8678      9325      9641      9903      9957      9725     10317
## 6 NA      9649      9681      9249      9812      9905      8482      9607

Join other data to ‘dof’ using “match” (it will work regardless of whether the data are sorted, similar to a VLOOKUP in Excel)

This creates a new column in ‘dof’ called ‘hoval20,’ which is equivalent to the column B25077_001E in the acs data

Rows are matched using the unique ID in ‘dof’ (id) and in ‘acs’ (GEO_ID)

dof$hoval20 = acs$B25077_001E[match(dof$id, acs$GEO_ID)]
dof$emp21 = bls$emp_jun21[match(dof$id, bls$GEOID)]   # same thing for the bls data 
dof$emp15 = bls$emp_jun15[match(dof$id, bls$GEOID)]   # one line for each new column 
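A quick sanity check on the join (a sketch): rows that failed to match will contain NA.

sum(is.na(dof$hoval20))   # 0 means every county matched a home-value record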

You can make a percent growth variable using basic math:

dof$popgr1721 = (dof$pop21 - dof$pop17)/dof$pop17
dof$empgr1521 = (dof$emp21 - dof$emp15)/dof$emp15

ANALYZE THE DATA USING PLOTS AND A REGRESSION MODEL

Compare the size of a county with its median home value.

Histograms help you understand how the data are distributed!

hist(dof$pop21, breaks=20) 

Taking the logarithm of non-zero data can get rid of skew issues. Right skew is common in social science data: high values pull up the means. A log transformation is particularly useful when comparing data, e.g., understanding the correlation between two variables.

hist(log(dof$pop21))

boxplot(dof$popgr1721, dof$empgr1521)   # side-by-side boxplot.

Make a scatterplot and check the correlation

x = dof$hoval20
y = log(dof$pop21)
plot(x,y)

cor.test(x, y)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 4.274, df = 56, p-value = 0.00007531
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2725710 0.6685972
## sample estimates:
##       cor 
## 0.4959499

Make a fancier scatterplot with a best-fit line

model = lm(y ~ x)  # stores the results of a simple linear regression in the object 'model'
plot(x, y, ylab="Log of Population, 2021", xlab="Median Home Value, 2020", main="California Counties")   # see help(par) to view all plot options
abline(model, col="dodgerblue", lwd=3)   
rp = vector('expression', 3)   # the same legend-expression trick used earlier
rp[1] = substitute(expression(italic(B)[1] == MYVALUE3), list(MYVALUE3=format(summary(model)$coefficients[2], dig=4)))[2]
rp[2] = substitute(expression(italic(p) == MYVALUE2), list(MYVALUE2=format(summary(model)$coefficients[2,4], dig=3)))[2]
rp[3] = substitute(expression(italic(r) == MYVALUE), list(MYVALUE=round(sqrt(summary(model)$r.squared), 4)))[2]
legend("bottomright", legend=rp, bty='n')  # can put this at any corner of the plot

(4) SUMMARIZING DATA AT DIFFERENT SPATIAL SCALES

ADDITIONAL FUNCTIONALITY WITH PACKAGES

In addition to the functions included with the base version of R, there are THOUSANDS of packages with added functionality.

Making a package is fairly easy, so if someone develops a new method/technique/visualization/etc., it’s fairly easy to implement in R.

Visit a CRAN mirror page to see all the packages available: https://mirror.las.iastate.edu/CRAN/

Install packages using a single line of code (or, Tools -> Install Packages)
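For example, a one-time install of the three packages loaded below (a sketch):

install.packages(c("sf", "foreign", "doBy"))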

Each time you use R, you’ll need to activate the package.

A good practice is to include these library() calls at the beginning of any script that uses the packages (this introductory block is called a ‘preamble’)

library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(foreign) 
library(doBy)

SUMMARY BY

A handy trick is to be able to summarize data at a different level, or spatial scale.

First, see how many unique values are in the ‘mpo’ field.

unique(unlist(dof$mpo))
## [1] "ABAG"    "non-MPO" "SACOG"   "SCAG"    "SANDAG"
length(unique(unlist(dof$mpo)))
## [1] 5

Return to the OECD data for a moment to see how many regions are in each country

setwd("C:/Users/karol/Desktop/census-master")

data = read.csv("oecd.csv")

Make a sorted table and a barplot of the number of regions per country

table(data$Country)
## 
## AT AU BE BG BR CA CH CL CO CR CZ DE DK EE EL ES FI FR HR HU IE IL IS IT JP KR 
##  9  8  3  6 27 13  7 16 33  6  8 16  5  1 13 19  5 18  1  8  3  6  2 21 10  7 
## LT LU LV MT NL NO NZ PE PL PT RO SE SI SK TR US 
##  2  1  1  1 12  3 14 25 17  7  8  8  2  4 26 51
sort(table(data$Country), decreasing=TRUE)  # the sort command, with the argument decreasing=TRUE will sort the table in decreasing order.
## 
## US CO BR TR PE IT ES FR PL CL DE NZ CA EL NL JP AT AU CZ HU RO SE CH KR PT BG 
## 51 33 27 26 25 21 19 18 17 16 16 14 13 13 12 10  9  8  8  8  8  8  7  7  7  6 
## CR IL DK FI SK BE IE NO IS LT SI EE HR LU LV MT 
##  6  6  5  5  4  3  3  3  2  2  2  1  1  1  1  1
barplot(sort(table(data$Country), decreasing=TRUE), cex.names=0.8,
        main="Number of regions per country")   # a barplot can be made (can also add colors, etc.). cex.names makes the text smaller so all the country names fit.

Use the summaryBy command (part of the doBy package) to summarize by larger spatial unit

A new data frame will be created which summarizes the variables to the left of the tilde at the unit defined to the right of the tilde

dofsum = summaryBy(pop21 + emp21 ~ mpo, data=dof, FUN=sum)   # takes the sum total of emp21 and pop21 by MPO
dofsum
##       mpo pop21.sum emp21.sum
## 1    ABAG   7696482   3781151
## 2 non-MPO   7061468   2596575
## 3   SACOG   2587772   1065151
## 4  SANDAG   3288455   1430039
## 5    SCAG  18734436   7707865
dofmean = summaryBy(hoval20 + immig21 ~ mpo, data=dof, FUN=mean)   # mean across counties in each MPO. Ensure this makes sense!
dofmean
##       mpo hoval20.mean immig21.mean
## 1    ABAG     850722.2     803.5556
## 2 non-MPO     330647.2      85.0000
## 3   SACOG     397516.7     428.0000
## 4  SANDAG     595600.0    1898.0000
## 5    SCAG     475300.0    2111.0000

Write the table output as a .csv to your working directory

setwd("C:/Users/karol/Desktop/census-master")

write.csv(dofsum, "dofsumtable.csv")

Make a single barplot by MPO

barplot(dofsum$pop21.sum, names.arg=dofsum$mpo)

Make the barplot look better

barplot(dofsum$pop21.sum, names.arg=dofsum$mpo, axes=F, col="dodgerblue", main="2021 Population by MPO (millions)")
axis(2, at=seq(0,20000000,5000000), lab=seq(0,20,5))
abline(h=seq(0,20000000,5000000), lty=3, col="darkgrey")
box()

Summarize some of the OECD data. Note that rate or percentage data don’t collapse well under a simple sum; see the sketch after the output below.

out = summaryBy(areasqkm + totpop20 + co2cap08 ~ Country, data=data, FUN=sum)
out2 = out[order(-out$co2cap08.sum),]  # sort by CO2 
out2
##    Country areasqkm.sum totpop20.sum co2cap08.sum
## 42      US      9161929    331501023       1071.6
## 33      NZ       263357      5089400        544.0
## 5       BR      8511230    211755654        199.6
## 15      EL       130048     10718562        184.7
## 12      DE       353296     83166649        183.8
## 23      IS       100450       364134        160.2
## 2       AU      7703349     25692633        143.5
## 31      NL        34188     17407594        135.2
## 24      IT       297825     59641498        129.4
## 16      ES       502654     47332616        129.3
## 17      FI       304316      5525294        106.7
## 8       CL       742979     19458310        105.9
## 11      CZ        77212     10693940        101.8
## 41      TR       766509     83155037         97.3
## 25      JP       373530    126146000         92.4
## 26      KR        99461     51780527         82.2
## 1       AT        82519      8901072         77.7
## 38      SE       407300     10327588         55.5
## 22      IL        21643      8698700         52.1
## 13      DK        41987      5822765         44.9
## 7       CH        39860      8606033         42.5
## 40      SK        48702      5457872         34.9
## 36      PT        90996     10295914         33.1
## 3       BE        30452     11522440         27.7
## 32      NO       195571      1336968         23.4
## 39      SI        20145      2095859         22.9
## 28      LU         2586       626108         22.1
## 14      EE        43110      1328980         16.4
## 4       BG       110001      6951487           NA
## 6       CA      8965591     38037197           NA
## 9       CO      1141766     50372444           NA
## 10      CR        51709      5111221           NA
## 18      FR       633886     67320260           NA
## 19      HR        24515      1373800           NA
## 20      HU        91248      9769532           NA
## 21      IE        68655      4964442           NA
## 27      LT        62643      2794091           NA
## 29      LV        63290      1907680           NA
## 30      MT          313       514564           NA
## 34      PE      1285216     32625989           NA
## 35      PL       307236     37958173           NA
## 37      RO       234270     19328850           NA
cor.test(out2$co2cap08.sum, out2$totpop20.sum)   # are larger countries higher emitters? 
## 
##  Pearson's product-moment correlation
## 
## data:  out2$co2cap08.sum and out2$totpop20.sum
## t = 5.5136, df = 26, p-value = 0.000008713
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4973013 0.8691760
## sample estimates:
##      cor 
## 0.734167
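As noted above, a per-capita variable like co2cap08 is better summarized with a mean than a sum. A minimal sketch (the anonymous function handles the NAs):

co2mean = summaryBy(co2cap08 ~ Country, data=data, FUN=function(x) mean(x, na.rm=TRUE))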

REGRESSION ANALYSIS - THE VERY BASICS

The first command (lm) makes a linear model and stores the output.

The dependent variable goes first, followed by a tilde and the independent variables.

For convenience, you can declare the data frame in the data= argument and skip the dollar signs.

This shows that net domestic migration in 2021 was a function of smaller population, lower home value, and higher job growth.

m = lm(dommig21 ~ pop21 + hoval20 + empgr1521, data=dof)
summary(m)
## 
## Call:
## lm(formula = dommig21 ~ pop21 + hoval20 + empgr1521, data = dof)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10465.3  -2285.2   -208.8   1408.9  20778.7 
## 
## Coefficients:
##                  Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)  3146.8375822  1345.7188993   2.338             0.023099 *  
## pop21          -0.0076389     0.0004264 -17.915 < 0.0000000000000002 ***
## hoval20        -0.0088608     0.0025451  -3.482             0.000995 ***
## empgr1521   24097.9287781  8873.0081937   2.716             0.008860 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4494 on 54 degrees of freedom
## Multiple R-squared:  0.8846, Adjusted R-squared:  0.8782 
## F-statistic:   138 on 3 and 54 DF,  p-value: < 0.00000000000000022

(5) MANIPULATING SHAPEFILES

GIS software often lacks efficient or comprehensive data-joining capability

By reading and writing the .dbf portion of an ESRI shapefile using the ‘foreign’ package, this task is a breeze

BE VERY CAREFUL not to add/delete rows, reorder them, or accidentally overwrite the .dbf with something else, as this may ruin the shapefile
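One simple safeguard (a sketch; the backup filename is arbitrary): copy the original .dbf before touching it.

file.copy("cb_2017_ca_county_500k.dbf", "cb_2017_ca_county_500k_BACKUP.dbf")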

Read in the .dbf portion of the California counties shapefile

setwd("C:/Users/karol/Desktop/census-master")

shape = read.dbf("cb_2017_ca_county_500k.dbf")   
head(shape)
##   STATEFP COUNTYFP COUNTYNS       AFFGEOID GEOID         NAME LSAD       ALAND
## 1      06      001 01675839 0500000US06001 06001      Alameda   06  1909616630
## 2      06      005 01675841 0500000US06005 06005       Amador   06  1539933576
## 3      06      013 01675903 0500000US06013 06013 Contra Costa   06  1857310903
## 4      06      023 01681908 0500000US06023 06023     Humboldt   06  9241251740
## 5      06      037 00277283 0500000US06037 06037  Los Angeles   06 10510588451
## 6      06      047 00277288 0500000US06047 06047       Merced   06  5012175320
##       AWATER
## 1  216916717
## 2   29470568
## 3  225193562
## 4 1254039383
## 5 1794793532
## 6  112509475

When matching, you may have to watch out for strings, leading zeroes, or other issues that may cause ID fields not to match

Here, we can make a new numeric field in the shapefile to match with our dof file

The nested as.numeric(as.character()) commands pretty reliably coerce things into numeric format
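For instance, the coercion drops the leading zero, which is exactly what lets the shapefile’s GEOID line up with the numeric id_old in dof (a sketch):

as.numeric(as.character("06001"))   # returns 6001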

shape$GEOID2 = as.numeric(as.character(shape$GEOID))

Match a couple of variables from the DOF data, then summarize to see if it worked

shape$pop21 = dof$pop21[match(shape$GEOID2, dof$id_old)]
shape$hoval20 = dof$hoval20[match(shape$GEOID2, dof$id_old)]
shape$emp21 = dof$emp21[match(shape$GEOID2, dof$id_old)]
summary(shape)
##  STATEFP    COUNTYFP      COUNTYNS            AFFGEOID      GEOID   
##  06:58   001    : 1   00277273: 1   0500000US06001: 1   06001  : 1  
##          003    : 1   00277274: 1   0500000US06003: 1   06003  : 1  
##          005    : 1   00277275: 1   0500000US06005: 1   06005  : 1  
##          007    : 1   00277277: 1   0500000US06007: 1   06007  : 1  
##          009    : 1   00277280: 1   0500000US06009: 1   06009  : 1  
##          011    : 1   00277281: 1   0500000US06011: 1   06011  : 1  
##          (Other):52   (Other) :52   (Other)       :52   (Other):52  
##         NAME    LSAD        ALAND                 AWATER          
##  Alameda  : 1   06:58   Min.   :  121485107   Min.   :   4721397  
##  Alpine   : 1           1st Qu.: 2485200412   1st Qu.:  41652226  
##  Amador   : 1           Median : 3978042624   Median : 147555173  
##  Butte    : 1           Mean   : 6956606590   Mean   : 353183413  
##  Calaveras: 1           3rd Qu.: 8948208104   3rd Qu.: 475186943  
##  Colusa   : 1           Max.   :51948123813   Max.   :2729814515  
##  (Other)  :52                                                     
##      GEOID2         pop21            hoval20            emp21        
##  Min.   :6001   Min.   :   1181   Min.   : 153600   Min.   :    600  
##  1st Qu.:6030   1st Qu.:  47536   1st Qu.: 262875   1st Qu.:  14547  
##  Median :6058   Median : 187032   Median : 344250   Median :  70836  
##  Mean   :6058   Mean   : 678769   Mean   : 437798   Mean   : 285876  
##  3rd Qu.:6086   3rd Qu.: 705776   3rd Qu.: 593825   3rd Qu.: 252805  
##  Max.   :6115   Max.   :9944953   Max.   :1163100   Max.   :4219893  
## 

CAREFULLY write the new .dbf. Then you’ll be able to open it in GIS software and map the new fields!

write.dbf(shape, "cb_2017_ca_county_500k.dbf")

(6) MAPPING MADE SIMPLE(ISH)

SIMPLE MAP USING THE SF PACKAGE

Read in a county shapefile

The dsn argument “.” indicates the shapefile resides in the current working directory.

No file extension (e.g., .shp) is required for the layer input.

library(sf)
counties = read_sf(dsn = ".", layer = "cb_2017_ca_county_500k")
head(counties)
## Simple feature collection with 6 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -124.4096 ymin: 32.80146 xmax: -117.6464 ymax: 41.46584
## Geodetic CRS:  NAD83
## # A tibble: 6 × 14
##   STATEFP COUNT…¹ COUNT…² AFFGE…³ GEOID NAME  LSAD    ALAND AWATER GEOID2  pop21
##   <chr>   <chr>   <chr>   <chr>   <chr> <chr> <chr>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 06      001     016758… 050000… 06001 Alam… 06    1.91e 9 2.17e8   6001 1.67e6
## 2 06      005     016758… 050000… 06005 Amad… 06    1.54e 9 2.95e7   6005 4.03e4
## 3 06      013     016759… 050000… 06013 Cont… 06    1.86e 9 2.25e8   6013 1.16e6
## 4 06      023     016819… 050000… 06023 Humb… 06    9.24e 9 1.25e9   6023 1.35e5
## 5 06      037     002772… 050000… 06037 Los … 06    1.05e10 1.79e9   6037 9.94e6
## 6 06      047     002772… 050000… 06047 Merc… 06    5.01e 9 1.13e8   6047 2.83e5
## # … with 3 more variables: hoval20 <dbl>, emp21 <dbl>,
## #   geometry <MULTIPOLYGON [°]>, and abbreviated variable names ¹​COUNTYFP,
## #   ²​COUNTYNS, ³​AFFGEOID
cnty = data.frame(counties)   # if you just want to extract the attribute/data table 

Draw a choropleth map of the “ALAND” field in the counties shapefile

plot(counties["ALAND"])

Plot a variable which may actually be meaningful

plot(counties["pop21"])

A more advanced version

Review plot options at https://cran.r-project.org/web/packages/sf/vignettes/sf5.html#geometry_with_attributes:_sf

plot(counties["pop21"], breaks="jenks", key.pos=2, pal=sf.colors(10), 
     main="2021 Population in California counties", axes=TRUE)

(7) USING THE CENSUS API

Install tidycensus & tigris, and load the other packages we’ll need to use the Census API
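A one-time install (a sketch):

install.packages(c("tidycensus", "tigris"))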

Request a key from the Census to access their data

You’ll need to request a Census Key to access their API.

This takes only one minute. Then, enter your key below (mine is in there now)

http://api.census.gov/data/key_signup.html

library(tidycensus)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(sf)
library(doBy)
census_api_key("5de05eeaa3869c3180eed1726a07e0f27e6c9e20", install=TRUE, overwrite=TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "5de05eeaa3869c3180eed1726a07e0f27e6c9e20"

Extract your first variable from the decennial census using the function get_decennial

medrent00 = get_decennial(geography = "state", variables = "H060001", sumfile="sf3", year = 2000)
## Getting data from the 2000 decennial Census
## Using Census Summary File 3

Make a barplot

barplot(medrent00$value, names.arg=medrent00$NAME, cex.names=0.7, las=2)

Make a better barplot

medrent00b = medrent00[order(-medrent00$value),]        # sort descending by making a new data frame
barplot(medrent00b$value, names.arg=medrent00b$NAME, cex.names=0.7, las=2,
        main="State median rent, 2000", col="dodgerblue")   #cex.names makes labels smaller, las=2 rotates them.
abline(h=seq(0,700,100), col="darkgrey", lty=3)
box()

FINDING GOOD CENSUS VARIABLES TO USE

Focusing on the most recent ACS 5-year estimates (2020 at the time of this writing) is a good way to go.

All the Census API available datasets are at https://api.census.gov/data.html

See also, “useful_2020_census_vars_KKguide.xlsx”

You can also extract a list of variables using the code below.

acsvars = load_variables(2020, "acs5", cache = TRUE)
head(acsvars)
## # A tibble: 6 × 4
##   name        label                                   concept            geogr…¹
##   <chr>       <chr>                                   <chr>              <chr>  
## 1 B01001A_001 Estimate!!Total:                        SEX BY AGE (WHITE… tract  
## 2 B01001A_002 Estimate!!Total:!!Male:                 SEX BY AGE (WHITE… tract  
## 3 B01001A_003 Estimate!!Total:!!Male:!!Under 5 years  SEX BY AGE (WHITE… tract  
## 4 B01001A_004 Estimate!!Total:!!Male:!!5 to 9 years   SEX BY AGE (WHITE… tract  
## 5 B01001A_005 Estimate!!Total:!!Male:!!10 to 14 years SEX BY AGE (WHITE… tract  
## 6 B01001A_006 Estimate!!Total:!!Male:!!15 to 17 years SEX BY AGE (WHITE… tract  
## # … with abbreviated variable name ¹​geography
nrow(acsvars)   # tells us how many variables
## [1] 27850
length(unique(unlist(acsvars$concept)))   # tells us how many unique "concepts" there are
## [1] 1138
write.csv(acsvars, "acsvars.csv")   # export to .csv so you can explore it easily in Excel 

ASSEMBLE A SET OF TRACT-LEVEL VARIABLES FOR A COUNTY

Get a single variable (Median rent)

tr = get_acs(geography="tract", state="CA", county="Orange", variables="B25031_001", 
               year=2020, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
tr$medrent = tr$estimate  # create a renamed column to avoid future confusion
tr$estimate = NULL   # get rid of the old one
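
As the download message above suggests, you can cache the feature geometry locally so repeat runs don't re-download it. Set this once near the top of your script:

options(tigris_use_cache = TRUE)   # cache shapefiles for use in future sessions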

Add additional variables in a single extraction by putting their IDs into a vector.

Total population: B01001_001

Median household income: B19013_001

Median age of housing: B25035_001

Workers who drove alone to work: B08101_009

varlist = c("B01001_001", "B19013_001", "B25035_001", "B08101_009")
tr_plus =  get_acs(geography="tract", state="CA", county="Orange", variables=varlist, year=2020)
## Getting data from the 2016-2020 5-year ACS

Use a match command to bind each new variable to the original data frame (tr). Name the new columns something intuitive.

Since the data are stored long, each match operation requires a subset operation first.

tr_a = subset(tr_plus, variable=="B01001_001")
tr$totpop = tr_a$estimate[match(tr$GEOID, tr_a$GEOID)]

tr_b = subset(tr_plus, variable=="B19013_001")
tr$medhhinc = tr_b$estimate[match(tr$GEOID, tr_b$GEOID)]

tr_c = subset(tr_plus, variable=="B25035_001")
tr$medhomeage = tr_c$estimate[match(tr$GEOID, tr_c$GEOID)]
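
As an aside, get_acs() can also return the estimates wide (one column per variable) via its output argument, which skips the subset-and-match step entirely. A minimal sketch:

tr_wide = get_acs(geography="tract", state="CA", county="Orange", variables=varlist,
                  year=2020, output="wide")
colnames(tr_wide)   # estimate columns end in E, margin-of-error columns end in M

The tradeoff is that the wide columns keep their raw Census IDs, so you still have to rename them yourself.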

Remember you can plot this as a map too!

plot(tr['medhhinc'])

8.) ALL VARIABLES AT THE TRACT LEVEL

I’ve built this loop to extract all of “Kevin’s commonly used variables” in one step.

Extract first variable (total population) + geometry

You can declare your parameters below:

mystate = "CA"
mycounty = "Imperial"
myyear = 2020
mysurvey = "acs5"
cnty = get_acs(geography="tract", state=mystate, county=mycounty, variables="B01001_001", year=myyear, survey=mysurvey, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.

Extract all other variables

colnames(cnty)[4] = "totpop"   # column 4 is the estimate column returned by get_acs

varlist = c('B19001_001', 'B01002_001', 'B03002_001', 'B03002_003', 'B03002_004', 'B03002_012', 'B05001_001', 'B05001_006', 
            'B07001_001', 'B07001_017', 'B07001_033', 'B07001_049', 'B07001_065', 'B08014_001', 'B08014_002', 'B08014_003', 
            'B08014_004', 'B08014_005', 'B08014_006', 'B08014_007', 'B08101_001', 'B08101_009', 'B08101_033', 'B08101_017', 
            'B08101_025', 'B08101_041', 'B08101_049', 'B15003_001', 'B15003_002', 'B15003_003', 'B15003_004', 'B15003_005', 
            'B15003_006', 'B15003_007', 'B15003_008', 'B15003_009', 'B15003_010', 'B15003_011', 'B15003_012', 'B15003_013', 
            'B15003_014', 'B15003_015', 'B15003_016', 'B15003_017', 'B15003_018', 'B15003_019', 'B15003_020', 'B15003_021', 
            'B15003_022', 'B15003_023', 'B15003_024', 'B15003_025', 'B19001_002', 'B19001_003', 'B19001_004', 'B19001_005', 
            'B19001_006', 'B19001_007', 'B19001_008', 'B19001_009', 'B19001_010', 'B19001_011', 'B19001_012', 'B19001_013', 
            'B19001_014', 'B19001_015', 'B19001_016', 'B19001_017', 'B19013_001', 'B23025_001', 'B23025_002', 'B25031_001', 
            'B25077_001', 'B25034_001', 'B25034_002', 'B25034_003', 'B25034_004', 'B25034_005', 'B25034_006', 'B25034_007', 
            'B25034_008', 'B25034_009', 'B25034_010', 'B25034_011', 'B25035_001', 'B25024_002', 'B25024_003', 'B25024_004', 
            'B25024_005', 'B25024_006', 'B25024_007', 'B25024_008', 'B25024_009', 'B25024_010', 'B25024_011', 'B25003_001', 
            'B25003_002', 'B25003_003', 'B01001_003', 'B01001_004', 'B01001_005', 'B01001_006', 'B01001_020', 'B01001_021', 
            'B01001_022', 'B01001_023', 'B01001_024', 'B01001_025', 'B01001_027', 'B01001_028', 'B01001_029', 'B01001_030', 
            'B01001_044', 'B01001_045', 'B01001_046', 'B01001_047', 'B01001_048', 'B01001_049')
cnty2 =  get_acs(geography="tract", state=mystate, county=mycounty, variables=varlist, year=myyear, survey=mysurvey, geometry=TRUE)
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
varnames = c('totHH', 'medage', 'tot4race', 'wnh', 'bnh', 'hisp', 'tot4_cit', 'noncit', 'tot4_mv', 'no_mv', 
             'in_cnty_mv', 'in_st_mv', 'out_st_mv', 'tot4_vehic', 'vehic0', 'vehic1', 'vehic2', 'vehic3', 'vehic4', 'vehic5pl', 
             'tot4_comm', 'comm_sov', 'comm_walk', 'comm_pool', 'comm_trans', 'comm_oth', 'comm_wah', 'tot4_educ', 'educ_none', 
             'educ_nurs', 'educ_kind', 'educ_1st', 'educ_2nd', 'educ_3rd', 'educ_4th', 'educ_5th', 'educ_6th', 'educ_7th', 'educ_8th', 
             'educ_9th', 'educ_10th', 'educ_11th', 'educ_12th', 'educ_hs', 'educ_ged', 'educ_cnud1', 'educ_some', 'educ_asso', 
             'educ_bach', 'educ_mast', 'educ_prof', 'educ_doct', 'hincund10', 'hinc1014', 'hinc1519', 'hinc2024', 
             'hinc2529', 'hinc3034', 'hinc3539', 'hinc4044', 'hinc4549', 'hinc5059', 'hinc6074', 'hinc7599', 'hinc100124', 'hinc125149',
             'hinc150199', 'hinc200pl', 'medhhinc', 'tot4_work', 'emp', 'medrent', 'medhoval', 'totHU', 'HU2014', 'HU1013', 'HU9', 
             'HU9099', 'HU8089', 'HU7079', 'HU6069', 'HU5059', 'HU4049', 'HU1939', 'medyrblt', 'HU_sfd', 'HU_sfa', 'HU_mf2', 'HU_mf3_4', 
             'HU_mf5_9', 'HU_mf1019', 'HU_mf2049', 'HU_mf50pl', 'HU_mob', 'HU_boatRV', 'tot4_tenu', 'ownerHH', 'renterHH',
             'male0005', 'male0509', 'male1014', 'male1517', 'male6566', 'male6769', 'male7074', 'male7579', 'male8084',
             'male8500', 'female0005','female0509', 'female1014', 'female1517', 'female6566', 'female6769', 'female7074',
             'female7579', 'female8084', 'female8500')
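
The loop below pairs varlist and varnames by position, so it's worth a quick sanity check that the two vectors are the same length before running it:

stopifnot(length(varlist) == length(varnames))   # the IDs and names must pair up one-to-one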

for(i in seq_along(varlist)){
  join = subset(cnty2, variable==varlist[i])   # pull out one variable at a time
  cnty[,ncol(cnty)+1] = join$estimate[match(cnty$GEOID, join$GEOID)]   # bind it to cnty by GEOID
  colnames(cnty)[ncol(cnty)] = varnames[i]   # give the new column its intuitive name
}

Prep some useful summary variables

cnty$hhinc_sub20 = cnty$hincund10 + cnty$hinc1014 + cnty$hinc1519
cnty$hhinc_over100 = cnty$hinc100124 + cnty$hinc125149 + cnty$hinc150199 + cnty$hinc200pl
cnty$BAplus = cnty$educ_doct + cnty$educ_prof + cnty$educ_mast + cnty$educ_bach
cnty$noHS = cnty$tot4_educ - cnty$BAplus - cnty$educ_asso - cnty$educ_some - cnty$educ_cnud1 - cnty$educ_ged - cnty$educ_hs
cnty$age0017 = cnty$male0005 + cnty$male0509 + cnty$male1014 + cnty$male1517 + cnty$female0005 + cnty$female0509 + cnty$female1014 + cnty$female1517
cnty$age65plus = cnty$male6566 + cnty$male6769 + cnty$male7074 + cnty$male7579 + cnty$male8084 + cnty$male8500 + cnty$female6566 + cnty$female6769 + cnty$female7074 + cnty$female7579 + cnty$female8084 + cnty$female8500
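
These are counts; to compare tracts you'll usually want shares, so divide by the matching universe column. A minimal sketch (the variable choices are illustrative):

cnty$pctBAplus = cnty$BAplus / cnty$tot4_educ   # share of the 25-and-over population with a BA or higher
cnty$pct65plus = cnty$age65plus / cnty$totpop   # share of total population aged 65+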

Carefully write to a .shp. (The ESRI Shapefile driver abbreviates long field names to 10 characters, hence the warning below.)

st_write(cnty, "IM_tracts_ACS20.shp")
## Warning in abbreviate_shapefile_names(obj): Field names abbreviated for ESRI
## Shapefile driver
## Writing layer `IM_tracts_ACS20' to data source 
##   `IM_tracts_ACS20.shp' using driver `ESRI Shapefile'
## Writing 40 features with 129 fields and geometry type Multi Polygon.

Carefully write to a .csv

write.csv(cnty, "IM_tracts_ACS20.csv")
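
Note that write.csv() on an sf object carries the geometry along as a (often very long) text column. If you only want the attribute table, drop the geometry first:

write.csv(sf::st_drop_geometry(cnty), "IM_tracts_ACS20.csv", row.names=FALSE)   # attributes only, no geometry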

View a variable

plot(cnty['medhhinc'])